From b01513202b657719589bb6f92256a0be5717dbc4 Mon Sep 17 00:00:00 2001
From: Jason Zaman <jason@perfinion.com>
Date: Tue, 1 May 2018 19:55:53 +0800
Subject: [PATCH 001/610] pip_package: modularize build script to allow distros
 to install more flexibly

Gentoo Linux handles python modules slightly differently and packaging
wheels is complicated. We prefer to run setup.py directly ourselves
rather than build a wheel and then install from there.

This modularizes build_pip_package.sh to allow running parts separately.
Using --src srcdir will prepare the package in a known dir so the distro
package can take it from there. If only dstdir is given (either with
--dst or as the only argument to preserve backwards compat) then
behaviour is the same as before, the sources are prepared and the wheel
is built and placed in dstdir.

Signed-off-by: Jason Zaman <jason@perfinion.com>
---
 .../tools/pip_package/build_pip_package.sh    | 160 +++++++++++++-----
 1 file changed, 115 insertions(+), 45 deletions(-)

diff --git a/tensorflow/tools/pip_package/build_pip_package.sh b/tensorflow/tools/pip_package/build_pip_package.sh
index 1a83c6e757..41e714b1c1 100755
--- a/tensorflow/tools/pip_package/build_pip_package.sh
+++ b/tensorflow/tools/pip_package/build_pip_package.sh
@@ -41,51 +41,15 @@ function is_windows() {
   fi
 }
 
-function main() {
+function prepare_src() {
   if [ $# -lt 1 ] ; then
     echo "No destination dir provided"
     exit 1
   fi
 
-  DEST=$(real_path $1)
-  TMPDIR=$(mktemp -d -t tmp.XXXXXXXXXX)
-
-  PKG_NAME_FLAG=""
-  GPU_BUILD=0
-  NIGHTLY_BUILD=0
-  PROJECT_NAME=""
-  while true; do
-    if [[ "$1" == "--nightly_flag" ]]; then
-      NIGHTLY_BUILD=1
-    elif [[ "$1" == "--gpu" ]]; then
-      GPU_BUILD=1
-    elif [[ "$1" == "--gpudirect" ]]; then
-      PKG_NAME_FLAG="--project_name tensorflow_gpudirect"
-    elif [[ "$1" == "--project_name" ]]; then
-      shift
-      if [[ -z "$1" ]]; then
-        break
-      fi
-      PROJECT_NAME="$1"
-    fi
-    shift
-
-    if [[ -z "$1" ]]; then
-      break
-    fi
-  done
-
-  if [[ -n ${PROJECT_NAME} ]]; then
-    PKG_NAME_FLAG="--project_name ${PROJECT_NAME}"
-  elif [[ ${NIGHTLY_BUILD} == "1" && ${GPU_BUILD} == "1" ]]; then
-    PKG_NAME_FLAG="--project_name tf_nightly_gpu"
-  elif [[ ${NIGHTLY_BUILD} == "1" ]]; then
-    PKG_NAME_FLAG="--project_name tf_nightly"
-  elif [[ ${GPU_BUILD} == "1" ]]; then
-    PKG_NAME_FLAG="--project_name tensorflow_gpu"
-  fi
-
-  echo $(date) : "=== Using tmpdir: ${TMPDIR}"
+  TMPDIR="$1"
+  mkdir -p "$TMPDIR"
+  echo $(date) : "=== Preparing sources in dir: ${TMPDIR}"
 
   if [ ! -d bazel-bin/tensorflow ]; then
     echo "Could not find bazel-bin.  Did you run from the root of the build tree?"
@@ -157,17 +121,28 @@ function main() {
   # over so user defined ops can be compiled.
   mkdir -p ${TMPDIR}/google
   mkdir -p ${TMPDIR}/third_party
-  pushd ${RUNFILES%org_tensorflow}
+  pushd ${RUNFILES%org_tensorflow} > /dev/null
   for header in $(find protobuf_archive -name \*.h); do
     mkdir -p "${TMPDIR}/google/$(dirname ${header})"
     cp "$header" "${TMPDIR}/google/$(dirname ${header})/"
   done
-  popd
+  popd > /dev/null
   cp -R $RUNFILES/third_party/eigen3 ${TMPDIR}/third_party
 
   cp tensorflow/tools/pip_package/MANIFEST.in ${TMPDIR}
   cp tensorflow/tools/pip_package/README ${TMPDIR}
   cp tensorflow/tools/pip_package/setup.py ${TMPDIR}
+}
+
+function build_wheel() {
+  if [ $# -lt 2 ] ; then
+    echo "No src and dest dir provided"
+    exit 1
+  fi
+
+  TMPDIR="$1"
+  DEST="$2"
+  PKG_NAME_FLAG="$3"
 
   # Before we leave the top-level directory, make sure we know how to
   # call python.
@@ -175,15 +150,110 @@ function main() {
     source tools/python_bin_path.sh
   fi
 
-  pushd ${TMPDIR}
+  pushd ${TMPDIR} > /dev/null
   rm -f MANIFEST
   echo $(date) : "=== Building wheel"
   "${PYTHON_BIN_PATH:-python}" setup.py bdist_wheel ${PKG_NAME_FLAG} >/dev/null
   mkdir -p ${DEST}
   cp dist/* ${DEST}
-  popd
-  rm -rf ${TMPDIR}
+  popd > /dev/null
   echo $(date) : "=== Output wheel file is in: ${DEST}"
 }
 
+function usage() {
+  echo "Usage:"
+  echo "$0 [--src srcdir] [--dst dstdir] [options]"
+  echo "$0 dstdir [options]"
+  echo ""
+  echo "    --src                 prepare sources in srcdir"
+  echo "                              will use temporary dir if not specified"
+  echo ""
+  echo "    --dst                 build wheel in dstdir"
+  echo "                              if dstdir is not set do not build, only prepare sources"
+  echo ""
+  echo "  Options:"
+  echo "    --project_name <name> set project name to name"
+  echo "    --gpu                 build tensorflow_gpu"
+  echo "    --gpudirect           build tensorflow_gpudirect"
+  echo "    --nightly_flag        build tensorflow nightly"
+  echo ""
+  exit 1
+}
+
+function main() {
+  PKG_NAME_FLAG=""
+  PROJECT_NAME=""
+  GPU_BUILD=0
+  NIGHTLY_BUILD=0
+  SRCDIR=""
+  DSTDIR=""
+  CLEANSRC=1
+  while true; do
+    if [[ "$1" == "--help" ]]; then
+      usage
+      exit 1
+    elif [[ "$1" == "--nightly_flag" ]]; then
+      NIGHTLY_BUILD=1
+    elif [[ "$1" == "--gpu" ]]; then
+      GPU_BUILD=1
+    elif [[ "$1" == "--gpudirect" ]]; then
+      PKG_NAME_FLAG="--project_name tensorflow_gpudirect"
+    elif [[ "$1" == "--project_name" ]]; then
+      shift
+      if [[ -z "$1" ]]; then
+        break
+      fi
+      PROJECT_NAME="$1"
+    elif [[ "$1" == "--src" ]]; then
+      shift
+      SRCDIR="$(real_path $1)"
+      CLEANSRC=0
+    elif [[ "$1" == "--dst" ]]; then
+      shift
+      DSTDIR="$(real_path $1)"
+    else
+      DSTDIR="$(real_path $1)"
+    fi
+    shift
+
+    if [[ -z "$1" ]]; then
+      break
+    fi
+  done
+
+  if [[ -z "$DSTDIR" ]] && [[ -z "$SRCDIR" ]]; then
+    echo "No destination dir provided"
+    usage
+    exit 1
+  fi
+
+  if [[ -z "$SRCDIR" ]]; then
+    # make temp srcdir if none set
+    SRCDIR="$(mktemp -d -t tmp.XXXXXXXXXX)"
+  fi
+
+  prepare_src "$SRCDIR"
+
+  if [[ -z "$DSTDIR" ]]; then
+      # only want to prepare sources
+      exit
+  fi
+
+  if [[ -n ${PROJECT_NAME} ]]; then
+    PKG_NAME_FLAG="--project_name ${PROJECT_NAME}"
+  elif [[ ${NIGHTLY_BUILD} == "1" && ${GPU_BUILD} == "1" ]]; then
+    PKG_NAME_FLAG="--project_name tf_nightly_gpu"
+  elif [[ ${NIGHTLY_BUILD} == "1" ]]; then
+    PKG_NAME_FLAG="--project_name tf_nightly"
+  elif [[ ${GPU_BUILD} == "1" ]]; then
+    PKG_NAME_FLAG="--project_name tensorflow_gpu"
+  fi
+
+  build_wheel "$SRCDIR" "$DSTDIR" "$PKG_NAME_FLAG"
+
+  if [[ $CLEANSRC -ne 0 ]]; then
+    rm -rf "${TMPDIR}"
+  fi
+}
+
 main "$@"
-- 
GitLab


From 418b5abda254f11ca54d0439893024c58e2af983 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Sun, 27 May 2018 18:43:32 +0000
Subject: [PATCH 002/610] Fix incorrect documentation for `tf.reduce_any`

This fixes the incorrect documentation for `tf.reduce_any`. The
previous description:
```
   If `axis` has no entries, all dimensions are reduced, and a
   tensor with a single element is returned.
```

is not correct. See below:
```
Python 2.7.12 (default, Dec  4 2017, 14:50:18)
[GCC 5.4.0 20160609] on linux2
Type "help", "copyright", "credits" or "license" for more information.
>>> import tensorflow as tf
>>> x = tf.constant([[True,  True], [False, False]])
>>> v1 = tf.reduce_any(x, [])
>>> tf.Session().run(v1)
array([[ True,  True],
       [False, False]])
>>> v2 = tf.reduce_any(x, None)
>>> tf.Session().run(v2)
True
>>>
```

Instead, the correct description should be:
```
   If `axis` is None, all dimensions are reduced, and a
   tensor with a single element is returned.
```

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/python/ops/math_ops.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py
index 118b02c6c7..53d5edbf18 100644
--- a/tensorflow/python/ops/math_ops.py
+++ b/tensorflow/python/ops/math_ops.py
@@ -1675,7 +1675,7 @@ def reduce_any(input_tensor,
   entry in `axis`. If `keepdims` is true, the reduced dimensions
   are retained with length 1.
 
-  If `axis` has no entries, all dimensions are reduced, and a
+  If `axis` is None, all dimensions are reduced, and a
   tensor with a single element is returned.
 
   For example:
-- 
GitLab


From 564c146f37a02c3930a0dcc2978c9054664e927e Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Sun, 27 May 2018 18:55:23 +0000
Subject: [PATCH 003/610] Fix incorrect documentation for `tf.reduce_all`

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/python/ops/math_ops.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py
index 53d5edbf18..b7e3de7e85 100644
--- a/tensorflow/python/ops/math_ops.py
+++ b/tensorflow/python/ops/math_ops.py
@@ -1617,7 +1617,7 @@ def reduce_all(input_tensor,
   entry in `axis`. If `keepdims` is true, the reduced dimensions
   are retained with length 1.
 
-  If `axis` has no entries, all dimensions are reduced, and a
+  If `axis` is None, all dimensions are reduced, and a
   tensor with a single element is returned.
 
   For example:
-- 
GitLab


From d0e31cd4b00f30f5ffb9753f5f1e79f8940b0734 Mon Sep 17 00:00:00 2001
From: "candy.dc" <dingchen.mail@gmail.com>
Date: Mon, 28 May 2018 16:53:59 +0800
Subject: [PATCH 004/610] Fix typo

---
 tensorflow/core/kernels/sparse_matmul_op.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/kernels/sparse_matmul_op.cc b/tensorflow/core/kernels/sparse_matmul_op.cc
index a1f9667b78..866c5dcd52 100644
--- a/tensorflow/core/kernels/sparse_matmul_op.cc
+++ b/tensorflow/core/kernels/sparse_matmul_op.cc
@@ -1490,7 +1490,7 @@ inline void LibxsmmSparseMatMul<TL, TR>::Compute(
 
 #endif  // TENSORFLOW_USE_LIBXSMM
 
-// Here is a an overview of the SparseMatMul code. Note that we assume that the
+// Here is an overview of the SparseMatMul code. Note that we assume that the
 // left matrix is sparse.
 //
 // The matrix "left" is divided into a grid with blocksize of (M, KL). Each
-- 
GitLab


From 69095610798ec7def94fc453dfeaff758e0ee9cd Mon Sep 17 00:00:00 2001
From: Jason Zaman <jason@perfinion.com>
Date: Mon, 28 May 2018 21:50:21 +0800
Subject: [PATCH 005/610] generate-pc.sh: add option to set libdir

Signed-off-by: Jason Zaman <jason@perfinion.com>
---
 tensorflow/c/generate-pc.sh | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/tensorflow/c/generate-pc.sh b/tensorflow/c/generate-pc.sh
index 02a6a58b61..7184ad68fb 100755
--- a/tensorflow/c/generate-pc.sh
+++ b/tensorflow/c/generate-pc.sh
@@ -15,10 +15,12 @@
 # ==============================================================================
 
 TF_PREFIX='/usr/local'
+LIBDIR='lib'
 
 usage() {
     echo "Usage: $0 OPTIONS"
     echo -e "-p, --prefix\tset installation prefix (default: /usr/local)"
+    echo -e "-l, --libdir\tset lib directory (default: lib)"
     echo -e "-v, --version\tset TensorFlow version"
     echo -e "-h, --help\tdisplay this message"
 }
@@ -26,7 +28,7 @@ usage() {
 [ $# == 0 ] && usage && exit 0
 
 # read the options
-ARGS=$(getopt -o p:v:h --long prefix:,version:,help -n $0 -- "$@")
+ARGS=$(getopt -o p:l:v:h --long prefix:,libdir:,version:,help -n $0 -- "$@")
 eval set -- "$ARGS"
 
 # extract options and their arguments into variables.
@@ -38,6 +40,11 @@ while true ; do
                 "") shift 2 ;;
                 *) TF_PREFIX=$2 ; shift 2 ;;
             esac ;;
+        -l|--libdir)
+            case "$2" in
+                "") shift 2 ;;
+                *) LIBDIR=$2 ; shift 2 ;;
+            esac ;;
         -v|--version)
             case "$2" in
                 "") shift 2 ;;
@@ -55,7 +62,7 @@ echo "Generating pkgconfig file for TensorFlow $TF_VERSION in $TF_PREFIX"
 cat << EOF > tensorflow.pc
 prefix=${TF_PREFIX}
 exec_prefix=\${prefix}
-libdir=\${exec_prefix}/lib
+libdir=\${exec_prefix}/${LIBDIR}
 includedir=\${prefix}/include
 
 Name: TensorFlow
-- 
GitLab


From d97695384baad9612e41715cbd7823908ee63bf6 Mon Sep 17 00:00:00 2001
From: Christoph Boeddeker <boeddeker@users.noreply.github.com>
Date: Tue, 29 May 2018 09:00:47 +0200
Subject: [PATCH 006/610] Add a note that stop_gradient in moments does not
 change the gradient

---
 tensorflow/python/ops/nn_impl.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tensorflow/python/ops/nn_impl.py b/tensorflow/python/ops/nn_impl.py
index 783d485892..e2ef1f66b1 100644
--- a/tensorflow/python/ops/nn_impl.py
+++ b/tensorflow/python/ops/nn_impl.py
@@ -689,6 +689,9 @@ def moments(
     # Compute true mean while keeping the dims for proper broadcasting.
     mean = math_ops.reduce_mean(y, axes, keepdims=True, name="mean")
     # sample variance, not unbiased variance
+    # Note: stop_gradient does not change the gradient that gets 
+    #       backpropagated to the mean from the variance calculation,
+    #       because that gradient is zero
     variance = math_ops.reduce_mean(
         math_ops.squared_difference(y, array_ops.stop_gradient(mean)),
         axes,
-- 
GitLab


From 245725bd066e1f972b04676f46376050f804f986 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Tue, 29 May 2018 19:04:55 +0000
Subject: [PATCH 007/610] Add support of string split behavior compatible with
 python's `str.split`

This fix tries to address the issue raised in 18271, where
the existing `tf.string_split` does not match python's `str.split`.
Specifically, `tf.string_split` does not handle the case where
the separator is multi-character.

This fix adds an implementation of string split compatible with `str.split`.
In order to maintain backward compatibility, this fix exposes the new
implementation of `array_ops.string_split_v2` in the `tf.strings.split`
namespace.

This fixes 18271.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/core/kernels/string_split_op.cc | 107 +++++++++++++++++++++
 1 file changed, 107 insertions(+)

diff --git a/tensorflow/core/kernels/string_split_op.cc b/tensorflow/core/kernels/string_split_op.cc
index 4c2b312c34..aeaa562fe7 100644
--- a/tensorflow/core/kernels/string_split_op.cc
+++ b/tensorflow/core/kernels/string_split_op.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 
 namespace tensorflow {
@@ -43,6 +44,46 @@ std::vector<string> Split(const string& str, const string& delimiter,
   return char_vector;
 }
 
+std::vector<string> SplitV2(const string& str, StringPiece sep) {
+  // This SplitV2 method matches the behavior of python's str.split:
+  //   If sep is given, consecutive delimiters are not grouped together
+  //   and are deemed to delimit empty strings (for example, '1,,2'.split(',')
+  //   returns ['1', '', '2']). The sep argument may consist of multiple
+  //   characters (for example, '1<>2<>3'.split('<>') returns ['1', '2', '3']).
+  //   Splitting an empty string with a specified separator returns [''].
+  //
+  //   If sep is not specified or is None, a different splitting algorithm is
+  //   applied: runs of consecutive whitespace are regarded as a single
+  //   separator, and the result will contain no empty strings at the start or
+  //   end if the string has leading or trailing whitespace. Consequently,
+  //   splitting an empty string or a string consisting of just whitespace
+  //   with a None separator returns [].
+
+  StringPiece text(str);
+
+  std::vector<string> result;
+  if (sep.empty()) {
+    StringPiece token;
+    // Remove leading whitespaces.
+    str_util::RemoveLeadingWhitespace(&text);
+    while (str_util::ConsumeNonWhitespace(&text, &token)) {
+      result.emplace_back(std::string(token));
+      str_util::RemoveLeadingWhitespace(&text);
+    }
+    return result;
+  }
+  auto p = std::search(text.begin(), text.end(), sep.begin(), sep.end());
+  while (p != text.end()) {
+    StringPiece token = text.substr(0, p - text.begin());
+    result.emplace_back(std::string(token));
+    text.remove_prefix(token.size());
+    text.remove_prefix(sep.size());
+    p = std::search(text.begin(), text.end(), sep.begin(), sep.end());
+  }
+  result.emplace_back(std::string(text));
+  return result;
+}
+
 }  // namespace
 
 class StringSplitOp : public OpKernel {
@@ -122,6 +163,72 @@ class StringSplitOp : public OpKernel {
   bool skip_empty_;
 };
 
+class StringSplitV2Op : public OpKernel {
+ public:
+  explicit StringSplitV2Op(OpKernelConstruction* context) : OpKernel(context) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor* input_tensor;
+    OP_REQUIRES_OK(ctx, ctx->input("input", &input_tensor));
+    OP_REQUIRES(ctx, TensorShapeUtils::IsVector(input_tensor->shape()),
+                errors::InvalidArgument("input must be a vector, got shape: ",
+                                        input_tensor->shape().DebugString()));
+
+    const auto input_vec = input_tensor->vec<string>();
+    const int64 batch_size = input_vec.dimension(0);
+
+    const Tensor* sep_tensor;
+    OP_REQUIRES_OK(ctx, ctx->input("sep", &sep_tensor));
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(sep_tensor->shape()),
+                errors::InvalidArgument("sep must be a scalar, got shape: ",
+                                        sep_tensor->shape().DebugString()));
+    const auto sep_vec = sep_tensor->flat<string>();
+    StringPiece sep(sep_vec(0));
+    std::vector<string> tokens;
+    // Guess that we'll be unpacking a handful of tokens per example.
+    static constexpr int kReserveSize = 4;
+    tokens.reserve(batch_size * kReserveSize);
+
+    int64 output_size = 0;
+    int64 max_num_entries = 0;
+    std::vector<int64> num_indices(batch_size);
+    for (int64 i = 0; i < batch_size; ++i) {
+      std::vector<string> parts = SplitV2(input_vec(i), sep);
+      int64 n_entries = parts.size();
+      num_indices[i] = n_entries;
+      output_size += n_entries;
+      max_num_entries = std::max(max_num_entries, n_entries);
+      tokens.insert(tokens.end(), parts.begin(), parts.end());
+    }
+
+    Tensor* sp_indices_t;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({output_size, 2}),
+                                             &sp_indices_t));
+    Tensor* sp_tokens_t;
+    OP_REQUIRES_OK(
+        ctx, ctx->allocate_output(1, TensorShape({output_size}), &sp_tokens_t));
+    Tensor* sp_shape_t;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(2, TensorShape({2}), &sp_shape_t));
+
+    auto sp_indices = sp_indices_t->matrix<int64>();
+    auto sp_tokens = sp_tokens_t->vec<string>();
+    auto sp_shape = sp_shape_t->vec<int64>();
+    sp_shape(0) = batch_size;
+    sp_shape(1) = max_num_entries;
+    size_t c = 0;
+    for (size_t i = 0; i < batch_size; ++i) {
+      for (size_t j = 0; j < num_indices[i]; ++j) {
+        sp_indices(c, 0) = i;
+        sp_indices(c, 1) = j;
+        sp_tokens(c) = tokens[c];
+        ++c;
+      }
+    }
+  }
+};
+
 REGISTER_KERNEL_BUILDER(Name("StringSplit").Device(DEVICE_CPU), StringSplitOp);
+REGISTER_KERNEL_BUILDER(Name("StringSplitV2").Device(DEVICE_CPU),
+                        StringSplitV2Op);
 
 }  // namespace tensorflow
-- 
GitLab


From c5121973a96665c5e1420f73e571287f157fa8e3 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Tue, 29 May 2018 19:10:48 +0000
Subject: [PATCH 008/610] Expose StringSplitV2 ops to string_ops.cc

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/core/ops/string_ops.cc | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/tensorflow/core/ops/string_ops.cc b/tensorflow/core/ops/string_ops.cc
index 1d5c743a56..d4d4a32236 100644
--- a/tensorflow/core/ops/string_ops.cc
+++ b/tensorflow/core/ops/string_ops.cc
@@ -134,6 +134,23 @@ REGISTER_OP("StringSplit")
       return Status::OK();
     });
 
+REGISTER_OP("StringSplitV2")
+    .Input("input: string")
+    .Input("sep: string")
+    .Output("indices: int64")
+    .Output("values: string")
+    .Output("shape: int64")
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle unused;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+
+      c->set_output(0, c->Matrix(InferenceContext::kUnknownDim, 2));
+      c->set_output(1, c->Vector(InferenceContext::kUnknownDim));
+      c->set_output(2, c->Vector(2));
+      return Status::OK();
+    });
+
 REGISTER_OP("StringStrip")
     .Input("input: string")
     .Output("output: string")
-- 
GitLab


From d24b52adff3675809aaa623b0c160a526cd1f12a Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 29 May 2018 13:06:57 -0700
Subject: [PATCH 009/610] Automated g4 rollback of changelist 198421828

PiperOrigin-RevId: 198444757
---
 .../compiler/jit/kernels/xla_launch_op.cc     |  2 +-
 .../compiler/jit/xla_compile_on_demand_op.cc  |  3 +-
 tensorflow/compiler/tf2xla/tf2xla.cc          |  3 +-
 tensorflow/compiler/tf2xla/xla_compiler.cc    | 65 ++-----------------
 tensorflow/compiler/tf2xla/xla_compiler.h     |  7 +-
 .../compiler/tf2xla/xla_compiler_test.cc      | 54 ++-------------
 6 files changed, 17 insertions(+), 117 deletions(-)

diff --git a/tensorflow/compiler/jit/kernels/xla_launch_op.cc b/tensorflow/compiler/jit/kernels/xla_launch_op.cc
index 902fe27acd..27287e0f96 100644
--- a/tensorflow/compiler/jit/kernels/xla_launch_op.cc
+++ b/tensorflow/compiler/jit/kernels/xla_launch_op.cc
@@ -148,7 +148,7 @@ void XlaLocalLaunchBase::Compute(OpKernelContext* ctx) {
 
   XlaCompiler::Options options;
   options.client = client;
-  options.device_type = cache->device_type();
+  options.device_type = &cache->device_type();
   options.flib_def = ctx->function_library()->GetFunctionLibraryDefinition();
   options.graph_def_version = ctx->function_library()->graph_def_version();
   options.allow_cpu_custom_calls = (platform_id_ == se::host::kHostPlatformId);
diff --git a/tensorflow/compiler/jit/xla_compile_on_demand_op.cc b/tensorflow/compiler/jit/xla_compile_on_demand_op.cc
index b1943d3e1a..ab644ff5a6 100644
--- a/tensorflow/compiler/jit/xla_compile_on_demand_op.cc
+++ b/tensorflow/compiler/jit/xla_compile_on_demand_op.cc
@@ -151,7 +151,8 @@ Status XlaCompileOnDemandOp::Compile(
   core::ScopedUnref cache_ref(cache);
 
   XlaCompiler::Options options;
-  options.device_type = metadata.jit_device_type();
+  DeviceType device_type = metadata.jit_device_type();
+  options.device_type = &device_type;
   options.client = metadata.client();
   options.flib_def =
       new FunctionLibraryDefinition(OpRegistry::Global(), FunctionDefLibrary{});
diff --git a/tensorflow/compiler/tf2xla/tf2xla.cc b/tensorflow/compiler/tf2xla/tf2xla.cc
index ac768b206e..3a08aa8cf4 100644
--- a/tensorflow/compiler/tf2xla/tf2xla.cc
+++ b/tensorflow/compiler/tf2xla/tf2xla.cc
@@ -263,7 +263,8 @@ Status ConvertGraphToXla(std::unique_ptr<Graph> graph, xla::Client* client,
   // Compile the graph into an XLA computation.
   XlaCompiler::Options compiler_options;
   compiler_options.client = client;
-  compiler_options.device_type = DeviceType(DEVICE_CPU_XLA_JIT);
+  DeviceType device_type(DEVICE_CPU_XLA_JIT);
+  compiler_options.device_type = &device_type;
   compiler_options.flib_def = &graph->flib_def();
   compiler_options.graph_def_version = graph->versions().producer();
   compiler_options.allow_cpu_custom_calls = true;
diff --git a/tensorflow/compiler/tf2xla/xla_compiler.cc b/tensorflow/compiler/tf2xla/xla_compiler.cc
index ccbc74eb31..f7098917b1 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler.cc
+++ b/tensorflow/compiler/tf2xla/xla_compiler.cc
@@ -83,9 +83,12 @@ XlaCompiler::XlaCompiler(XlaCompiler::Options options)
     : options_(options),
       initialization_status_(Status::OK()),
       next_step_id_(1),
-      device_(new XlaCompilationDevice(SessionOptions(), options_.device_type)),
+      device_(
+          new XlaCompilationDevice(SessionOptions(), *options_.device_type)),
       device_mgr_({device_}) {
-  CHECK(!options_.device_type.type_string().empty());
+  // We no longer need the device_type.
+  options_.device_type = nullptr;
+
   if (options_.populate_resource_manager) {
     initialization_status_ =
         (*options_.populate_resource_manager)(device_->resource_manager());
@@ -656,59 +659,6 @@ Status XlaCompiler::CompileSingleOp(
   return CompileGraph(options, name, std::move(graph), args, result);
 }
 
-namespace {
-
-// Check that the ops of all non-functional nodes have been registered.
-string ValidateFunctionDef(const FunctionDef* fdef,
-                           const FunctionLibraryDefinition& flib_def) {
-  std::vector<string> invalid_ops;
-  for (const NodeDef& node : fdef->node_def()) {
-    const string& op = node.op();
-    if (op == FunctionLibraryDefinition::kGradientOp || flib_def.Find(op)) {
-      continue;
-    }
-    const OpDef* op_def;
-    if (!OpRegistry::Global()->LookUpOpDef(op, &op_def).ok()) {
-      invalid_ops.push_back(op);
-    }
-  }
-  return tensorflow::str_util::Join(invalid_ops, ", ");
-}
-
-// Check that the graph doesn't have any nodes incompatible with given
-// device_type.
-Status ValidateGraph(const Graph* graph,
-                     const FunctionLibraryDefinition& flib_def,
-                     const DeviceType& device_type, const string& name) {
-  std::vector<string> invalid_ops;
-  for (const Node* node : graph->nodes()) {
-    if (node->type_string() == FunctionLibraryDefinition::kGradientOp) {
-      continue;
-    }
-    const FunctionDef* fdef = flib_def.Find(node->def().op());
-    if (fdef) {
-      string error_msg = ValidateFunctionDef(fdef, flib_def);
-      if (!error_msg.empty()) {
-        invalid_ops.push_back(
-            strings::StrCat(node->def().op(), ":{", error_msg, "}"));
-      }
-      continue;
-    }
-    if (!FindKernelDef(device_type, node->def(), nullptr, nullptr).ok()) {
-      invalid_ops.push_back(node->def().op());
-    }
-  }
-  if (!invalid_ops.empty()) {
-    return errors::InvalidArgument(strings::StrCat(
-        "Detected unsupported operations when trying to compile graph ", name,
-        " on ", device_type.type_string(), ":",
-        tensorflow::str_util::Join(invalid_ops, ", ")));
-  }
-  return Status::OK();
-}
-
-}  // namespace
-
 Status XlaCompiler::CompileGraph(const XlaCompiler::CompileOptions& options,
                                  string const& name,
                                  std::unique_ptr<Graph> graph,
@@ -731,11 +681,6 @@ Status XlaCompiler::CompileGraph(const XlaCompiler::CompileOptions& options,
       FunctionalizeControlFlow(flib_runtime_->GetFunctionLibraryDefinition(),
                                graph.get(), local_flib_def_.get()));
 
-  // Detect ops incompatible with the device_type.
-  // FunctionalizeControlFlow may remove some unsupported ops.
-  TF_RETURN_IF_ERROR(ValidateGraph(graph.get(), *options_.flib_def,
-                                   options_.device_type, name));
-
   xla::XlaBuilder builder(name);
   XlaContext* context = new XlaContext(
       this, &builder, options_.allow_cpu_custom_calls,
diff --git a/tensorflow/compiler/tf2xla/xla_compiler.h b/tensorflow/compiler/tf2xla/xla_compiler.h
index 76f4c4c1ea..bf496bd8bc 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler.h
+++ b/tensorflow/compiler/tf2xla/xla_compiler.h
@@ -18,7 +18,6 @@ limitations under the License.
 
 #include "tensorflow/compiler/tf2xla/host_compute_metadata.pb.h"
 #include "tensorflow/compiler/tf2xla/xla_compilation_device.h"
-#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
@@ -245,9 +244,9 @@ class XlaCompiler {
   typedef std::function<TensorShape(const TensorShape&, DataType)>
       ShapeRepresentationFn;
   struct Options {
-    // Name of the compilation device to use. It must be set by the caller.
-    // The default empty value is invalid.
-    DeviceType device_type = DeviceType("");
+    // Name of the compilation device to use. Needs to be live only during
+    // XlaCompiler's constructor.
+    const DeviceType* device_type = nullptr;
 
     xla::Client* client = nullptr;
 
diff --git a/tensorflow/compiler/tf2xla/xla_compiler_test.cc b/tensorflow/compiler/tf2xla/xla_compiler_test.cc
index 246b386f38..55772ca324 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler_test.cc
+++ b/tensorflow/compiler/tf2xla/xla_compiler_test.cc
@@ -45,6 +45,8 @@ namespace tensorflow {
 
 class XlaCompilerTest : public ::testing::Test {
  protected:
+  XlaCompilerTest() : cpu_device_type_(DEVICE_CPU_XLA_JIT) {}
+
   void SetUp() override {
     client_ = xla::ClientLibrary::LocalClientOrDie();
 
@@ -56,7 +58,7 @@ class XlaCompilerTest : public ::testing::Test {
 
   XlaCompiler::Options DefaultOptions() {
     XlaCompiler::Options options;
-    options.device_type = DeviceType(DEVICE_CPU_XLA_JIT);
+    options.device_type = &cpu_device_type_;
     options.client = client_;
     options.flib_def = flib_def_.get();
     return options;
@@ -66,6 +68,7 @@ class XlaCompilerTest : public ::testing::Test {
     return compiler->local_flib_def_.get();
   }
 
+  DeviceType cpu_device_type_;
   xla::Client* client_;
   std::unique_ptr<FunctionLibraryDefinition> flib_def_;
 };
@@ -976,54 +979,5 @@ TEST_F(XlaCompilerTest, ArgRetvalShapeRepresentationFunction) {
   EXPECT_TRUE(xla::LiteralTestUtil::Equal(*expected_literal, *actual_literal));
 }
 
-// Tests a graph which has a function with an invalid op.
-TEST_F(XlaCompilerTest, FunctionWithInvalidOp) {
-  XlaCompiler compiler(DefaultOptions());
-
-  FunctionDefLibrary flib;
-  FunctionDef fn = FillFn();
-  NodeDef* node = fn.add_node_def();
-  node->set_name("Invalid");
-  node->set_op("InvalidOp"); /* unsupported op */
-  node = fn.add_node_def();
-  node->set_name("Switch");
-  node->set_op("Switch"); /* control flow node */
-  *flib.add_function() = fn;
-
-  TF_ASSERT_OK(flib_def_->AddFunctionDef(fn));
-
-  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
-
-  Scope scope = Scope::NewRootScope().ExitOnError();
-  auto value = ops::Const<int32>(scope.WithOpName("value"), 1, {});
-  auto shape = ops::Const<int32>(scope.WithOpName("shape"), {5}, {1});
-  TF_ASSERT_OK(scope.graph()->AddFunctionLibrary(flib));
-
-  NodeDef def;
-  TF_ASSERT_OK(NodeDefBuilder("fill_fn", "FillFn", flib_def_.get())
-                   .Input(value.name(), 0, DT_INT32)
-                   .Input(shape.name(), 1, DT_INT32)
-                   .Finalize(&def));
-  Status status;
-  Node* fill = scope.graph()->AddNode(def, &status);
-  TF_ASSERT_OK(status);
-  TF_ASSERT_OK(scope.DoShapeInference(fill));
-  scope.graph()->AddEdge(value.node(), 0, fill, 0);
-  scope.graph()->AddEdge(shape.node(), 0, fill, 1);
-
-  auto retval = ops::_Retval(scope.WithOpName("retval"), Output(fill), 0);
-
-  TF_ASSERT_OK(scope.ToGraph(graph.get()));
-
-  std::vector<XlaCompiler::Argument> args;
-  XlaCompiler::CompilationResult result;
-  status = compiler.CompileGraph(XlaCompiler::CompileOptions(), "fill",
-                                 std::move(graph), args, &result);
-  ASSERT_FALSE(status.ok());
-  EXPECT_TRUE(
-      str_util::StrContains(status.error_message(), "FillFn:{InvalidOp}"))
-      << status.error_message();
-}
-
 }  // namespace
 }  // namespace tensorflow
-- 
GitLab


From d3152a33e4cbbf24eb01ec6369520400a16aafd0 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 29 May 2018 13:44:28 -0700
Subject: [PATCH 010/610] Make the quantize_and_dequantize op use the full
 quantized range when possible.

PiperOrigin-RevId: 198450816
---
 .../api_def_QuantizeAndDequantizeV2.pbtxt     | 77 ++++++++--------
 .../core/kernels/quantize_and_dequantize_op.h | 89 +++++++++----------
 .../quantize_and_dequantize_op_test.cc        | 46 +++++-----
 3 files changed, 106 insertions(+), 106 deletions(-)

diff --git a/tensorflow/core/api_def/base_api/api_def_QuantizeAndDequantizeV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_QuantizeAndDequantizeV2.pbtxt
index 1fc9c9034a..41a9cfaa27 100644
--- a/tensorflow/core/api_def/base_api/api_def_QuantizeAndDequantizeV2.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_QuantizeAndDequantizeV2.pbtxt
@@ -9,21 +9,24 @@ END
   in_arg {
     name: "input_min"
     description: <<END
-If range_given, this is the min of the range, otherwise this input
-will be ignored.
+If `range_given == True`, this specifies the minimum input value that needs to
+be represented, otherwise it is determined from the min value of the `input`
+tensor.
 END
   }
   in_arg {
     name: "input_max"
     description: <<END
-If range_given, this is the max of the range, otherwise this input
-will be ignored.
+If `range_given == True`, this specifies the maximum input value that needs to
+be represented, otherwise it is determined from the max value of the `input`
+tensor.
 END
   }
   attr {
     name: "signed_input"
     description: <<END
-If the quantization is signed or unsigned.
+Whether the quantization is signed or unsigned. (actually this parameter should
+have been called <b>`signed_output`</b>)
 END
   }
   attr {
@@ -35,7 +38,7 @@ END
   attr {
     name: "range_given"
     description: <<END
-If the range is given or should be computed from the tensor.
+Whether the range is given or should be determined from the `input` tensor.
 END
   }
   summary: "Quantizes then dequantizes a tensor."
@@ -46,48 +49,48 @@ This op simulates the precision loss from the quantized forward pass by:
 2. Dequantizing it back to floating point numbers for the following ops, most
    likely matmul.
 
-There are different ways to quantize. This version does not use the full range
-of the output type, choosing to elide the lowest possible value for symmetry
-(e.g., output range is -127 to 127, not -128 to 127 for signed 8 bit
-quantization), so that 0.0 maps to 0.
+There are different ways to quantize. This version uses only scaling, so 0.0
+maps to 0.
 
-To perform this op, we first find the range of values in our tensor. The range
-we use is always centered on 0, so we find m such that
+From the specified 'num_bits' in the quantized output type, it determines
+minimum and maximum representable quantized values.
 
-1. m = max(abs(input_min), abs(input_max)) if range_given is true,
-2. m = max(abs(min_elem(input)), abs(max_elem(input))) otherwise.
+e.g.
 
-Our input tensor range is then [-m, m].
+*   [-128, 127] for signed, num_bits = 8, or
+*   [0, 255] for unsigned, num_bits = 8.
 
-Next, we choose our fixed-point quantization buckets, [min_fixed, max_fixed].
-If signed_input is true, this is
+If range_given == False, the initial input_min, input_max will be determined
+automatically as the minimum and maximum values in the input tensor, otherwise
+the specified values of input_min, input_max are used.
 
-  [min_fixed, max_fixed ] =
-      [-(1 << (num_bits - 1) - 1), (1 << (num_bits - 1)) - 1].
+Note: If the input_min, input_max are specified, they do not need to equal the
+actual minimum and maximum values in the tensor. e.g. in some cases it may be
+beneficial to specify these values such that the low probability extremes of the
+input distribution are clipped.
 
-Otherwise, if signed_input is false, the fixed-point range is
+This op determines the maximum scale_factor that would map the initial
+[input_min, input_max] range to a range that lies within the representable
+quantized range.
 
-  [min_fixed, max_fixed] = [0, (1 << num_bits) - 1].
+It determines the scale from one of input_min and input_max, then updates the
+other one to maximize the representable range.
 
-From this we compute our scaling factor, s:
+e.g.
 
-  s = (max_fixed - min_fixed) / (2 * m).
+*   if the output is signed, num_bits = 8, [input_min, input_max] = [-10.0,
+    5.0]: it would use a scale_factor of -128 / -10.0 = 12.8. In this case, it
+    would update input_max to be 127 / 12.8 = 9.921875.
+*   if the output is signed, num_bits = 8, [input_min, input_max] = [-10.0,
+    10.0]: it would use a scale_factor of 127 / 10.0 = 12.7. In this case, it
+    would update input_min to be -128.0 / 12.7 = -10.07874.
+*   if the output is unsigned, input_min is forced to be 0, and only the
+    specified input_max is used.
 
-Now we can quantize and dequantize the elements of our tensor.  An element e
-is transformed into e':
+After determining the scale_factor and updating the input range, it applies the
+following to each value in the 'input' tensor.
 
-  e' = (e * s).round_to_nearest() / s.
+output = round(clamp(value, input_min, input_max) * scale_factor) / scale_factor.
 
-Note that we have a different number of buckets in the signed vs. unsigned
-cases.  For example, if num_bits == 8, we get 254 buckets in the signed case
-vs. 255 in the unsigned case.
-
-For example, suppose num_bits = 8 and m = 1.  Then
-
-  [min_fixed, max_fixed] = [-127, 127], and
-  s = (127 + 127) / 2 = 127.
-
-Given the vector {-1, -0.5, 0, 0.3}, this is quantized to
-{-127, -63, 0, 38}, and dequantized to {-1, -63.0/127, 0, 38.0/127}.
 END
 }
diff --git a/tensorflow/core/kernels/quantize_and_dequantize_op.h b/tensorflow/core/kernels/quantize_and_dequantize_op.h
index 3b09ea2527..906d507c8a 100644
--- a/tensorflow/core/kernels/quantize_and_dequantize_op.h
+++ b/tensorflow/core/kernels/quantize_and_dequantize_op.h
@@ -23,6 +23,8 @@ limitations under the License.
 namespace tensorflow {
 namespace functor {
 
+// TODO(pauldonnelly): 'signed_input' should really be called 'signed_output'.
+
 template <typename Device, typename T>
 struct QuantizeAndDequantizeOneScaleFunctor {
   void operator()(const Device& d, typename TTypes<T>::ConstVec input,
@@ -49,56 +51,51 @@ struct QuantizeAndDequantizeOneScaleImpl {
     d.memcpyDeviceToHost(&min_range, input_min.data(), sizeof(T));
     d.memcpyDeviceToHost(&max_range, input_max.data(), sizeof(T));
 
-    // Make sure the range is symmetric for signed quantization, or start from
-    // 0 for unsigned quantization.
-    max_range = std::max(std::abs(max_range), std::abs(min_range));
+    // Calculate the range for the simulated integer quantization:
+    // e.g. [-128,127] for signed = true, num_bits = 8,
+    // or [0, 255] for signed = false, num_bits = 8.
+    const int64 min_quantized = signed_input ? -(1ULL << (num_bits - 1)) : 0;
+    const int64 max_quantized = min_quantized + ((1ULL << num_bits) - 1);
 
-    // If both min and max are 0, then the output should be just 0.
-    if (max_range == 0) {
-      out.device(d) = input.constant(T(0));
-      return;
-    }
+    // Determine the maximum scaling factor that would scale
+    // [min_range, max_range] to not exceed [min_quantized, max_quantized],
+    // while keeping 0 unchanged.
+    const T scale_from_min_side = (min_quantized * min_range > 0)
+                                      ? min_quantized / min_range
+                                      : std::numeric_limits<T>::max();
+    const T scale_from_max_side = (max_quantized * max_range > 0)
+                                      ? max_quantized / max_range
+                                      : std::numeric_limits<T>::max();
 
-    if (signed_input) {
-      min_range = -max_range;
+    // Note: Avoids changing the side of the range that determines scale.
+    T scale, inverse_scale;
+    if (scale_from_min_side < scale_from_max_side) {
+      scale = scale_from_min_side;
+      inverse_scale = min_range / min_quantized;
+      max_range = max_quantized * inverse_scale;
+    } else {
+      scale = scale_from_max_side;
+      inverse_scale = max_range / max_quantized;
+      min_range = min_quantized * inverse_scale;
+    }
 
-      // If it is signed, we try to keep 0.0 being 0 and drop one bucket. For
-      // example, if it is 8 bits, we have the range [-127, 127]. So for input
-      // range of [-x, x], the scale should be 254/(2*x).
-      T scale = static_cast<T>((uint64_t{1} << (num_bits - 1)) - 1) / max_range;
-      T inverse_scale = T(1.0) / scale;
-      if (range_given) {
-        out.device(d) =
-            ((input.cwiseMin(max_range).cwiseMax(min_range) - min_range) *
-                 scale +
-             T(0.5))
-                    .floor() *
-                inverse_scale +
-            min_range;
-      } else {
-        // No need to compare with min and max as they are measured from the
-        // tensor.
-        out.device(d) =
-            ((input - min_range) * scale + T(0.5)).floor() * inverse_scale +
-            min_range;
-      }
+    if (range_given) {
+      // Note: The clamping here is to avoid overflow in the quantized type.
+      // The semantics of the op does not guarantee to clamp to the specified
+      // min_range and max_range - because we may have changed either min_range
+      // or max_range.
+      out.device(d) =
+          ((input.cwiseMin(max_range).cwiseMax(min_range) - min_range) * scale +
+           T(0.5))
+                  .floor() *
+              inverse_scale +
+          min_range;
     } else {
-      min_range = 0;
-      // If it is unsigned and num_bits == 8, the range with 8 bits is [0, 255].
-      // If the input range is [0, x], then the scale is x/255 instead of 254 as
-      // in the case above.
-      T scale = static_cast<T>((uint64_t{1} << num_bits) - 1) / max_range;
-      T inverse_scale = 1.0 / scale;
-      if (range_given) {
-        out.device(d) =
-            ((input.cwiseMin(max_range).cwiseMax(min_range)) * scale + T(0.5))
-                .floor() *
-            inverse_scale;
-      } else {
-        // No need to compare with min and max as they are measured from the
-        // tensor.
-        out.device(d) = (input * scale + T(0.5)).floor() * inverse_scale;
-      }
+      // No need to clamp to min_range and max_range in this case as they were
+      // measured from the tensor.
+      out.device(d) =
+          ((input - min_range) * scale + T(0.5)).floor() * inverse_scale +
+          min_range;
     }
   }
 };
diff --git a/tensorflow/core/kernels/quantize_and_dequantize_op_test.cc b/tensorflow/core/kernels/quantize_and_dequantize_op_test.cc
index e41df12d91..629c698503 100644
--- a/tensorflow/core/kernels/quantize_and_dequantize_op_test.cc
+++ b/tensorflow/core/kernels/quantize_and_dequantize_op_test.cc
@@ -105,13 +105,13 @@ TEST_F(QuantizeAndDequantizeTest, Convert_1D_tensor_with_int8) {
   AddInputFromArray<float>(TensorShape({}), {0.0});  // Min
   AddInputFromArray<float>(TensorShape({}), {0.0});  // Max
 
-  // With int8, the tensor is quantized to {-127, -63, 0, 38, 102, 70}.
+  // With int8, the tensor is quantized to {-128, -64, 0, 38, 102, 71}.
   // Scale is: 1/127
-  // Then it is dequantized to {-1, -63.0/127, 0, 38.0/127, 102.0/127, 70.0/127}
+  // Then it is dequantized to {-1, -0.5, 0, 38.0/128, 102.0/128, 71.0/128}
   TF_ASSERT_OK(RunOpKernel());
   Tensor expected(allocator(), DT_FLOAT, TensorShape({6}));
-  test::FillValues<float>(
-      &expected, {-1, -63.0 / 127, 0, 38.0 / 127, 102.0 / 127, 70.0 / 127});
+  test::FillValues<float>(&expected,
+                          {-1, -0.5, 0, 38.0 / 128, 102.0 / 128, 71.0 / 128});
   test::ExpectTensorNear<float>(expected, *GetOutput(0), 1e-5);
 
   // Ensure that the inputs haven't been changed.
@@ -136,13 +136,13 @@ TEST_F(QuantizeAndDequantizeTest, Convert_1D_tensor_with_int8_V3) {
   AddInputFromArray<float>(TensorShape({}), {0.0});  // Max
   AddInputFromArray<int32>(TensorShape({}), {8});    // num_bits
 
-  // With int8, the tensor is quantized to {-127, -63, 0, 38, 102, 70}.
-  // Scale is: 1/127
-  // Then it is dequantized to {-1, -63.0/127, 0, 38.0/127, 102.0/127, 70.0/127}
+  // With int8, the tensor is quantized to {-128, -64, 0, 38, 102, 71}.
+  // Scale is: 1/128
+  // Then it is dequantized to {-1, -64.0/128, 0, 38.0/128, 102.0/128, 71.0/128}
   TF_ASSERT_OK(RunOpKernel());
   Tensor expected(allocator(), DT_FLOAT, TensorShape({6}));
-  test::FillValues<float>(
-      &expected, {-1, -63.0 / 127, 0, 38.0 / 127, 102.0 / 127, 70.0 / 127});
+  test::FillValues<float>(&expected,
+                          {-1, -0.5, 0, 38.0 / 128, 102.0 / 128, 71.0 / 128});
   test::ExpectTensorNear<float>(expected, *GetOutput(0), 1e-5);
 
   // Ensure that the inputs haven't been changed.
@@ -166,12 +166,11 @@ TEST_F(QuantizeAndDequantizeTest, Convert_1D_tensor_with_int4) {
   AddInputFromArray<float>(TensorShape({}), {0.0});  // Min
   AddInputFromArray<float>(TensorShape({}), {0.0});  // Max
 
-  // With int4, the tensor is quantized to {-7, -3, 0, 2, 6, 4}.
-  // Scale is: 1/7
+  // With int4, the tensor is quantized to {-8, -4, 0, 2, 6, 4}.
+  // Scale is: 1/8
   TF_ASSERT_OK(RunOpKernel());
   Tensor expected(allocator(), DT_FLOAT, TensorShape({6}));
-  test::FillValues<float>(&expected,
-                          {-1, -3.0 / 7, 0, 2.0 / 7, 6.0 / 7, 4.0 / 7});
+  test::FillValues<float>(&expected, {-1, -0.5, 0, 0.25, 0.75, 0.5});
   test::ExpectTensorNear<float>(expected, *GetOutput(0), 1e-5);
 
   // Ensure that the inputs haven't been changed.
@@ -196,12 +195,11 @@ TEST_F(QuantizeAndDequantizeTest, Convert_1D_tensor_with_int4_V3) {
   AddInputFromArray<float>(TensorShape({}), {0.0});  // Max
   AddInputFromArray<int32>(TensorShape({}), {4});    // num_bits
 
-  // With int4, the tensor is quantized to {-7, -3, 0, 2, 6, 4}.
-  // Scale is: 1/7
+  // With int4, the tensor is quantized to {-8, -4, 0, 2, 6, 4}.
+  // Scale is: 1/8
   TF_ASSERT_OK(RunOpKernel());
   Tensor expected(allocator(), DT_FLOAT, TensorShape({6}));
-  test::FillValues<float>(&expected,
-                          {-1, -3.0 / 7, 0, 2.0 / 7, 6.0 / 7, 4.0 / 7});
+  test::FillValues<float>(&expected, {-1, -0.5, 0, 0.25, 0.75, 0.5});
   test::ExpectTensorNear<float>(expected, *GetOutput(0), 1e-5);
 
   // Ensure that the inputs haven't been changed.
@@ -228,13 +226,14 @@ TEST_F(QuantizeAndDequantizeTest, Convert_2D_tensor_with_int8_range_given) {
   AddInputFromArray<float>(TensorShape({}), {1.0});   // Max
 
   // Note that the range is given as [-1, 1].
-  // With int8, the tensor is quantized to {-102, -63, 0, 38, 102, 70, -127,
+  // With int8, the tensor is quantized to {-102, -63, 0, 38, 102, 70, -128,
   // 127}.
   // Scale is: 1/127
   TF_ASSERT_OK(RunOpKernel());
   Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 4}));
-  test::FillValues<float>(&expected, {-102.0 / 127, -63.0 / 127, 0, 38.0 / 127,
-                                      102.0 / 127, 70.0 / 127, -1, 1});
+  test::FillValues<float>(
+      &expected, {-102.0 / 127, -63.0 / 127, 0, 38.0 / 127, 102.0 / 127,
+                  70.0 / 127, -128.0 / 127, 1});
   test::ExpectTensorNear<float>(expected, *GetOutput(0), 1e-5);
 }
 
@@ -258,13 +257,14 @@ TEST_F(QuantizeAndDequantizeTest, Convert_2D_tensor_with_int8_range_given_V3) {
   AddInputFromArray<int32>(TensorShape({}), {8});     // num_bits
 
   // Note that the range is given as [-1, 1].
-  // With int8, the tensor is quantized to {-102, -63, 0, 38, 102, 70, -127,
+  // With int8, the tensor is quantized to {-102, -63, 0, 38, 102, 70, -128,
   // 127}.
   // Scale is: 1/127
   TF_ASSERT_OK(RunOpKernel());
   Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 4}));
-  test::FillValues<float>(&expected, {-102.0 / 127, -63.0 / 127, 0, 38.0 / 127,
-                                      102.0 / 127, 70.0 / 127, -1, 1});
+  test::FillValues<float>(
+      &expected, {-102.0 / 127, -63.0 / 127, 0, 38.0 / 127, 102.0 / 127,
+                  70.0 / 127, -128.0 / 127, 1});
   test::ExpectTensorNear<float>(expected, *GetOutput(0), 1e-5);
 }
 
-- 
GitLab


From c8ee3ae53163b0cb12e1c9d6ecd23ab0b59c8f60 Mon Sep 17 00:00:00 2001
From: Peter Hawkins <phawkins@google.com>
Date: Tue, 29 May 2018 13:53:17 -0700
Subject: [PATCH 011/610] [TF:XLA] Implement Bucketize.

PiperOrigin-RevId: 198452289
---
 tensorflow/compiler/tests/BUILD               | 13 ++++
 .../compiler/tests/bucketize_op_test.py       | 78 +++++++++++++++++++
 tensorflow/compiler/tf2xla/kernels/BUILD      |  1 +
 .../compiler/tf2xla/kernels/bucketize_op.cc   | 67 ++++++++++++++++
 4 files changed, 159 insertions(+)
 create mode 100644 tensorflow/compiler/tests/bucketize_op_test.py
 create mode 100644 tensorflow/compiler/tf2xla/kernels/bucketize_op.cc

diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD
index 4c291d2383..b51c11bf6e 100644
--- a/tensorflow/compiler/tests/BUILD
+++ b/tensorflow/compiler/tests/BUILD
@@ -120,6 +120,19 @@ tf_xla_py_test(
     ],
 )
 
+tf_xla_py_test(
+    name = "bucketize_op_test",
+    size = "small",
+    srcs = ["bucketize_op_test.py"],
+    deps = [
+        ":xla_test",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
 tf_xla_py_test(
     name = "categorical_op_test",
     size = "small",
diff --git a/tensorflow/compiler/tests/bucketize_op_test.py b/tensorflow/compiler/tests/bucketize_op_test.py
new file mode 100644
index 0000000000..fde9759a1c
--- /dev/null
+++ b/tensorflow/compiler/tests/bucketize_op_test.py
@@ -0,0 +1,78 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for bucketize_op."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.compiler.tests.xla_test import XLATestCase
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors_impl
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+
+class BucketizationOpTest(XLATestCase):
+
+  def testInt(self):
+    with self.test_session() as sess:
+      p = array_ops.placeholder(dtypes.int32)
+      with self.test_scope():
+        op = math_ops._bucketize(p, boundaries=[0, 3, 8, 11])
+      expected_out = [0, 1, 1, 2, 2, 3, 3, 4, 4]
+      self.assertAllEqual(expected_out,
+                          sess.run(op, {p: [-5, 0, 2, 3, 5, 8, 10, 11, 12]}))
+
+  def testFloat(self):
+    with self.test_session() as sess:
+      p = array_ops.placeholder(dtypes.float32)
+      with self.test_scope():
+        op = math_ops._bucketize(p, boundaries=[0., 3., 8., 11.])
+      expected_out = [0, 1, 1, 2, 2, 3, 3, 4, 4]
+      self.assertAllEqual(
+          expected_out,
+          sess.run(op, {p: [-5., 0., 2., 3., 5., 8., 10., 11., 12.]}))
+
+  def test2DInput(self):
+    with self.test_session() as sess:
+      p = array_ops.placeholder(dtypes.float32)
+      with self.test_scope():
+        op = math_ops._bucketize(p, boundaries=[0, 3, 8, 11])
+      expected_out = [[0, 1, 1, 2, 2], [3, 3, 4, 4, 1]]
+      self.assertAllEqual(
+          expected_out, sess.run(op,
+                                 {p: [[-5, 0, 2, 3, 5], [8, 10, 11, 12, 0]]}))
+
+  def testInvalidBoundariesOrder(self):
+    with self.test_session() as sess:
+      p = array_ops.placeholder(dtypes.int32)
+      with self.test_scope():
+        op = math_ops._bucketize(p, boundaries=[0, 8, 3, 11])
+      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                   "Expected sorted boundaries"):
+        sess.run(op, {p: [-5, 0]})
+
+  def testBoundariesNotList(self):
+    with self.test_session():
+      with self.assertRaisesRegexp(TypeError, "Expected list.*"):
+        p = array_ops.placeholder(dtypes.int32)
+        with self.test_scope():
+          math_ops._bucketize(p, boundaries=0)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/compiler/tf2xla/kernels/BUILD b/tensorflow/compiler/tf2xla/kernels/BUILD
index e6da157c11..edd2ab6301 100644
--- a/tensorflow/compiler/tf2xla/kernels/BUILD
+++ b/tensorflow/compiler/tf2xla/kernels/BUILD
@@ -18,6 +18,7 @@ tf_kernel_library(
         "bcast_ops.cc",
         "bias_ops.cc",
         "binary_ops.cc",
+        "bucketize_op.cc",
         "cast_op.cc",
         "categorical_op.cc",
         "cholesky_op.cc",
diff --git a/tensorflow/compiler/tf2xla/kernels/bucketize_op.cc b/tensorflow/compiler/tf2xla/kernels/bucketize_op.cc
new file mode 100644
index 0000000000..ca9a6b4068
--- /dev/null
+++ b/tensorflow/compiler/tf2xla/kernels/bucketize_op.cc
@@ -0,0 +1,67 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <algorithm>
+
+#include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/compiler/xla/client/lib/arithmetic.h"
+#include "tensorflow/core/framework/op_kernel.h"
+
+namespace tensorflow {
+namespace {
+
+class BucketizeOp : public XlaOpKernel {
+ public:
+  explicit BucketizeOp(OpKernelConstruction* context) : XlaOpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("boundaries", &boundaries_));
+    OP_REQUIRES(context, std::is_sorted(boundaries_.begin(), boundaries_.end()),
+                errors::InvalidArgument("Expected sorted boundaries"));
+  }
+
+  void Compile(XlaOpKernelContext* context) override {
+    xla::XlaBuilder* builder = context->builder();
+    const DataType dtype = context->input_type(0);
+    xla::XlaOp input = context->Input(0);
+
+    xla::XlaOp boundaries = builder->ConstantR1<float>(boundaries_);
+    // TODO(phawkins): the following behavior matches the behavior of the core
+    // Bucketize kernel. However, comparing an int32 or int64 against float may
+    // lead to inaccurate bucketing due to rounding.
+    if (dtype == DT_DOUBLE) {
+      input = builder->ConvertElementType(input, xla::F64);
+      boundaries = builder->ConvertElementType(boundaries, xla::F64);
+    } else {
+      input = builder->ConvertElementType(input, xla::F32);
+    }
+    xla::XlaOp comparison = builder->ConvertElementType(
+        builder->Ge(builder->Broadcast(input, {1}), boundaries,
+                    /*broadcast_dimensions=*/{0}),
+        xla::S32);
+    xla::XlaOp buckets = builder->Reduce(
+        comparison, /*init_value=*/builder->ConstantR0<int32>(0),
+        /*computation=*/xla::CreateScalarAddComputation(xla::S32, builder),
+        /*dimensions_to_reduce=*/{0});
+    context->SetOutput(0, buckets);
+  }
+
+ private:
+  std::vector<float> boundaries_;
+};
+
+REGISTER_XLA_OP(Name("Bucketize"), BucketizeOp);
+
+}  // namespace
+}  // namespace tensorflow
-- 
GitLab


From 657cc1d40cab29064508d74586c68b5846e46f00 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Tue, 29 May 2018 19:12:06 +0000
Subject: [PATCH 012/610] Expose `tf.strings.split` with the new implementation

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/python/ops/string_ops.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/tensorflow/python/ops/string_ops.py b/tensorflow/python/ops/string_ops.py
index ae79c01949..62726434aa 100644
--- a/tensorflow/python/ops/string_ops.py
+++ b/tensorflow/python/ops/string_ops.py
@@ -91,6 +91,20 @@ def string_split(source, delimiter=" ", skip_empty=True):  # pylint: disable=inv
   shape.set_shape([2])
   return sparse_tensor.SparseTensor(indices, values, shape)
 
+@tf_export("strings.split")
+def string_split_v2(source, sep=None):
+  if sep is None:
+    sep = ''
+  sep = ops.convert_to_tensor(sep, dtype=dtypes.string)
+  source = ops.convert_to_tensor(source, dtype=dtypes.string)
+
+  indices, values, shape = gen_string_ops.string_split_v2(
+      source, sep=sep)
+  indices.set_shape([None, 2])
+  values.set_shape([None])
+  shape.set_shape([2])
+  return sparse_tensor.SparseTensor(indices, values, shape)
+
 
 def _reduce_join_reduction_dims(x, axis, reduction_indices):
   """Returns range(rank(x) - 1, 0, -1) if reduction_indices is None."""
-- 
GitLab


From 1f6a3666f45fe504f4f5f8d91a4215dcb4babda6 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Tue, 29 May 2018 19:12:36 +0000
Subject: [PATCH 013/610] Add test cases for tf.strings.split

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/core/kernels/string_split_op.cc    | 31 ++++++++--
 tensorflow/core/ops/string_ops.cc             |  1 +
 .../kernel_tests/string_split_op_test.py      | 61 +++++++++++++++++++
 tensorflow/python/ops/string_ops.py           |  4 +-
 4 files changed, 91 insertions(+), 6 deletions(-)

diff --git a/tensorflow/core/kernels/string_split_op.cc b/tensorflow/core/kernels/string_split_op.cc
index aeaa562fe7..3996ff0027 100644
--- a/tensorflow/core/kernels/string_split_op.cc
+++ b/tensorflow/core/kernels/string_split_op.cc
@@ -44,7 +44,7 @@ std::vector<string> Split(const string& str, const string& delimiter,
   return char_vector;
 }
 
-std::vector<string> SplitV2(const string& str, StringPiece sep) {
+std::vector<string> SplitV2(const string& str, StringPiece sep, int maxsplit) {
   // This SplitV2 method matches the behavior of python's str.split:
   //   If sep is given, consecutive delimiters are not grouped together
   //   and are deemed to delimit empty strings (for example, '1,,2'.split(',')
@@ -59,25 +59,42 @@ std::vector<string> SplitV2(const string& str, StringPiece sep) {
   //   splitting an empty string or a string consisting of just whitespace
   //   with a None separator returns [].
 
+  std::vector<string> result;
+
   StringPiece text(str);
+  if (maxsplit == 0) {
+    result.emplace_back(std::string(text));
+    return result;
+  }
 
-  std::vector<string> result;
   if (sep.empty()) {
     StringPiece token;
     // Remove leading whitespaces.
     str_util::RemoveLeadingWhitespace(&text);
+    int split = 0;
     while (str_util::ConsumeNonWhitespace(&text, &token)) {
       result.emplace_back(std::string(token));
       str_util::RemoveLeadingWhitespace(&text);
+      ++split;
+      if (maxsplit > 0 && split == maxsplit) {
+        result.emplace_back(std::string(text));
+        return result;
+      }
     }
     return result;
   }
   auto p = std::search(text.begin(), text.end(), sep.begin(), sep.end());
+  int split = 0;
   while (p != text.end()) {
     StringPiece token = text.substr(0, p - text.begin());
     result.emplace_back(std::string(token));
     text.remove_prefix(token.size());
     text.remove_prefix(sep.size());
+    ++split;
+    if (maxsplit > 0 && split == maxsplit) {
+      result.emplace_back(std::string(text));
+      return result;
+    }
     p = std::search(text.begin(), text.end(), sep.begin(), sep.end());
   }
   result.emplace_back(std::string(text));
@@ -165,7 +182,10 @@ class StringSplitOp : public OpKernel {
 
 class StringSplitV2Op : public OpKernel {
  public:
-  explicit StringSplitV2Op(OpKernelConstruction* context) : OpKernel(context) {}
+  explicit StringSplitV2Op(OpKernelConstruction* context)
+      : OpKernel(context), maxsplit_(-1) {
+    context->GetAttr("maxsplit", &maxsplit_);
+  }
 
   void Compute(OpKernelContext* ctx) override {
     const Tensor* input_tensor;
@@ -193,7 +213,7 @@ class StringSplitV2Op : public OpKernel {
     int64 max_num_entries = 0;
     std::vector<int64> num_indices(batch_size);
     for (int64 i = 0; i < batch_size; ++i) {
-      std::vector<string> parts = SplitV2(input_vec(i), sep);
+      std::vector<string> parts = SplitV2(input_vec(i), sep, maxsplit_);
       int64 n_entries = parts.size();
       num_indices[i] = n_entries;
       output_size += n_entries;
@@ -225,6 +245,9 @@ class StringSplitV2Op : public OpKernel {
       }
     }
   }
+
+ private:
+  int maxsplit_;
 };
 
 REGISTER_KERNEL_BUILDER(Name("StringSplit").Device(DEVICE_CPU), StringSplitOp);
diff --git a/tensorflow/core/ops/string_ops.cc b/tensorflow/core/ops/string_ops.cc
index d4d4a32236..7668ac0fda 100644
--- a/tensorflow/core/ops/string_ops.cc
+++ b/tensorflow/core/ops/string_ops.cc
@@ -140,6 +140,7 @@ REGISTER_OP("StringSplitV2")
     .Output("indices: int64")
     .Output("values: string")
     .Output("shape: int64")
+    .Attr("maxsplit: int = -1")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle unused;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &unused));
diff --git a/tensorflow/python/kernel_tests/string_split_op_test.py b/tensorflow/python/kernel_tests/string_split_op_test.py
index a5bd1b6ee0..e442ea2b8e 100644
--- a/tensorflow/python/kernel_tests/string_split_op_test.py
+++ b/tensorflow/python/kernel_tests/string_split_op_test.py
@@ -146,5 +146,66 @@ class StringSplitOpTest(test.TestCase):
       self.assertAllEqual(shape, [3, 1])
 
 
+class StringSplitV2OpTest(test.TestCase):
+
+  def testSplitV2(self):
+    strings = ["pigs on the wing", "animals"]
+
+    with self.test_session() as sess:
+      tokens = string_ops.string_split_v2(strings)
+      indices, values, shape = sess.run(tokens)
+      self.assertAllEqual(indices, [[0, 0], [0, 1], [0, 2], [0, 3], [1, 0]])
+      self.assertAllEqual(values, [b"pigs", b"on", b"the", b"wing", b"animals"])
+      self.assertAllEqual(shape, [2, 4])
+
+  def testSplitV2MultiCharSeparator(self):
+    # Match Python behavior:
+    # >>> '1<>2<>3'.split('<>')
+    # ['1', '2', '3']
+    # >>> "<><>4<>5<><>6<>".split("<>")
+    # ['', '', '4', '5', '', '6', '']
+    strings = ["1<>2<>3", "<><>4<>5<><>6<>"]
+
+    with self.test_session() as sess:
+      tokens = string_ops.string_split_v2(strings, sep="<>")
+      indices, values, shape = sess.run(tokens)
+      self.assertAllEqual(indices, [[0, 0], [0, 1], [0, 2],
+                                    [1, 0], [1, 1], [1, 2], [1, 3], [1, 4], [1, 5], [1, 6]])
+      self.assertAllEqual(values, [b"1", b"2", b"3", b"", b"", b"4", b"5", b"", b"6", b""])
+      self.assertAllEqual(shape, [2, 7])
+
+  def testSplitV2SimpleSeparator(self):
+    # Match Python behavior:
+    # >>> '1,2,3'.split(',')
+    # ['1', '2', '3']
+    # >>> '1,2,,3,'.split(',')
+    # ['1', '2', '', '3', '']
+    strings = ["1,2,3", "4,5,,6,"]
+
+    with self.test_session() as sess:
+      tokens = string_ops.string_split_v2(strings, sep=',')
+      indices, values, shape = sess.run(tokens)
+      self.assertAllEqual(indices, [[0, 0], [0, 1], [0, 2],
+                                    [1, 0], [1, 1], [1, 2], [1, 3], [1, 4]])
+      self.assertAllEqual(values, [b"1", b"2", b"3", b"4", b"5", b"", b"6", b""])
+      self.assertAllEqual(shape, [2, 5])
+
+  def testSplitV2EmptySeparator(self):
+    # Match Python behavior:
+    # >>> '1 2 3'.split()
+    # ['1', '2', '3']
+    # >>> '   1   2   3   '.split()
+    # ['1', '2', '3']
+    strings = ["1 2 3", "  4  5    6  "]
+
+    with self.test_session() as sess:
+      tokens = string_ops.string_split_v2(strings)
+      indices, values, shape = sess.run(tokens)
+      self.assertAllEqual(indices, [[0, 0], [0, 1], [0, 2],
+                                    [1, 0], [1, 1], [1, 2]])
+      self.assertAllEqual(values, [b"1", b"2", b"3", b"4", b"5", b"6"])
+      self.assertAllEqual(shape, [2, 3])
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/ops/string_ops.py b/tensorflow/python/ops/string_ops.py
index 62726434aa..961e63d04e 100644
--- a/tensorflow/python/ops/string_ops.py
+++ b/tensorflow/python/ops/string_ops.py
@@ -92,14 +92,14 @@ def string_split(source, delimiter=" ", skip_empty=True):  # pylint: disable=inv
   return sparse_tensor.SparseTensor(indices, values, shape)
 
 @tf_export("strings.split")
-def string_split_v2(source, sep=None):
+def string_split_v2(source, sep=None, maxsplit=-1):
   if sep is None:
     sep = ''
   sep = ops.convert_to_tensor(sep, dtype=dtypes.string)
   source = ops.convert_to_tensor(source, dtype=dtypes.string)
 
   indices, values, shape = gen_string_ops.string_split_v2(
-      source, sep=sep)
+      source, sep=sep, maxsplit=maxsplit)
   indices.set_shape([None, 2])
   values.set_shape([None])
   shape.set_shape([2])
-- 
GitLab


From 1c945cf4b7bdab30b084488f1f961a779abbd00e Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Tue, 29 May 2018 19:50:13 +0000
Subject: [PATCH 014/610] Update test case for maxsplit support with
 tf.strings.split

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 .../kernel_tests/string_split_op_test.py      | 32 +++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/tensorflow/python/kernel_tests/string_split_op_test.py b/tensorflow/python/kernel_tests/string_split_op_test.py
index e442ea2b8e..1295316c0a 100644
--- a/tensorflow/python/kernel_tests/string_split_op_test.py
+++ b/tensorflow/python/kernel_tests/string_split_op_test.py
@@ -206,6 +206,38 @@ class StringSplitV2OpTest(test.TestCase):
       self.assertAllEqual(values, [b"1", b"2", b"3", b"4", b"5", b"6"])
       self.assertAllEqual(shape, [2, 3])
 
+  def testSplitV2SimpleSeparatorMaxSplit(self):
+    # Match Python behavior:
+    # >>> '1,2,3'.split(',', maxsplit=1)
+    # ['1', '2,3']
+    # >>> '4,5,,6,'.split(',', maxsplit=1)
+    # ['4', '5,,6,']
+    strings = ["1,2,3", "4,5,,6,"]
+
+    with self.test_session() as sess:
+      tokens = string_ops.string_split_v2(strings, sep=',', maxsplit=1)
+      indices, values, shape = sess.run(tokens)
+      self.assertAllEqual(indices, [[0, 0], [0, 1],
+                                    [1, 0], [1, 1]])
+      self.assertAllEqual(values, [b"1", b"2,3", b"4", b"5,,6,"])
+      self.assertAllEqual(shape, [2, 2])
+
+  def testSplitV2EmptySeparatorMaxSplit(self):
+    # Match Python behavior:
+    # '1 2 3'.split(maxsplit=1)
+    # ['1', '2 3']
+    # >>> "  4  5    6  ".split(maxsplit=1)
+    # ['4', '5    6  ']
+    strings = ["1 2 3", "  4  5    6  "]
+
+    with self.test_session() as sess:
+      tokens = string_ops.string_split_v2(strings, maxsplit=1)
+      indices, values, shape = sess.run(tokens)
+      self.assertAllEqual(indices, [[0, 0], [0, 1],
+                                    [1, 0], [1, 1]])
+      self.assertAllEqual(values, [b"1", b"2 3", b"4", b"5    6  "])
+      self.assertAllEqual(shape, [2, 2])
+
 
 if __name__ == "__main__":
   test.main()
-- 
GitLab


From 003484dc049ac1df55912b53826d473d99819ee1 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Tue, 29 May 2018 19:55:41 +0000
Subject: [PATCH 015/610] Pylint fix

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 .../python/kernel_tests/string_split_op_test.py       | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/tensorflow/python/kernel_tests/string_split_op_test.py b/tensorflow/python/kernel_tests/string_split_op_test.py
index 1295316c0a..e20daccb28 100644
--- a/tensorflow/python/kernel_tests/string_split_op_test.py
+++ b/tensorflow/python/kernel_tests/string_split_op_test.py
@@ -169,9 +169,11 @@ class StringSplitV2OpTest(test.TestCase):
     with self.test_session() as sess:
       tokens = string_ops.string_split_v2(strings, sep="<>")
       indices, values, shape = sess.run(tokens)
-      self.assertAllEqual(indices, [[0, 0], [0, 1], [0, 2],
-                                    [1, 0], [1, 1], [1, 2], [1, 3], [1, 4], [1, 5], [1, 6]])
-      self.assertAllEqual(values, [b"1", b"2", b"3", b"", b"", b"4", b"5", b"", b"6", b""])
+      self.assertAllEqual(
+          indices, [[0, 0], [0, 1], [0, 2],
+                    [1, 0], [1, 1], [1, 2], [1, 3], [1, 4], [1, 5], [1, 6]])
+      self.assertAllEqual(values, [b"1", b"2", b"3",
+                                   b"", b"", b"4", b"5", b"", b"6", b""])
       self.assertAllEqual(shape, [2, 7])
 
   def testSplitV2SimpleSeparator(self):
@@ -187,7 +189,8 @@ class StringSplitV2OpTest(test.TestCase):
       indices, values, shape = sess.run(tokens)
       self.assertAllEqual(indices, [[0, 0], [0, 1], [0, 2],
                                     [1, 0], [1, 1], [1, 2], [1, 3], [1, 4]])
-      self.assertAllEqual(values, [b"1", b"2", b"3", b"4", b"5", b"", b"6", b""])
+      self.assertAllEqual(values, [b"1", b"2", b"3",
+                                   b"4", b"5", b"", b"6", b""])
       self.assertAllEqual(shape, [2, 5])
 
   def testSplitV2EmptySeparator(self):
-- 
GitLab


From a81adaf865d4ce5f0452db3f619df4fc23c5a327 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Tue, 29 May 2018 21:05:30 +0000
Subject: [PATCH 016/610] Update API defs

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 .../base_api/api_def_StringSplitV2.pbtxt      | 48 +++++++++++++++++++
 .../python_api/api_def_StringSplitV2.pbtxt    |  4 ++
 tensorflow/python/ops/string_ops.py           | 39 +++++++++++++++
 .../tools/api/golden/tensorflow.strings.pbtxt |  4 ++
 4 files changed, 95 insertions(+)
 create mode 100644 tensorflow/core/api_def/base_api/api_def_StringSplitV2.pbtxt
 create mode 100644 tensorflow/core/api_def/python_api/api_def_StringSplitV2.pbtxt

diff --git a/tensorflow/core/api_def/base_api/api_def_StringSplitV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_StringSplitV2.pbtxt
new file mode 100644
index 0000000000..6e13d0d049
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_StringSplitV2.pbtxt
@@ -0,0 +1,48 @@
+op {
+  graph_op_name: "StringSplitV2"
+  in_arg {
+    name: "input"
+    description: <<END
+`1-D` string `Tensor`, the strings to split.
+END
+  }
+  in_arg {
+    name: "sep"
+    description: <<END
+`0-D` string `Tensor`, the delimiter character.
+END
+  }
+  attr {
+    name: "maxsplit"
+    description: <<END
+An `int`. If `maxsplit > 0`, limit of the split of the result.
+END
+  }
+  summary: "Split elements of `source` based on `sep` into a `SparseTensor`."
+  description: <<END
+Let N be the size of source (typically N will be the batch size). Split each
+element of `source` based on `sep` and return a `SparseTensor`
+containing the split tokens. Empty tokens are ignored.
+
+For example, N = 2, source[0] is 'hello world' and source[1] is 'a b c',
+then the output will be
+```
+st.indices = [0, 0;
+              0, 1;
+              1, 0;
+              1, 1;
+              1, 2]
+st.shape = [2, 3]
+st.values = ['hello', 'world', 'a', 'b', 'c']
+```
+
+If `sep` is given, consecutive delimiters are not grouped together and are
+deemed to delimit empty strings. For example, source of `"1<>2<><>3"` and
+sep of `"<>"` returns `["1", "2", "", "3"]`. If `sep` is None or an empty
+string, consecutive whitespace are regarded as a single separator, and the
+result will contain no empty strings at the startor end if the string has
+leading or trailing whitespace.
+
+Note that the above mentioned behavior matches python's str.split.
+END
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_StringSplitV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_StringSplitV2.pbtxt
new file mode 100644
index 0000000000..0e8576fb01
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_StringSplitV2.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "StringSplitV2"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/python/ops/string_ops.py b/tensorflow/python/ops/string_ops.py
index 961e63d04e..0280c89c10 100644
--- a/tensorflow/python/ops/string_ops.py
+++ b/tensorflow/python/ops/string_ops.py
@@ -93,6 +93,45 @@ def string_split(source, delimiter=" ", skip_empty=True):  # pylint: disable=inv
 
 @tf_export("strings.split")
 def string_split_v2(source, sep=None, maxsplit=-1):
+  """Split elements of `source` based on `sep` into a `SparseTensor`.
+
+  Let N be the size of source (typically N will be the batch size). Split each
+  element of `source` based on `sep` and return a `SparseTensor`
+  containing the split tokens. Empty tokens are ignored.
+
+  For example, N = 2, source[0] is 'hello world' and source[1] is 'a b c',
+  then the output will be
+
+  st.indices = [0, 0;
+                0, 1;
+                1, 0;
+                1, 1;
+                1, 2]
+  st.shape = [2, 3]
+  st.values = ['hello', 'world', 'a', 'b', 'c']
+
+  If `sep` is given, consecutive delimiters are not grouped together and are
+  deemed to delimit empty strings. For example, source of `"1<>2<><>3"` and
+  sep of `"<>"` returns `["1", "2", "", "3"]`. If `sep` is None or an empty
+  string, consecutive whitespace are regarded as a single separator, and the
+  result will contain no empty strings at the startor end if the string has
+  leading or trailing whitespace.
+
+  Note that the above mentioned behavior matches python's str.split.
+
+  Args:
+    source: `1-D` string `Tensor`, the strings to split.
+    sep: `0-D` string `Tensor`, the delimiter character.
+    maxsplit: An `int`. If `maxsplit > 0`, limit of the split of the result.
+
+  Raises:
+    ValueError: If sep is not a string.
+
+  Returns:
+    A `SparseTensor` of rank `2`, the strings split according to the delimiter.
+    The first column of the indices corresponds to the row in `source` and the
+    second column corresponds to the index of the split component in this row.
+  """
   if sep is None:
     sep = ''
   sep = ops.convert_to_tensor(sep, dtype=dtypes.string)
diff --git a/tensorflow/tools/api/golden/tensorflow.strings.pbtxt b/tensorflow/tools/api/golden/tensorflow.strings.pbtxt
index a3fbe95bba..b641c39feb 100644
--- a/tensorflow/tools/api/golden/tensorflow.strings.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.strings.pbtxt
@@ -4,4 +4,8 @@ tf_module {
     name: "regex_full_match"
     argspec: "args=[\'input\', \'pattern\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "split"
+    argspec: "args=[\'source\', \'sep\', \'maxsplit\'], varargs=None, keywords=None, defaults=[\'None\', \'-1\'], "
+  }
 }
-- 
GitLab


From 4a1d1c8413a3752af7dc91a7128e202660b0f05c Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Mon, 21 May 2018 14:58:23 +0000
Subject: [PATCH 017/610] Fix mismatch of shape restriction in
 DrawBoundingBoxes

In the kernel of DrawBoundingBoxes, the shape of the input
images should be 4-D. Though in the shape function,
at the end `UnchangedShapeWithRankAtLeast(c, 3)` was used instead
(at the beginning of the shape function the validation is
`WithRank(c->input(0), 4, &images)` which is correct).

This fix addresses the discrepancy by changing to `UnchangedShape`.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/core/ops/image_ops.cc | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tensorflow/core/ops/image_ops.cc b/tensorflow/core/ops/image_ops.cc
index d949e70c66..87f4991134 100644
--- a/tensorflow/core/ops/image_ops.cc
+++ b/tensorflow/core/ops/image_ops.cc
@@ -454,7 +454,9 @@ REGISTER_OP("DrawBoundingBoxes")
       DimensionHandle unused;
       TF_RETURN_IF_ERROR(c->WithValue(c->Dim(boxes, 2), 4, &unused));
 
-      return shape_inference::UnchangedShapeWithRankAtLeast(c, 3);
+      // The rank of the input image (rank = 4) has already been restricted
+      // above, and the output is of the same shape as the input.
+      return shape_inference::UnchangedShape(c);
     });
 
 // --------------------------------------------------------------------------
-- 
GitLab


From d30df026d93948c1556cdf339f0583f80e80d23f Mon Sep 17 00:00:00 2001
From: ctiijima <ctiijima@us.ibm.com>
Date: Tue, 29 May 2018 14:15:27 -0700
Subject: [PATCH 018/610] Fix redundancy in RELEASE.md

---
 RELEASE.md | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/RELEASE.md b/RELEASE.md
index 84d9d52868..27f73b7fc6 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -404,14 +404,6 @@ answered questions, and were part of inspiring discussions.
 
 # Release 1.4.0
 
-## Major Features And Improvements
-* `tf.keras` is now part of the core TensorFlow API.
-* [`tf.data`](http://tensorflow.org/programmers_guide/datasets) is now part of
-  the core TensorFlow API.
-  * The API is now subject to backwards compatibility guarantees.
-
-# Release 1.4.0
-
 ## Major Features And Improvements
 * `tf.keras` is now part of the core TensorFlow API.
 * [`tf.data`](http://tensorflow.org/programmers_guide/datasets) is now part of
-- 
GitLab


From c835bd4f76abbbeb0c05a5e806c3e4b418582f06 Mon Sep 17 00:00:00 2001
From: Rachel Lim <rachelim@google.com>
Date: Tue, 29 May 2018 14:22:18 -0700
Subject: [PATCH 019/610] [tf.data] better benchmarking code in tests for
 measuring improvements to csv parsing

PiperOrigin-RevId: 198457501
---
 .../kernel_tests/csv_dataset_op_test.py       | 71 +++++++++++++------
 1 file changed, 49 insertions(+), 22 deletions(-)

diff --git a/tensorflow/contrib/data/python/kernel_tests/csv_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/csv_dataset_op_test.py
index f9f11a1555..8c138c7081 100644
--- a/tensorflow/contrib/data/python/kernel_tests/csv_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/csv_dataset_op_test.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 import os
+import string
 import tempfile
 import time
 
@@ -329,67 +330,93 @@ class CsvDatasetOpTest(test.TestCase):
 class CsvDatasetBenchmark(test.Benchmark):
   """Benchmarks for the various ways of creating a dataset from CSV files.
   """
+  FLOAT_VAL = '1.23456E12'
+  STR_VAL = string.ascii_letters * 10
 
-  def _setUp(self):
+  def _setUp(self, str_val):
     # Since this isn't test.TestCase, have to manually create a test dir
     gfile.MakeDirs(googletest.GetTempDir())
     self._temp_dir = tempfile.mkdtemp(dir=googletest.GetTempDir())
 
     self._num_cols = [4, 64, 256]
-    self._batch_size = 500
+    self._num_per_iter = 5000
     self._filenames = []
     for n in self._num_cols:
       fn = os.path.join(self._temp_dir, 'file%d.csv' % n)
       with open(fn, 'w') as f:
-        # Just write 10 rows and use `repeat`...
-        row = ','.join(['1.23456E12' for _ in range(n)])
-        f.write('\n'.join([row for _ in range(10)]))
+        # Just write 100 rows and use `repeat`... Assumes the cost
+        # of creating an iterator is not significant
+        row = ','.join([str_val for _ in range(n)])
+        f.write('\n'.join([row for _ in range(100)]))
       self._filenames.append(fn)
 
   def _tearDown(self):
     gfile.DeleteRecursively(self._temp_dir)
 
   def _runBenchmark(self, dataset, num_cols, prefix):
-    next_element = dataset.make_one_shot_iterator().get_next()
-    with session.Session() as sess:
-      for _ in range(5):
-        sess.run(next_element)
-      deltas = []
-      for _ in range(10):
+    dataset = dataset.skip(self._num_per_iter - 1)
+    deltas = []
+    for _ in range(10):
+      next_element = dataset.make_one_shot_iterator().get_next()
+      with session.Session() as sess:
         start = time.time()
+        # NOTE: This depends on the underlying implementation of skip, to have
+        # the net effect of calling `GetNext` num_per_iter times on the
+        # input dataset. We do it this way (instead of a python for loop, or
+        # batching N inputs in one iter) so that the overhead from session.run
+        # or batch doesn't dominate. If we eventually optimize skip, this has
+        # to change.
         sess.run(next_element)
         end = time.time()
-        deltas.append(end - start)
-    median_wall_time = np.median(deltas) / 100
+      deltas.append(end - start)
+    # Median wall time per CSV record read and decoded
+    median_wall_time = np.median(deltas) / self._num_per_iter
     print('%s num_cols: %d Median wall time: %f' % (prefix, num_cols,
                                                     median_wall_time))
     self.report_benchmark(
-        iters=self._batch_size,
+        iters=self._num_per_iter,
         wall_time=median_wall_time,
         name='%s_with_cols_%d' % (prefix, num_cols))
 
-  def benchmarkBatchThenMap(self):
-    self._setUp()
+  def benchmarkMapWithFloats(self):
+    self._setUp(self.FLOAT_VAL)
     for i in range(len(self._filenames)):
       num_cols = self._num_cols[i]
       kwargs = {'record_defaults': [[0.0]] * num_cols}
       dataset = core_readers.TextLineDataset(self._filenames[i]).repeat()
       dataset = dataset.map(lambda l: gen_parsing_ops.decode_csv(l, **kwargs))  # pylint: disable=cell-var-from-loop
-      dataset = dataset.batch(self._batch_size)
-      self._runBenchmark(dataset, num_cols, 'csv_map_then_batch')
+      self._runBenchmark(dataset, num_cols, 'csv_float_map_decode_csv')
+    self._tearDown()
+
+  def benchmarkMapWithStrings(self):
+    self._setUp(self.STR_VAL)
+    for i in range(len(self._filenames)):
+      num_cols = self._num_cols[i]
+      kwargs = {'record_defaults': [['']] * num_cols}
+      dataset = core_readers.TextLineDataset(self._filenames[i]).repeat()
+      dataset = dataset.map(lambda l: gen_parsing_ops.decode_csv(l, **kwargs))  # pylint: disable=cell-var-from-loop
+      self._runBenchmark(dataset, num_cols, 'csv_strings_map_decode_csv')
     self._tearDown()
 
-  def benchmarkCsvDataset(self):
-    self._setUp()
+  def benchmarkCsvDatasetWithFloats(self):
+    self._setUp(self.FLOAT_VAL)
     for i in range(len(self._filenames)):
       num_cols = self._num_cols[i]
       kwargs = {'record_defaults': [[0.0]] * num_cols}
       dataset = core_readers.TextLineDataset(self._filenames[i]).repeat()
       dataset = readers.CsvDataset(self._filenames[i], **kwargs).repeat()  # pylint: disable=cell-var-from-loop
-      dataset = dataset.batch(self._batch_size)
-      self._runBenchmark(dataset, num_cols, 'csv_fused_dataset')
+      self._runBenchmark(dataset, num_cols, 'csv_float_fused_dataset')
     self._tearDown()
 
+  def benchmarkCsvDatasetWithStrings(self):
+    self._setUp(self.STR_VAL)
+    for i in range(len(self._filenames)):
+      num_cols = self._num_cols[i]
+      kwargs = {'record_defaults': [['']] * num_cols}
+      dataset = core_readers.TextLineDataset(self._filenames[i]).repeat()
+      dataset = readers.CsvDataset(self._filenames[i], **kwargs).repeat()  # pylint: disable=cell-var-from-loop
+      self._runBenchmark(dataset, num_cols, 'csv_strings_fused_dataset')
+    self._tearDown()
 
 if __name__ == '__main__':
   test.main()
-- 
GitLab


From 8acd75a151ce4bee08afe2bcaebe36489b6140fb Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 29 May 2018 14:28:59 -0700
Subject: [PATCH 020/610] In TPUEstimator.export_savedmodel(), if saving TPU
 metagraph fails, issue a warning instead so that the user can still use the CPU
 metagraph.

PiperOrigin-RevId: 198458571
---
 .../contrib/tpu/python/tpu/tpu_estimator.py   | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
index f27375637a..3ea06fdeb5 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
@@ -1968,13 +1968,18 @@ class TPUEstimator(estimator_lib.Estimator):
                              input_receiver_fn_map[mode]}
     export_tags = [tag_constants.SERVING, tag_constants.TPU]
     mode = _REWRITE_FOR_INFERENCE_MODE
-    super(TPUEstimator, self)._add_meta_graph_for_mode(builder,
-                                                       input_receiver_fn_map,
-                                                       checkpoint_path,
-                                                       strip_default_attrs,
-                                                       save_variables=False,
-                                                       mode=mode,
-                                                       export_tags=export_tags)
+    try:
+      (super(TPUEstimator, self).
+       _add_meta_graph_for_mode(builder,
+                                input_receiver_fn_map,
+                                checkpoint_path,
+                                strip_default_attrs,
+                                save_variables=False,
+                                mode=mode,
+                                export_tags=export_tags))
+    except Exception as error:  # pylint: disable=broad-except
+      logging.warning('Saving meta graph for TPU failed: {}.'
+                      .format(str(error)))
 
   def _call_model_fn(self, features, labels, mode, config):
     if mode == _REWRITE_FOR_INFERENCE_MODE:
-- 
GitLab


From 8e0811dd1f82bd2207d3b639acaa618942ddec95 Mon Sep 17 00:00:00 2001
From: Rohan Jain <rohanj@google.com>
Date: Tue, 29 May 2018 15:31:04 -0700
Subject: [PATCH 021/610] Adding a check in eager metrics to make sure that the
 shapes of labels and predictions are exactly the same. The issue is that
 math_ops.equal would do broadcasting and so even if the shapes weren't
 entirely equal it'll produce an output which would be incorrect rather than
 reporting an error.

PiperOrigin-RevId: 198468251
---
 tensorflow/contrib/eager/python/metrics_impl.py | 4 ++++
 tensorflow/contrib/eager/python/metrics_test.py | 6 ++++++
 2 files changed, 10 insertions(+)

diff --git a/tensorflow/contrib/eager/python/metrics_impl.py b/tensorflow/contrib/eager/python/metrics_impl.py
index 1ae6415d5e..c947ed9dcc 100644
--- a/tensorflow/contrib/eager/python/metrics_impl.py
+++ b/tensorflow/contrib/eager/python/metrics_impl.py
@@ -25,6 +25,7 @@ from tensorflow.python.eager import function
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
@@ -367,6 +368,9 @@ class Accuracy(Mean):
     Returns:
       The arguments, for easy chaining.
     """
+    check_ops.assert_equal(
+        array_ops.shape(labels), array_ops.shape(predictions),
+        message="Shapes of labels and predictions are unequal")
     matches = math_ops.equal(labels, predictions)
     matches = math_ops.cast(matches, dtypes.float64)
     super(Accuracy, self).call(matches, weights=weights)
diff --git a/tensorflow/contrib/eager/python/metrics_test.py b/tensorflow/contrib/eager/python/metrics_test.py
index 98a98a8d35..02ee054875 100644
--- a/tensorflow/contrib/eager/python/metrics_test.py
+++ b/tensorflow/contrib/eager/python/metrics_test.py
@@ -26,6 +26,7 @@ from tensorflow.contrib.summary import summary_test_util
 from tensorflow.python.eager import context
 from tensorflow.python.eager import test
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
@@ -117,6 +118,11 @@ class MetricsTest(test.TestCase):
     self.assertEqual(dtypes.float64, m.dtype)
     self.assertEqual(dtypes.float64, m.result().dtype)
 
+  def testAccuracyDifferentShapes(self):
+    m = metrics.Accuracy()
+    with self.assertRaises(errors.InvalidArgumentError):
+      m([[0], [0]], [0, 1])
+
   def testWeightedAccuracy(self):
     m = metrics.Accuracy()
     # 1 correct, total weight of 2
-- 
GitLab


From a176f8a5176527a61f32d48ee602093a97336fc5 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 29 May 2018 15:56:42 -0700
Subject: [PATCH 022/610] Streaming trace viewer needs to filter by host.

PiperOrigin-RevId: 198471853
---
 tensorflow/contrib/tpu/profiler/tpu_profiler_analysis.proto | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tensorflow/contrib/tpu/profiler/tpu_profiler_analysis.proto b/tensorflow/contrib/tpu/profiler/tpu_profiler_analysis.proto
index 8b0bbde98e..d3c34bfd49 100644
--- a/tensorflow/contrib/tpu/profiler/tpu_profiler_analysis.proto
+++ b/tensorflow/contrib/tpu/profiler/tpu_profiler_analysis.proto
@@ -38,6 +38,9 @@ message EnumProfileSessionsAndToolsResponse {
 message ProfileSessionDataRequest {
   string repository_root = 1;
   string session_id = 2;
+  // Which host the data is associated. if empty, data from all hosts are
+  // aggregated.
+  string host_name = 5;
   // Which tool
   string tool_name = 3;
   // Tool's specific parameters. e.g. TraceViewer's viewport etc
-- 
GitLab


From e02106688578e8511fc767020e6f928ec65d5d73 Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Tue, 29 May 2018 16:22:22 -0700
Subject: [PATCH 023/610] Add microbenchmarks for the executor.

PiperOrigin-RevId: 198475385
---
 .../core/common_runtime/executor_test.cc      | 69 +++++++++++++++++++
 1 file changed, 69 insertions(+)

diff --git a/tensorflow/core/common_runtime/executor_test.cc b/tensorflow/core/common_runtime/executor_test.cc
index e34224205b..8cb1567852 100644
--- a/tensorflow/core/common_runtime/executor_test.cc
+++ b/tensorflow/core/common_runtime/executor_test.cc
@@ -410,4 +410,73 @@ TEST_F(ExecutorTest, RecvInvalidRefDtype) {
   rendez->Unref();
 }
 
+// Create a graph that is 'depth' deep. At each level, fan-in and fan-out a
+// maximum of 'width' nodes. All nodes are no-ops and all dependencies are
+// control dependencies.
+static void BM_executor(int iters, int width, int depth) {
+#ifdef PLATFORM_GOOGLE
+  BenchmarkUseRealTime();
+#endif  // PLATFORM_GOOGLE
+  Graph* g = new Graph(OpRegistry::Global());
+  random::PhiloxRandom philox(1729, 17);
+  random::SimplePhilox rand(&philox);
+  uint64 cur = 0;
+  uint32 r = 1 + rand.Rand32() % width;
+  std::vector<Node*> ready_nodes;
+  for (int i = 0; i < r; ++i) {
+    ready_nodes.push_back(test::graph::NoOp(g, {}));
+    ++cur;
+  }
+  for (int i = 0; i < depth; ++i) {
+    std::random_shuffle(ready_nodes.begin(), ready_nodes.end());
+    r = 1 + rand.Rand32() % (ready_nodes.size());
+    std::vector<Node*> control_inputs;
+    for (int j = 0; j < r; ++j) {
+      control_inputs.push_back(ready_nodes.back());
+      ready_nodes.pop_back();
+    }
+    Node* n = test::graph::NoOp(g, control_inputs);
+    ++cur;
+    r = 1 + rand.Rand32() % width;
+    for (int j = 0; j < r; ++j) {
+      ready_nodes.push_back(test::graph::NoOp(g, {n}));
+      ++cur;
+    }
+  }
+#ifdef PLATFORM_GOOGLE
+  SetBenchmarkLabel(strings::StrCat("Nodes = ", cur));
+  SetBenchmarkItemsProcessed(cur * static_cast<int64>(iters));
+#endif  // PLATFORM_GOOGLE
+  test::Benchmark("cpu", g).Run(iters);
+}
+
+// Tall skinny graphs
+BENCHMARK(BM_executor)->ArgPair(16, 1024);
+BENCHMARK(BM_executor)->ArgPair(32, 8192);
+
+// Short fat graphs
+BENCHMARK(BM_executor)->ArgPair(1024, 16);
+BENCHMARK(BM_executor)->ArgPair(8192, 32);
+
+// Tall fat graph
+BENCHMARK(BM_executor)->ArgPair(1024, 1024);
+
+static void BM_FeedInputFetchOutput(int iters) {
+  Graph* g = new Graph(OpRegistry::Global());
+  // z = x + y: x and y are provided as benchmark inputs.  z is the
+  // output of the benchmark.  Conceptually, the caller is "a", the
+  // benchmark is "b".
+  Node* x = test::graph::Recv(g, "x", "float", "a", 1, "b");
+  Node* y = test::graph::Recv(g, "y", "float", "a", 1, "b");
+  Node* sum = test::graph::Add(g, x, y);
+  Node* z = test::graph::Send(g, sum, "z", "b", 1, "a");
+  Tensor val(DT_FLOAT, TensorShape({}));
+  val.scalar<float>()() = 3.14;
+#ifdef PLATFORM_GOOGLE
+  SetBenchmarkItemsProcessed(static_cast<int64>(iters));
+#endif  // PLATFORM_GOOGLE
+  test::Benchmark("cpu", g).RunWithArgs({{x, val}, {y, val}}, {z}, iters);
+}
+BENCHMARK(BM_FeedInputFetchOutput);
+
 }  // namespace tensorflow
-- 
GitLab


From e9aeea1d326d8a55fa62306862a450231a874597 Mon Sep 17 00:00:00 2001
From: Yifei Feng <yifeif@google.com>
Date: Tue, 29 May 2018 16:22:46 -0700
Subject: [PATCH 024/610] Update setup.py with project description and
 development status.

PiperOrigin-RevId: 198475440
---
 tensorflow/tools/pip_package/setup.py | 36 +++++++++++++--------------
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py
index 319878e1b5..70e6662763 100644
--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@@ -12,6 +12,18 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
+"""TensorFlow is an open source machine learning framework for everyone.
+
+TensorFlow is an open source software library for high performance numerical
+computation. Its flexible architecture allows easy deployment of computation
+across a variety of platforms (CPUs, GPUs, TPUs), and from desktops to clusters
+of servers to mobile and edge devices.
+
+Originally developed by researchers and engineers from the Google Brain team
+within Google's AI organization, it comes with strong support for machine
+learning and deep learning and the flexible numerical computation core is used
+across many other scientific domains.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -28,26 +40,13 @@ from setuptools import setup
 from setuptools.command.install import install as InstallCommandBase
 from setuptools.dist import Distribution
 
+DOCLINES = __doc__.split('\n')
+
 # This version string is semver compatible, but incompatible with pip.
 # For pip, we will remove all '-' characters from this string, and use the
 # result for pip.
 _VERSION = '1.8.0'
 
-_SHORT_DESCRIPTION = ('TensorFlow is an open source machine learning framework '
-                      'for everyone.')
-
-_LONG_DESCRIPTION = ('TensorFlow is an open source software library for high '
-                     'performance numerical computation. Its flexible '
-                     'architecture allows easy deployment of computation across'
-                     ' a variety of platforms (CPUs, GPUs, TPUs), and from '
-                     'desktops to clusters of servers to mobile and edge '
-                     'devices. Originally developed by researchers and '
-                     'engineers from the Google Brain team within Google\'s AI '
-                     'organization, it comes with strong support for machine '
-                     'learning and deep learning and the flexible numerical '
-                     'computation core is used across many other scientific '
-                     'domains.')
-
 REQUIRED_PACKAGES = [
     'absl-py >= 0.1.6',
     'astor >= 0.6.0',
@@ -229,9 +228,10 @@ headers = (list(find_files('*.h', 'tensorflow/core')) +
 setup(
     name=project_name,
     version=_VERSION.replace('-', ''),
-    description=_SHORT_DESCRIPTION,
-    long_description=_LONG_DESCRIPTION,
+    description=DOCLINES[0],
+    long_description='\n'.join(DOCLINES[2:]),
     url='https://www.tensorflow.org/',
+    download_url='https://github.com/tensorflow/tensorflow/tags',
     author='Google Inc.',
     author_email='opensource@google.com',
     # Contained modules and scripts.
@@ -257,7 +257,7 @@ setup(
     },
     # PyPI package information.
     classifiers=[
-        'Development Status :: 4 - Beta',
+        'Development Status :: 5 - Production/Stable',
         'Intended Audience :: Developers',
         'Intended Audience :: Education',
         'Intended Audience :: Science/Research',
-- 
GitLab


From e47996d8964f13bebe33ef863bb4f116ee789ac3 Mon Sep 17 00:00:00 2001
From: Jianwei Xie <xiejw@google.com>
Date: Tue, 29 May 2018 16:33:25 -0700
Subject: [PATCH 025/610] Wraps the FinalOp execution with a user-friendly error
 message.

PiperOrigin-RevId: 198476911
---
 .../training/basic_session_run_hooks.py       | 22 +++++++++++++++++--
 .../training/basic_session_run_hooks_test.py  | 22 +++++++++++++++++++
 2 files changed, 42 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/training/basic_session_run_hooks.py b/tensorflow/python/training/basic_session_run_hooks.py
index 9b40817f55..b0dd188db1 100644
--- a/tensorflow/python/training/basic_session_run_hooks.py
+++ b/tensorflow/python/training/basic_session_run_hooks.py
@@ -28,6 +28,7 @@ from tensorflow.core.framework.summary_pb2 import Summary
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.core.util.event_pb2 import SessionLog
 from tensorflow.python.client import timeline
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import meta_graph
 from tensorflow.python.framework import ops
 from tensorflow.python.platform import gfile
@@ -818,8 +819,25 @@ class FinalOpsHook(session_run_hook.SessionRunHook):
 
   def end(self, session):
     if self._final_ops is not None:
-      self._final_ops_values = session.run(self._final_ops,
-                                           feed_dict=self._final_ops_feed_dict)
+      try:
+        self._final_ops_values = session.run(
+            self._final_ops, feed_dict=self._final_ops_feed_dict)
+      except (errors.OutOfRangeError, StopIteration) as e:
+        logging.warning(
+            "An OutOfRangeError or StopIteration exception is raised by the "
+            "code in FinalOpsHook. This typically means the Ops running by the "
+            "FinalOpsHook have a dependency back to some input source, which "
+            "should not happen. For example, for metrics in "
+            "tf.estimator.Estimator, all metrics functions return two Ops: "
+            "`value_op` and  `update_op`. Estimator.evaluate calls the "
+            "`update_op` for each batch of the data in input source and, once "
+            "it is exhausted, it call the `value_op` to get the metric values. "
+            "The `value_op` here should have dependency back to variables "
+            "reading only, rather than reading another batch from input. "
+            "Otherwise, the `value_op`, executed by `FinalOpsHook`, triggers "
+            "another data reading, which ends OutOfRangeError/StopIteration. "
+            "Please fix that.")
+        raise e
 
 
 @tf_export("train.FeedFnHook")
diff --git a/tensorflow/python/training/basic_session_run_hooks_test.py b/tensorflow/python/training/basic_session_run_hooks_test.py
index 21c584f2ee..b49a871a56 100644
--- a/tensorflow/python/training/basic_session_run_hooks_test.py
+++ b/tensorflow/python/training/basic_session_run_hooks_test.py
@@ -29,8 +29,10 @@ from tensorflow.contrib.framework.python.framework import checkpoint_utils
 from tensorflow.contrib.framework.python.ops import variables
 from tensorflow.contrib.testing.python.framework import fake_summary_writer
 from tensorflow.python.client import session as session_lib
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import meta_graph
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
@@ -1328,6 +1330,26 @@ class FinalOpsHookTest(test.TestCase):
         self.assertListEqual(expected_values,
                              hook.final_ops_values.tolist())
 
+  def test_final_ops_triggers_out_of_range_error(self):
+    with ops.Graph().as_default():
+      dataset = dataset_ops.Dataset.range(1)
+      iterator = dataset.make_one_shot_iterator()
+      read_ops = iterator.get_next()
+      final_ops = read_ops
+
+      hook = basic_session_run_hooks.FinalOpsHook(final_ops)
+      hook.begin()
+
+      with session_lib.Session() as session:
+        session.run(read_ops)
+        with test.mock.patch.object(tf_logging, 'warning') as mock_log:
+          with self.assertRaisesRegexp(errors.OutOfRangeError,
+                                       'End of sequence'):
+            hook.end(session)
+          self.assertRegexpMatches(
+              str(mock_log.call_args),
+              'dependency back to some input source')
+
   def test_final_ops_with_dictionary(self):
     with ops.Graph().as_default():
       expected_values = [4, -3]
-- 
GitLab


From 99ef7181786b4bc471b10582fdab21993bda152f Mon Sep 17 00:00:00 2001
From: Russell Power <power@google.com>
Date: Tue, 29 May 2018 16:36:16 -0700
Subject: [PATCH 026/610] Adjust TPUEstimator timeout for worker shutdown to 60
 seconds.

PiperOrigin-RevId: 198477309
---
 tensorflow/contrib/tpu/python/tpu/tpu_estimator.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
index 3ea06fdeb5..aea9949290 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
@@ -2228,11 +2228,11 @@ class TPUEstimator(estimator_lib.Estimator):
           if shutdown_mode:
             if shutdown_mode == 'shutdown_worker':
               finalizer_hooks = [
-                  session_support.ShutdownLameWorkers(timeout_ms=1000),
+                  session_support.ShutdownLameWorkers(timeout_ms=60*1000),
               ]
             elif shutdown_mode == 'shutdown_computation':
               finalizer_hooks = [
-                  session_support.RestartComputation(timeout_ms=1000),
+                  session_support.RestartComputation(timeout_ms=60*1000),
               ]
             else:
               raise ValueError('Unknown TF_TPU_GRACEFUL_SHUTDOWN_MODE "%s"' %
-- 
GitLab


From f3b20d8270c14302cb0734dfee806a022bcd5084 Mon Sep 17 00:00:00 2001
From: Justin Lebar <jlebar@google.com>
Date: Tue, 29 May 2018 16:41:00 -0700
Subject: [PATCH 027/610] Automated g4 rollback of changelist 198137414

PiperOrigin-RevId: 198477942
---
 tensorflow/compiler/xla/literal_comparison.cc | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/tensorflow/compiler/xla/literal_comparison.cc b/tensorflow/compiler/xla/literal_comparison.cc
index a588f4a03d..bf9679cafe 100644
--- a/tensorflow/compiler/xla/literal_comparison.cc
+++ b/tensorflow/compiler/xla/literal_comparison.cc
@@ -317,15 +317,7 @@ class NearComparator {
       rel_error = std::numeric_limits<float>::infinity();
     } else {
       abs_error = FpAbsoluteValue(actual - expected);
-      // If the expected result is exactly zero, don't compute relative error;
-      // that's meaningless.
-      //
-      // TODO(b/80321728): Come up with a better way to handle this case.
-      if (expected == NativeT{}) {
-        rel_error = 0;
-      } else {
-        rel_error = abs_error / FpAbsoluteValue(expected);
-      }
+      rel_error = abs_error / FpAbsoluteValue(expected);
     }
     const bool is_abs_mismatch = abs_error > error_.abs;
     const bool is_rel_mismatch = rel_error > error_.rel;
-- 
GitLab


From 631cd48bb71fb1fd30fa8e5b4d3be228ab200017 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 29 May 2018 16:47:59 -0700
Subject: [PATCH 028/610] Fix documented numpy equivalent of
 matrix_triangular_solve.

PiperOrigin-RevId: 198478933
---
 .../core/api_def/base_api/api_def_MatrixTriangularSolve.pbtxt   | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/api_def/base_api/api_def_MatrixTriangularSolve.pbtxt b/tensorflow/core/api_def/base_api/api_def_MatrixTriangularSolve.pbtxt
index a2bfcdc66e..e90de74109 100644
--- a/tensorflow/core/api_def/base_api/api_def_MatrixTriangularSolve.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_MatrixTriangularSolve.pbtxt
@@ -32,7 +32,7 @@ Boolean indicating whether to solve with `matrix` or its (block-wise)
          adjoint.
 
 @compatibility(numpy)
-Equivalent to np.linalg.triangular_solve
+Equivalent to scipy.linalg.solve_triangular
 @end_compatibility
 END
   }
-- 
GitLab


From 5f9f3c73b7c2999ce4482a563a3659fd8d6b36a2 Mon Sep 17 00:00:00 2001
From: Francois Chollet <fchollet@google.com>
Date: Tue, 29 May 2018 16:57:17 -0700
Subject: [PATCH 029/610] Add tf.keras programmer's guide.

PiperOrigin-RevId: 198480159
---
 .../docs_src/programmers_guide/index.md       |  13 +-
 .../docs_src/programmers_guide/keras.md       | 715 ++++++++++++++++++
 .../docs_src/programmers_guide/leftnav_files  |   1 +
 3 files changed, 724 insertions(+), 5 deletions(-)
 create mode 100644 tensorflow/docs_src/programmers_guide/keras.md

diff --git a/tensorflow/docs_src/programmers_guide/index.md b/tensorflow/docs_src/programmers_guide/index.md
index 9ebfd39c56..0c2d4afb11 100644
--- a/tensorflow/docs_src/programmers_guide/index.md
+++ b/tensorflow/docs_src/programmers_guide/index.md
@@ -5,11 +5,14 @@ works. The units are as follows:
 
 ## High Level APIs
 
-  * @{$programmers_guide/eager}, which is the easiest way to use TensorFlow.
-  * @{$programmers_guide/estimators}, which introduces a high-level
-    TensorFlow API that greatly simplifies ML programming.
-  * @{$programmers_guide/datasets}, which explains how to
-    set up data pipelines to read data sets into your TensorFlow program.
+  * @{$programmers_guide/keras}, TensorFlow's high-level API for building and
+    training deep learning models.
+  * @{$programmers_guide/eager}, an API for writing TensorFlow code
+    imperatively, like you would use Numpy.
+  * @{$programmers_guide/estimators}, a high-level API that provides
+    fully-packaged models ready for large-scale training and production.
+  * @{$programmers_guide/datasets}, easy input pipelines to bring your data into
+    your TensorFlow program.
 
 ## Estimators
 
diff --git a/tensorflow/docs_src/programmers_guide/keras.md b/tensorflow/docs_src/programmers_guide/keras.md
new file mode 100644
index 0000000000..6a9df12a25
--- /dev/null
+++ b/tensorflow/docs_src/programmers_guide/keras.md
@@ -0,0 +1,715 @@
+# Keras
+
+## What's Keras?
+
+Keras is a high-level API specification for building and training deep learning
+models, suitable for fast prototyping, advanced research, and production.
+It offers three key advantages:
+
+- **User friendliness.** Keras follows best practices for reducing
+    cognitive load: it offers consistent & simple interfaces,
+    it minimizes the number of user actions required for common use cases,
+    and it provides clear and actionable feedback upon user error.
+- **Modularity and composability.** A Keras model is composed of
+    fully-configurable building blocks that can be plugged together
+    with as few restrictions as possible -- like Lego bricks.
+- **Easy extensibility.** You can easily write your own building blocks
+    (such as new layers, new loss functions, new models where you write
+    the forward pass from scratch). This allows for total expressiveness,
+    making Keras suitable for advanced research.
+
+
+## What's tf.keras?
+
+`tf.keras` is TensorFlow's implementation of the Keras API specification, that
+serves as the TensorFlow high-level API: it's how you build models in TensorFlow.
+`tf.keras` seamlessly integrates with the rest of the TensorFlow API
+(such as `tf.data` input pipelines), bringing you the full power and flexibility
+of TensorFlow through an easy-to-use interface.
+
+You can import `tf.keras` via:
+
+```python
+from tensorflow import keras
+```
+
+What follows is a quick introduction to the basics of `tf.keras`.
+
+
+## Table of contents
+
+- [Getting started: the Sequential model](#getting-started-the-sequential-model)
+- [Configuring layers](#configuring-layers)
+- [Configuring training](#configuring-training)
+- [Training and evaluation](#training-and-evaluation)
+- [Building advanced models: the functional API](#building-advanced-models-the-functional-api)
+- [Building fully-customizable research models: the Model subclassing API](#building-fully-customizable-research-models-the-model-subclassing-api)
+- [Callbacks](#callbacks)
+- [Saving and serialization](#saving-and-serialization)
+- [Developing custom layers](#developing-custom-layers)
+- [Eager execution](#eager-execution)
+- [Further reading](#further-reading)
+- [FAQ](#faq)
+
+
+---
+
+## Getting started: the Sequential model
+
+In `tf.keras`, you're assembling together **layers** to build **models**.
+A model is generally a graph of layers.
+The most common type of model is just a stack of layers: the `Sequential` class.
+
+Here's how to build a simple fully-connected network (multi-layer perceptron):
+
+```python
+from tensorflow import keras
+from tensorflow.keras import layers
+
+model = keras.Sequential()
+# This adds to the model a densely-connected layer with 64 units:
+model.add(layers.Dense(64, activation='relu'))
+# Another one:
+model.add(layers.Dense(64, activation='relu'))
+# This adds a softmax layer with 10 output units:
+model.add(layers.Dense(10, activation='softmax'))
+```
+
+---
+
+## Configuring layers
+
+Each layer may have unique constructor arguments, but some common arguments include:
+
+- `activation`: the activation function to be used.
+    It could be specified by name, as a string (for built-in functions)
+    or as a callable object. By default, no activation is applied.
+- `kernel_initializer` and `bias_initializer`: the initialization schemes to use
+    to create the layer's weights (kernel and bias).
+    Likewise, they may be passed either by name or by specifying a callable.
+    By default, the "Glorot uniform" initializer is used.
+- `kernel_regularizer` and `bias_regularizer`: the regularization schemes to
+    apply to the layer's weights (kernel and bias), such as L1
+    or L2 regularization. By default, no regularization is applied.
+
+
+### Examples
+
+```python
+import tensorflow as tf
+from tensorflow.keras.layers import Dense
+from tensorflow.keras import regularizers
+from tensorflow.keras import initializers
+
+# A sigmoid layer:
+Dense(64, activation='sigmoid')
+# Another way to define the same sigmoid layer:
+Dense(64, activation=tf.sigmoid)
+
+# A linear layer with L1 regularization of factor 0.01
+# applied to the kernel matrix:
+Dense(64, kernel_regularizer=regularizers.l1(0.01))
+# A linear layer with L2 regularization of factor 0.01
+# applied to the bias vector:
+Dense(64, bias_regularizer=regularizers.l2(0.01))
+
+# A linear layer with a kernel initialized to a random orthogonal matrix:
+Dense(64, kernel_initializer='orthogonal')
+# A linear layer with a bias vector initialized to 2.0s:
+Dense(64, bias_initializer=initializers.constant(2.0))
+```
+
+---
+
+## Configuring training
+
+Once your model looks good, configure its learning process by calling `compile`:
+
+```python
+import tensorflow as tf
+
+model.compile(optimizer=tf.train.AdamOptimizer(0.001),
+              loss='categorical_crossentropy',
+              metrics=['accuracy'])
+```
+
+There are three key arguments that you need to specify:
+
+- An `optimizer`: this object specifies the training procedure.
+    We recommend that you pass instances of optimizers from the `tf.train` module
+    (such as [`AdamOptimizer`](https://www.tensorflow.org/api_docs/python/tf/train/AdamOptimizer),
+    [`RMSPropOptimizer`](https://www.tensorflow.org/api_docs/python/tf/train/RMSPropOptimizer),
+    or [`GradientDescentOptimizer`](https://www.tensorflow.org/api_docs/python/tf/train/GradientDescentOptimizer)).
+- A `loss` function to minimize: this specifies the optimization objective.
+    Common choices include mean square error (`mse`), `categorical_crossentropy`
+    and `binary_crossentropy`. Loss functions may be specified by name
+    or by passing a callable (e.g. from the `tf.keras.losses` module).
+- Some `metrics` to monitor during training: again, you can pass these as either
+    string names or callables (e.g. from the `tf.keras.metrics` module).
+
+
+### Examples
+
+```python
+# Configures a model to do mean-squared error regression.
+model.compile(optimizer=tf.train.AdamOptimizer(0.01),
+              loss='mse',  # mean squared error
+              metrics=['mae'])  # mean absolute error
+```
+```python
+# Configures a model to do categorical classification.
+model.compile(optimizer=tf.train.RMSPropOptimizer(0.01),
+              loss=tf.keras.losses.categorical_crossentropy,
+              metrics=[tf.keras.metrics.categorical_accuracy])
+```
+
+---
+
+## Training and evaluation
+
+### From Numpy data
+
+When running locally on small datasets, the easiest way to do training and
+evaluation is to pass data to your model as Numpy arrays of inputs and targets.
+You can "fit" your model to some training data using the `model.fit()` method:
+
+```python
+import numpy as np
+
+data = np.random.random((1000, 32))
+targets = np.random.random((1000, 10))
+
+model.fit(data, targets, epochs=10, batch_size=32)
+```
+
+Here are some key arguments you can pass to the `fit` method:
+
+- `epochs`: Training is structured into **epochs**. An epoch is one iteration
+    over the entire input data (which is done in smaller batches).
+- `batch_size`: when passing Numpy data, the model will slice the data into
+    smaller batches and iterate over these batches during training.
+    This integer specifies the size of each batch
+    (the last batch may be smaller if the total number of samples is not
+    divisible by the batch size).
+- `validation_data`: when prototyping a model, you want to be able to quickly
+    monitor its performance on some validation data.
+    When you pass this argument (it expects a tuple of inputs and targets),
+    the model will display the loss and metrics in inference mode on the data
+    you passed, at the end of each epoch.
+
+Here's an example using `validation_data`:
+
+```python
+import numpy as np
+
+data = np.random.random((1000, 32))
+targets = np.random.random((1000, 10))
+
+val_data = np.random.random((100, 32))
+val_targets = np.random.random((100, 10))
+
+model.fit(data, targets, epochs=10, batch_size=32,
+          validation_data=(val_data, val_targets))
+```
+
+### From tf.data datasets
+
+When you need to scale to large datasets or multi-device training,
+training from Numpy arrays in memory will not be ideal.
+In such cases, you should use [the `tf.data` API](https://www.tensorflow.org/programmers_guide/datasets).
+You can pass a `tf.data.Dataset` instance to the `fit` method:
+
+```python
+import tensorflow as tf
+
+# Instantiates a toy dataset instance:
+dataset = tf.data.Dataset.from_tensor_slices((data, targets)).batch(32)
+
+# Don't forget to specify `steps_per_epoch` when calling `fit` on a dataset.
+model.fit(dataset, epochs=10, steps_per_epoch=30)
+```
+
+When doing so, the dataset itself will yield batches of data,
+so the model does not need to be passed `batch_size` information.
+Instead, the model needs to know for how many steps (or batches of data)
+it should run at each epoch.
+You specify this with the `steps_per_epoch` argument: it's the number of
+training steps the model will run before moving on to the next epoch.
+
+You can also pass datasets for validation:
+
+```python
+dataset = tf.data.Dataset.from_tensor_slices((data, targets)).batch(32)
+val_dataset = tf.data.Dataset.from_tensor_slices((val_data, val_targets)).batch(32)
+
+model.fit(dataset, epochs=10, steps_per_epoch=30, validation_data=val_dataset, validation_steps=3)
+```
+
+### Evaluate and predict
+
+In addition, you get access to the following methods
+(both with Numpy data and dataset instances):
+
+- `model.evaluate(x, y, batch_size=32)` or `model.evaluate(dataset, steps=30)`
+    will return the inference-mode loss and metrics for the data provided.
+- `model.predict(x, batch_size=32)` or `model.predict(dataset, steps=30)`
+    will return the output(s) of the last layer(s) in inference on the data
+    provided, as Numpy array(s).
+
+---
+
+## Building advanced models: the functional API
+
+The `Sequential` model cannot represent arbitrary models -- only simple stacks
+of layers. If you need to use more complex model topologies,
+such as multi-input models, multi-output models,
+models with the same layer called several times (shared layers),
+or models with non-sequential data flows (e.g. residual connections),
+you can use the 'functional API'.
+
+Here's how it works:
+
+- A layer instance is callable (on a tensor), and it returns a tensor.
+- Input tensor(s) and output tensor(s) can then be used to define a `Model` instance.
+- Such a model can be trained just like the `Sequential` model.
+
+Here's a basic example showing the same model we previously defined,
+built using the functional API:
+
+
+```python
+from tensorflow import keras
+from tensorflow.keras import layers
+
+# This returns a placeholder tensor:
+inputs = keras.Input(shape=(784,))
+
+# A layer instance is callable on a tensor, and returns a tensor.
+x = layers.Dense(64, activation='relu')(inputs)
+x = layers.Dense(64, activation='relu')(x)
+predictions = layers.Dense(10, activation='softmax')(x)
+
+# Instantiates the model given inputs and outputs.
+model = keras.Model(inputs=inputs, outputs=predictions)
+
+# The "compile" step specifies the training configuration.
+model.compile(optimizer='rmsprop',
+              loss='categorical_crossentropy',
+              metrics=['accuracy'])
+
+# Trains for 5 epochs.
+model.fit(data, labels, batch_size=32, epochs=5)
+```
+
+This API enables you to create models with multiple inputs and outputs,
+and to "share" layers across different inputs
+(i.e. to reuse the same instance multiple times).
+For examples of these use cases,
+please see [this guide to the functional API in Keras](https://keras.io/getting-started/functional-api-guide/).
+
+---
+
+## Building fully-customizable research models: the Model subclassing API
+
+Besides `Sequential` and the functional API, one last, more flexible way to
+define models is to directly subclass the `Model` class and define your own
+forward pass manually.
+
+In this API, you instantiate layers in `__init__` and set them as attributes of the
+class instance. Then you specify the forward pass in `call`.
+This API is particularly valuable when using TensorFlow with [eager execution](https://www.tensorflow.org/programmers_guide/eager),
+since eager execution allows you to write your forward pass in an
+imperative fashion (as if you were writing Numpy code, for instance).
+
+```python
+import tensorflow as tf
+from tensorflow import keras
+
+
+class MyModel(keras.Model):
+
+  def __init__(self, num_classes=2):
+    super(MyModel, self).__init__(name='my_model')
+    self.num_classes = num_classes
+    # Define your layers here.
+    self.dense_1 = keras.layers.Dense(32, activation='relu')
+    self.dense_2 = keras.layers.Dense(num_classes, activation='sigmoid')
+
+  def call(self, inputs):
+    # Define your forward pass here,
+    # using layers you previously defined (in `__init__`).
+    x = self.dense_1(inputs)
+    return self.dense_2(x)
+
+  def compute_output_shape(self, input_shape):
+    # You need to override this function if you want to use the subclassed model
+    # as part of a functional-style model.
+    # Otherwise, this method is optional.
+    shape = tf.TensorShape(input_shape).as_list()
+    shape[-1] = self.num_classes
+    return tf.TensorShape(shape)
+
+
+# Instantiates the subclassed model.
+model = MyModel(num_classes=2)
+
+# The "compile" step specifies the training configuration.
+model.compile(optimizer='rmsprop',
+              loss='categorical_crossentropy',
+              metrics=['accuracy'])
+
+# Trains for 5 epochs.
+model.fit(data, labels, batch_size=32, epochs=5)
+```
+
+**Remember:** use the right API for the right job.
+Using the `Model` subclassing API offers more flexibility,
+but at the cost of greater complexity and a larger potential user error surface.
+Prefer using the functional API when possible.
+
+---
+
+## Callbacks
+
+Callbacks are objects that you can pass to your model that customize and extend
+its behavior during training.
+There are callbacks for saving checkpoints of your model at regular intervals
+(`tf.keras.callbacks.ModelCheckpoint`),
+to dynamically change the learning rate (`tf.keras.callbacks.LearningRateScheduler`)
+or to interrupt training when validation performance has stopped improving
+(`tf.keras.callbacks.EarlyStopping`).
+You can also use a callback to monitor your model's behavior using
+[TensorBoard](https://www.tensorflow.org/programmers_guide/summaries_and_tensorboard)
+(`tf.keras.callbacks.TensorBoard`).
+You can also write your own custom callbacks.
+
+Different built-in callbacks are found in `tf.keras.callbacks`.
+You use them by passing a `Callback` instance to `fit`:
+
+```python
+from tensorflow import keras
+
+callbacks = [
+    # Interrupt training if `val_loss` stops improving for over 2 epochs
+    keras.callbacks.EarlyStopping(patience=2, monitor='val_loss'),
+    # Write TensorBoard logs to `./logs` directory
+    keras.callbacks.TensorBoard(log_dir='./logs')
+]
+model.fit(data, labels, batch_size=32, epochs=5, callbacks=callbacks)
+```
+
+---
+
+## Saving and serialization
+
+### Weights-only saving
+
+You can save the weight values of a model via `model.save_weights(filepath)`:
+
+```python
+# Saves weights to a SavedModel file.
+model.save_weights('my_model')
+
+# Restores the model's state
+# (this requires a model that has the same architecture).
+model.load_weights('my_model')
+```
+
+By default, this saves the weights in the TensorFlow
+[`SavedModel`](https://www.tensorflow.org/programmers_guide/saved_model) format.
+You could also save them in the Keras HDF5 format
+(which is the default in the multi-backend implementation of Keras):
+
+```python
+# Saves weights to a HDF5 file.
+model.save_weights('my_model.h5', format='h5')
+
+# Restores the model's state.
+model.load_weights('my_model.h5')
+```
+
+### Configuration-only saving (serialization)
+
+You can also save the model's configuration
+(its architecture, without any weight values),
+which allows you to recreate the same model later (freshly initialized) even if
+you don't have the code that defined it anymore.
+Two possible serialization formats are JSON and YAML:
+
+```python
+from tensorflow.keras import models
+
+# Serializes a model to JSON.
+json_string = model.to_json()
+# Recreates the model (freshly initialized).
+fresh_model = models.model_from_json(json_string)
+
+# Serializes a model to YAML.
+yaml_string = model.to_yaml()
+# Recreates the model.
+fresh_model = models.model_from_yaml(yaml_string)
+```
+
+Note that this feature is not available with subclassed models,
+because they are simply not serializable:
+their architecture is defined as Python code
+(the body of the `call` method of the model).
+
+### Whole-model saving
+
+Finally, you can also save a model wholesale, to a file that will contain
+the weight values, the model's configuration,
+and even the optimizer's configuration.
+This allows you to checkpoint a model and resume training later --
+from the exact same state -- even if you don't have access to the original code.
+
+```python
+from tensorflow.keras import models
+
+model.save('my_model.h5')
+
+# Recreates the exact same model, complete with weights and optimizer.
+model = models.load_model('my_model.h5')
+```
+
+---
+
+## Developing custom layers
+
+You can write your own custom layers by subclassing the class
+`tf.keras.layers.Layer`. You will need to implement the following three methods:
+
+- `build`: Creates the weights of the layer.
+    Weights should be added via the `add_weight` method.
+- `call`: Specifies the forward pass.
+- `compute_output_shape`: Specifies how to compute the output shape of the layer 
+    given the input shape.
+
+Optionally, you may also implement the method `get_config()` and the
+class method `from_config()` if you want your layer to be serializable.
+
+Here's a simple example of a custom layer that implements a `matmul`
+of an input with a kernel matrix:
+
+```python
+import tensorflow as tf
+from tensorflow.keras import layers
+
+class MyLayer(layers.Layer):
+
+    def __init__(self, output_dim, **kwargs):
+        self.output_dim = output_dim
+        super(MyLayer, self).__init__(**kwargs)
+
+    def build(self, input_shape):
+        # Create a trainable weight variable for this layer.
+        self.kernel = self.add_weight(name='kernel', 
+                                      shape=(input_shape[1], self.output_dim),
+                                      initializer='uniform',
+                                      trainable=True)
+        # Be sure to call this at the end
+        super(MyLayer, self).build(input_shape)
+
+    def call(self, inputs):
+        return tf.matmul(inputs, self.kernel)
+
+    def compute_output_shape(self, input_shape):
+        shape = tf.TensorShape(input_shape).as_list()
+        shape[-1] = self.output_dim
+        return tf.TensorShape(shape)
+
+    def get_config(self):
+        base_config = super(MyLayer, self).get_config()
+        return dict(base_config, output_dim=self.output_dim)
+
+    @classmethod
+    def from_config(cls, config):
+        return cls(**config)
+```
+
+---
+
+## Eager execution
+
+[Eager execution](https://www.tensorflow.org/programmers_guide/eager)
+is a way to write TensorFlow code imperatively.
+
+All three `tf.keras` model-building APIs
+(`Sequential`, the functional API `Model(inputs, outputs)`,
+and the subclassing API `MyModel(Model)`) are compatible with eager execution.
+When using `Sequential` or the functional API, it makes no difference to the
+user experience whether the model is executing eagerly or not.
+Eager execution is most beneficial when used with the `Model` subclassing API,
+or when prototyping a custom layer -- that is to say, in APIs that require you
+to *write a forward pass as code*, rather than in APIs that allow you to create
+models by assembling together existing layers.
+
+While the same training and evaluating APIs presented in this guide work
+as usual with eager execution, you can in addition
+write custom training loops using the eager `GradientTape`
+and define-by-run autodifferentiation:
+
+```python
+import tensorflow as tf
+from tensorflow.contrib import eager as tfe
+
+# This call begins the eager execution session.
+tf.enable_eager_execution()
+
+model = ...  # Defines a Keras model (we recommend Model subclassing in this case).
+dataset = ...  # Defines a `tf.data` dataset.
+
+optimizer = tf.train.AdamOptimizer(0.01)
+
+for data, labels in dataset:
+    # Runs the forward pass and loss computation under a `GradientTape` scope,
+    # which will record all operations in order to prepare for the backward pass.
+    with tfe.GradientTape() as tape:
+      predictions = model(data)
+      loss = loss_function(labels, predictions)
+
+    # Runs the backward pass manually using the operations recorded
+    # by the gradient tape.
+    grads = tape.gradient(loss, model.trainable_weights)
+    optimizer.apply_gradients(zip(grads, model.trainable_weights),
+                              global_step=tf.train.get_or_create_global_step())
+```
+
+---
+
+## Further reading
+
+### Documentation
+
+- [tf.keras documentation](https://www.tensorflow.org/api_docs/python/tf/keras)
+- [keras.io](https://keras.io/)
+
+### tf.keras tutorials and examples
+
+- [Fashion-MNIST with tf.Keras](https://medium.com/tensorflow/hello-deep-learning-fashion-mnist-with-keras-50fcff8cd74a)
+- [Predicting the price of wine with the Keras Functional API and TensorFlow](
+    https://medium.com/tensorflow/predicting-the-price-of-wine-with-the-keras-functional-api-and-tensorflow-a95d1c2c1b03)
+
+
+---
+
+## FAQ
+
+### What are the differences between tf.keras and the multi-backend Keras implementation?
+
+`tf.keras` includes first-class support for important TensorFlow-specific
+functionality not found in other Keras implementations, in particular:
+
+- Support for eager execution.
+- Support for the `tf.data` API.
+- Integration with the
+    [`tf.estimator` API](https://www.tensorflow.org/programmers_guide/estimators),
+    via `tf.keras.estimator.model_to_estimator`.
+
+In terms of API differences: `tf.keras` is a full implementation of the
+Keras API, so any code targeting the Keras API will run on `tf.keras`.
+However, keep in mind that:
+
+- The `tf.keras` API version in the latest TensorFlow release might not be the
+    same as the latest `keras` version from PyPI.
+    Check out `tf.keras.__version__` if in doubt.
+- In `tf.keras`, the default file format saved by `model.save_weights` is the
+    TensorFlow `SavedModel` format.
+    To use HDF5, you can pass the `format='h5'` argument.
+
+
+### What is the relationship between tf.keras and tf.estimator?
+
+The [`tf.estimator` API](https://www.tensorflow.org/programmers_guide/estimators)
+is a high-level TensorFlow API for training "estimator" models,
+in particular in distributed settings.
+This API targets industry use cases, such as distributed training
+on large datasets with a focus on eventually exporting a production model.
+
+If you have a `tf.keras` model that you would like to train with the
+`tf.estimator` API, you can convert your model to an `Estimator` object via the
+[`model_to_estimator` utility](https://www.tensorflow.org/programmers_guide/estimators#creating_estimators_from_keras_models):
+
+
+```python
+estimator = tf.keras.estimator.model_to_estimator(model)
+```
+
+When using `model_to_estimator`, enabling eager execution is helpful for
+developing and debugging your `input_fn`
+(as it allows you to easily print your data).
+
+
+### How can I run tf.keras models on multiple GPUs?
+
+You can run tf.keras models on multiple GPUs using the
+[`DistributionStrategy` API](https://www.tensorflow.org/versions/master/api_docs/python/tf/contrib/distribute/DistributionStrategy).
+The `DistributionStrategy` API allows you to distribute training on multiple GPUs
+with almost no changes to your existing code.
+
+Currently [`MirroredStrategy`](https://www.tensorflow.org/versions/master/api_docs/python/tf/contrib/distribute/MirroredStrategy)
+is the only supported strategy.
+`MirroredStrategy` allows you to do in-graph replication with synchronous
+training using all-reduce on a single machine.
+To use `DistributionStrategy` with a `tf.keras` model,
+you can use the `model_to_estimator` utility to convert a `tf.keras` model to
+an `Estimator` and then train the estimator.
+
+Here is a simple example of distributing a `tf.keras` model across multiple GPUs
+on a single machine.
+
+Let's first define a simple model:
+
+```python
+model = tf.keras.Sequential()
+model.add(tf.keras.layers.Dense(16, activation='relu', input_shape=(10,)))
+model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
+optimizer = tf.train.GradientDescentOptimizer(0.2)
+model.compile(loss='binary_crossentropy', optimizer=optimizer)
+model.summary()
+```
+
+Let's use `model_to_estimator` to create an `Estimator` instance from the
+`tf.keras` model defined above (`config` is the `RunConfig` created below).
+
+```python
+keras_estimator = tf.keras.estimator.model_to_estimator(
+    keras_model=model,
+    config=config,
+    model_dir='/tmp/model_dir')
+```
+
+We'll use the `tf.data.Dataset` API to define our input pipeline.
+Our `input_fn` returns a `tf.data.Dataset` object that we then use to distribute
+the data across multiple devices with each device processing
+a slice of the input batch.
+
+```python
+def input_fn():
+    x = np.random.random((1024, 10))
+    y = np.random.randint(2, size=(1024, 1))
+    x = tf.cast(x, tf.float32)
+    dataset = tf.data.Dataset.from_tensor_slices((x, y))
+    dataset = dataset.repeat(10)
+    dataset = dataset.batch(32)
+    return dataset
+```
+
+The next step is to create a `RunConfig` and set its `train_distribute`
+argument to a new `MirroredStrategy` instance.
+You can specify a list of devices or the `num_gpus` argument when creating
+a `MirroredStrategy` instance.
+Not specifying any arguments defaults to using all the available GPUs like we do
+in this example.
+
+```python
+strategy = tf.contrib.distribute.MirroredStrategy()
+config = tf.estimator.RunConfig(train_distribute=strategy)
+```
+
+Call train on the `Estimator` instance providing the `input_fn` and `steps`
+arguments as input:
+
+```python
+keras_estimator.train(input_fn=input_fn, steps=10)
+```
diff --git a/tensorflow/docs_src/programmers_guide/leftnav_files b/tensorflow/docs_src/programmers_guide/leftnav_files
index 331317446a..3bcf864e13 100644
--- a/tensorflow/docs_src/programmers_guide/leftnav_files
+++ b/tensorflow/docs_src/programmers_guide/leftnav_files
@@ -1,6 +1,7 @@
 index.md
 
 ### High Level APIs
+keras.md
 eager.md
 datasets.md
 
-- 
GitLab


From 79755d82a02526950ee4bd3fbc11d515308e76fd Mon Sep 17 00:00:00 2001
From: Jiri Simsa <jsimsa@google.com>
Date: Tue, 29 May 2018 17:08:59 -0700
Subject: [PATCH 030/610] Fixing a bug in `map_and_batch_fusion` and improving
 test coverage.

PiperOrigin-RevId: 198481898
---
 .../core/grappler/optimizers/data/BUILD       |   1 +
 .../optimizers/data/map_and_batch_fusion.cc   |  10 +-
 .../data/map_and_batch_fusion_test.cc         | 105 +++++++++++++-----
 3 files changed, 85 insertions(+), 31 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/data/BUILD b/tensorflow/core/grappler/optimizers/data/BUILD
index d3fe7df583..121de1e089 100644
--- a/tensorflow/core/grappler/optimizers/data/BUILD
+++ b/tensorflow/core/grappler/optimizers/data/BUILD
@@ -60,6 +60,7 @@ tf_cc_test(
     deps = [
         ":graph_utils",
         ":map_and_batch_fusion",
+        "//tensorflow/core:framework",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core/grappler:grappler_item",
diff --git a/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion.cc b/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion.cc
index 5b8df61c48..290326ab75 100644
--- a/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion.cc
+++ b/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion.cc
@@ -97,11 +97,13 @@ Status MapAndBatchFusion::Optimize(Cluster* cluster, const GrapplerItem& item,
     }
 
     // Set `f` and `Targuments` attributes.
-    new_node->mutable_attr()->insert(map_node->attr().begin(),
-                                     map_node->attr().end());
+    for (auto key : {"f", "Targuments"}) {
+      (*new_node->mutable_attr())[key] = map_node->attr().at(key);
+    }
     // Set `output_types` and `output_shapes` attributes.
-    new_node->mutable_attr()->insert(batch_node.attr().begin(),
-                                     batch_node.attr().end());
+    for (auto key : {"output_shapes", "output_types"}) {
+      (*new_node->mutable_attr())[key] = batch_node.attr().at(key);
+    }
 
     // Mark the `Map` and `Batch` nodes for removal.
     nodes_to_delete.insert(map_node->name());
diff --git a/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion_test.cc b/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion_test.cc
index 51e7f37e7e..8c7498dc5d 100644
--- a/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion_test.cc
+++ b/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/grappler/optimizers/data/map_and_batch_fusion.h"
 
+#include "tensorflow/core/framework/attr_value_util.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/optimizers/data/graph_utils.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
@@ -25,8 +26,6 @@ namespace grappler {
 namespace {
 
 TEST(MapAndBatchFusionTest, FuseMapAndBatchNodesIntoOne) {
-  std::vector<std::pair<string, AttrValue>> empty_attributes;
-
   GrapplerItem item;
   GraphDef *graph = &item.graph;
   NodeDef *start_node;
@@ -40,29 +39,48 @@ TEST(MapAndBatchFusionTest, FuseMapAndBatchNodesIntoOne) {
   range_inputs[0] = start_node->name();
   range_inputs[1] = stop_node->name();
   range_inputs[2] = step_node->name();
+  std::vector<std::pair<string, AttrValue>> range_attrs;
   NodeDef *range_node;
   TF_ASSERT_OK(graph_utils::AddNode("", "RangeDataset", range_inputs,
-                                    empty_attributes, graph, &range_node));
+                                    range_attrs, graph, &range_node));
   NodeDef *captured_input_node;
   TF_ASSERT_OK(graph_utils::AddScalarConstNode<StringPiece>(
       "hello", graph, &captured_input_node));
 
-  std::vector<string> map_inputs(2);
-  map_inputs[0] = range_node->name();
-  map_inputs[1] = captured_input_node->name();
   NodeDef *map_node;
-  TF_ASSERT_OK(graph_utils::AddNode("", "MapDataset", map_inputs,
-                                    empty_attributes, graph, &map_node));
+  {
+    std::vector<string> map_inputs(2);
+    map_inputs[0] = range_node->name();
+    map_inputs[1] = captured_input_node->name();
+    std::vector<std::pair<string, AttrValue>> map_attrs(2);
+    AttrValue f_attr;
+    SetAttrValue("f", &f_attr);
+    map_attrs[0] = std::make_pair("f", f_attr);
+    AttrValue args_attr;
+    SetAttrValue("Targuments", &args_attr);
+    map_attrs[1] = std::make_pair("Targuments", args_attr);
+    TF_ASSERT_OK(graph_utils::AddNode("", "MapDataset", map_inputs, map_attrs,
+                                      graph, &map_node));
+  }
 
   NodeDef *batch_size_node;
   TF_ASSERT_OK(
       graph_utils::AddScalarConstNode<int64>(5, graph, &batch_size_node));
-  std::vector<string> batch_inputs(2);
-  batch_inputs[0] = map_node->name();
-  batch_inputs[1] = batch_size_node->name();
   NodeDef *batch_node;
-  TF_ASSERT_OK(graph_utils::AddNode("", "BatchDataset", batch_inputs,
-                                    empty_attributes, graph, &batch_node));
+  {
+    std::vector<string> batch_inputs(2);
+    batch_inputs[0] = map_node->name();
+    batch_inputs[1] = batch_size_node->name();
+    std::vector<std::pair<string, AttrValue>> batch_attrs(2);
+    AttrValue shapes_attr;
+    SetAttrValue("output_shapes", &shapes_attr);
+    batch_attrs[0] = std::make_pair("output_shapes", shapes_attr);
+    AttrValue types_attr;
+    SetAttrValue("output_types", &types_attr);
+    batch_attrs[1] = std::make_pair("output_types", types_attr);
+    TF_ASSERT_OK(graph_utils::AddNode("", "BatchDataset", batch_inputs,
+                                      batch_attrs, graph, &batch_node));
+  }
 
   MapAndBatchFusion optimizer;
   GraphDef output;
@@ -84,11 +102,17 @@ TEST(MapAndBatchFusionTest, FuseMapAndBatchNodesIntoOne) {
   NodeDef drop_remainder_node = output.node(
       graph_utils::FindNodeWithName(map_and_batch_node.input(4), output));
   EXPECT_EQ(drop_remainder_node.attr().at("value").tensor().bool_val(0), false);
+  EXPECT_TRUE(AreAttrValuesEqual(map_and_batch_node.attr().at("f"),
+                                 map_node->attr().at("f")));
+  EXPECT_TRUE(AreAttrValuesEqual(map_and_batch_node.attr().at("Targuments"),
+                                 map_node->attr().at("Targuments")));
+  EXPECT_TRUE(AreAttrValuesEqual(map_and_batch_node.attr().at("output_shapes"),
+                                 batch_node->attr().at("output_shapes")));
+  EXPECT_TRUE(AreAttrValuesEqual(map_and_batch_node.attr().at("output_types"),
+                                 batch_node->attr().at("output_types")));
 }
 
 TEST(MapAndBatchFusionTest, FuseParallelMapAndBatchNodesIntoOne) {
-  std::vector<std::pair<string, AttrValue>> empty_attributes;
-
   GrapplerItem item;
   GraphDef *graph = &item.graph;
   NodeDef *start_node;
@@ -102,9 +126,10 @@ TEST(MapAndBatchFusionTest, FuseParallelMapAndBatchNodesIntoOne) {
   range_inputs[0] = start_node->name();
   range_inputs[1] = stop_node->name();
   range_inputs[2] = step_node->name();
+  std::vector<std::pair<string, AttrValue>> range_attrs;
   NodeDef *range_node;
   TF_ASSERT_OK(graph_utils::AddNode("", "RangeDataset", range_inputs,
-                                    empty_attributes, graph, &range_node));
+                                    range_attrs, graph, &range_node));
   NodeDef *captured_input_node;
   TF_ASSERT_OK(graph_utils::AddScalarConstNode<StringPiece>(
       "hello", graph, &captured_input_node));
@@ -112,23 +137,41 @@ TEST(MapAndBatchFusionTest, FuseParallelMapAndBatchNodesIntoOne) {
   TF_ASSERT_OK(
       graph_utils::AddScalarConstNode<int>(2, graph, &num_parallel_calls_node));
 
-  std::vector<string> map_inputs(3);
-  map_inputs[0] = range_node->name();
-  map_inputs[1] = captured_input_node->name();
-  map_inputs[2] = num_parallel_calls_node->name();
   NodeDef *map_node;
-  TF_ASSERT_OK(graph_utils::AddNode("", "ParallelMapDataset", map_inputs,
-                                    empty_attributes, graph, &map_node));
+  {
+    std::vector<string> map_inputs(3);
+    map_inputs[0] = range_node->name();
+    map_inputs[1] = captured_input_node->name();
+    map_inputs[2] = num_parallel_calls_node->name();
+    std::vector<std::pair<string, AttrValue>> map_attrs(2);
+    AttrValue f_attr;
+    SetAttrValue("f", &f_attr);
+    map_attrs[0] = std::make_pair("f", f_attr);
+    AttrValue args_attr;
+    SetAttrValue("Targuments", &args_attr);
+    map_attrs[1] = std::make_pair("Targuments", args_attr);
+    TF_ASSERT_OK(graph_utils::AddNode("", "ParallelMapDataset", map_inputs,
+                                      map_attrs, graph, &map_node));
+  }
 
   NodeDef *batch_size_node;
   TF_ASSERT_OK(
       graph_utils::AddScalarConstNode<int64>(5, graph, &batch_size_node));
-  std::vector<string> batch_inputs(2);
-  batch_inputs[0] = map_node->name();
-  batch_inputs[1] = batch_size_node->name();
   NodeDef *batch_node;
-  TF_ASSERT_OK(graph_utils::AddNode("", "BatchDataset", batch_inputs,
-                                    empty_attributes, graph, &batch_node));
+  {
+    std::vector<string> batch_inputs(2);
+    batch_inputs[0] = map_node->name();
+    batch_inputs[1] = batch_size_node->name();
+    std::vector<std::pair<string, AttrValue>> batch_attrs(2);
+    AttrValue shapes_attr;
+    SetAttrValue("output_shapes", &shapes_attr);
+    batch_attrs[0] = std::make_pair("output_shapes", shapes_attr);
+    AttrValue types_attr;
+    SetAttrValue("output_types", &types_attr);
+    batch_attrs[1] = std::make_pair("output_types", types_attr);
+    TF_ASSERT_OK(graph_utils::AddNode("", "BatchDataset", batch_inputs,
+                                      batch_attrs, graph, &batch_node));
+  }
 
   MapAndBatchFusion optimizer;
   GraphDef output;
@@ -150,6 +193,14 @@ TEST(MapAndBatchFusionTest, FuseParallelMapAndBatchNodesIntoOne) {
   NodeDef drop_remainder_node = output.node(
       graph_utils::FindNodeWithName(map_and_batch_node.input(4), output));
   EXPECT_EQ(drop_remainder_node.attr().at("value").tensor().bool_val(0), false);
+  EXPECT_TRUE(AreAttrValuesEqual(map_and_batch_node.attr().at("f"),
+                                 map_node->attr().at("f")));
+  EXPECT_TRUE(AreAttrValuesEqual(map_and_batch_node.attr().at("Targuments"),
+                                 map_node->attr().at("Targuments")));
+  EXPECT_TRUE(AreAttrValuesEqual(map_and_batch_node.attr().at("output_shapes"),
+                                 batch_node->attr().at("output_shapes")));
+  EXPECT_TRUE(AreAttrValuesEqual(map_and_batch_node.attr().at("output_types"),
+                                 batch_node->attr().at("output_types")));
 }
 
 TEST(MapAndBatchFusionTest, NoChange) {
-- 
GitLab


From ce88b47799caa472509a34c6c2e4265e2d16ceb9 Mon Sep 17 00:00:00 2001
From: "Joshua V. Dillon" <jvdillon@google.com>
Date: Tue, 29 May 2018 17:42:37 -0700
Subject: [PATCH 031/610] Use absolute indexing in `fill_triangular`.

PiperOrigin-RevId: 198485926
---
 tensorflow/python/ops/distributions/util.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/ops/distributions/util.py b/tensorflow/python/ops/distributions/util.py
index 728fda28c2..1b2c8762a4 100644
--- a/tensorflow/python/ops/distributions/util.py
+++ b/tensorflow/python/ops/distributions/util.py
@@ -914,10 +914,11 @@ def fill_triangular(x, upper=False, name=None):
     #   = 2 (n**2 / 2 + n / 2) - n**2
     #   = n**2 + n - n**2
     #   = n
+    ndims = array_ops.rank(x) if x.shape.ndims is None else x.shape.ndims
     if upper:
-      x_list = [x, array_ops.reverse(x[..., n:], axis=[-1])]
+      x_list = [x, array_ops.reverse(x[..., n:], axis=[ndims - 1])]
     else:
-      x_list = [x[..., n:], array_ops.reverse(x, axis=[-1])]
+      x_list = [x[..., n:], array_ops.reverse(x, axis=[ndims - 1])]
     new_shape = (
         static_final_shape.as_list()
         if static_final_shape.is_fully_defined()
-- 
GitLab


From 7a4d278a3dbb71c0d707e2c5e99423489099f441 Mon Sep 17 00:00:00 2001
From: Alexander Gorban <gorban@google.com>
Date: Tue, 29 May 2018 17:51:13 -0700
Subject: [PATCH 032/610] Convenience functions to create TensorProto directly
 from data (std::vector).

PiperOrigin-RevId: 198486802
---
 tensorflow/core/framework/tensor_util.cc      |   9 ++
 tensorflow/core/framework/tensor_util.h       | 103 +++++++++++++
 tensorflow/core/framework/tensor_util_test.cc | 140 ++++++++++++++++++
 3 files changed, 252 insertions(+)

diff --git a/tensorflow/core/framework/tensor_util.cc b/tensorflow/core/framework/tensor_util.cc
index 8e3ac25512..65f6dc1c00 100644
--- a/tensorflow/core/framework/tensor_util.cc
+++ b/tensorflow/core/framework/tensor_util.cc
@@ -168,5 +168,14 @@ Status Split(const Tensor& tensor, const gtl::ArraySlice<int64>& sizes,
   return Status::OK();
 }
 
+namespace internal {
+void SetTensorProtoShape(std::vector<size_t> shape,
+                         TensorShapeProto* shape_proto) {
+  for (auto dim : shape) {
+    shape_proto->mutable_dim()->Add()->set_size(dim);
+  }
+}
+}  // namespace internal
+
 }  // namespace tensor
 }  // namespace tensorflow
diff --git a/tensorflow/core/framework/tensor_util.h b/tensorflow/core/framework/tensor_util.h
index 6c218b69e0..43d2d95311 100644
--- a/tensorflow/core/framework/tensor_util.h
+++ b/tensorflow/core/framework/tensor_util.h
@@ -17,6 +17,7 @@ limitations under the License.
 #define TENSORFLOW_FRAMEWORK_TENSOR_UTIL_H_
 
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.pb.h"
 
 #include <vector>
 namespace tensorflow {
@@ -54,6 +55,108 @@ Status Concat(const gtl::ArraySlice<Tensor>& tensors,
 Status Split(const Tensor& tensor, const gtl::ArraySlice<int64>& sizes,
              std::vector<Tensor>* result) TF_MUST_USE_RESULT;
 
+namespace internal {
+void SetTensorProtoShape(std::vector<size_t> shape,
+                         TensorShapeProto* shape_proto);
+
+// Defines value type dependent methods to manipulate `TensorProto`.
+// Class specializations have to define the following methods:
+//   static DataType GetDataType()
+//   static void AddValue(Type value, TensorProto* proto)
+template <typename Type>
+class TensorProtoHelper : public std::false_type {};
+
+template <>
+class TensorProtoHelper<string> : public std::true_type {
+ public:
+  static DataType GetDataType() { return DataType::DT_STRING; }
+  static void AddValue(const string& value, TensorProto* proto) {
+    *proto->mutable_string_val()->Add() = value;
+  }
+};
+
+template <>
+class TensorProtoHelper<int32> : public std::true_type {
+ public:
+  static DataType GetDataType() { return DataType::DT_INT32; }
+  static void AddValue(int32 value, TensorProto* proto) {
+    proto->mutable_int_val()->Add(value);
+  }
+};
+
+template <>
+class TensorProtoHelper<int64> : public std::true_type {
+ public:
+  static DataType GetDataType() { return DataType::DT_INT64; }
+  static void AddValue(int64 value, TensorProto* proto) {
+    proto->mutable_int64_val()->Add(value);
+  }
+};
+
+template <>
+class TensorProtoHelper<uint32> : public std::true_type {
+ public:
+  static DataType GetDataType() { return DataType::DT_UINT32; }
+  static void AddValue(uint32 value, TensorProto* proto) {
+    proto->mutable_uint32_val()->Add(value);
+  }
+};
+
+template <>
+class TensorProtoHelper<uint64> : public std::true_type {
+ public:
+  static DataType GetDataType() { return DataType::DT_UINT64; }
+  static void AddValue(uint64 value, TensorProto* proto) {
+    proto->mutable_uint64_val()->Add(value);
+  }
+};
+
+template <>
+class TensorProtoHelper<float> : public std::true_type {
+ public:
+  static DataType GetDataType() { return DataType::DT_FLOAT; }
+  static void AddValue(float value, TensorProto* proto) {
+    proto->mutable_float_val()->Add(value);
+  }
+};
+
+template <>
+class TensorProtoHelper<double> : public std::true_type {
+ public:
+  static DataType GetDataType() { return DataType::DT_DOUBLE; }
+  static void AddValue(double value, TensorProto* proto) {
+    proto->mutable_double_val()->Add(value);
+  }
+};
+
+template <>
+class TensorProtoHelper<bool> : public std::true_type {
+ public:
+  static DataType GetDataType() { return DataType::DT_BOOL; }
+  static void AddValue(bool value, TensorProto* proto) {
+    proto->mutable_bool_val()->Add(value);
+  }
+};
+}  // namespace internal
+
+// Creates a 'TensorProto' with specified shape and values.
+// The dtype and a field to represent data values of the returned 'TensorProto'
+// are determined based on type of the 'values' parameter.
+template <typename Type>
+typename std::enable_if<internal::TensorProtoHelper<Type>::value,
+                        TensorProto>::type
+CreateTensorProto(const std::vector<Type>& values,
+                  const std::vector<size_t>& shape) {
+  TensorProto tensor;
+  using TypeHelper = internal::TensorProtoHelper<Type>;
+  tensor.set_dtype(TypeHelper::GetDataType());
+  internal::SetTensorProtoShape(shape, tensor.mutable_tensor_shape());
+  for (const auto& value : values) {
+    TypeHelper::AddValue(value, &tensor);
+  }
+  return tensor;
+}
+
 }  // namespace tensor
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/framework/tensor_util_test.cc b/tensorflow/core/framework/tensor_util_test.cc
index 69eb8363b2..2b4e1cad2f 100644
--- a/tensorflow/core/framework/tensor_util_test.cc
+++ b/tensorflow/core/framework/tensor_util_test.cc
@@ -226,5 +226,145 @@ TEST(TensorUtil, ConcatSplitStrings) {
   }
 }
 
+TEST(TensorProtoUtil, CreatesStringTensorProto) {
+  std::vector<string> values{"a", "b", "c"};
+  std::vector<size_t> shape{1, 3};
+
+  auto proto = tensor::CreateTensorProto(values, shape);
+
+  EXPECT_EQ(proto.DebugString(),
+            "dtype: DT_STRING\n"
+            "tensor_shape {\n"
+            "  dim {\n"
+            "    size: 1\n"
+            "  }\n"
+            "  dim {\n"
+            "    size: 3\n"
+            "  }\n"
+            "}\n"
+            "string_val: \"a\"\n"
+            "string_val: \"b\"\n"
+            "string_val: \"c\"\n");
+}
+
+TEST(TensorProtoUtil, CreatesInt32TensorProto) {
+  std::vector<int32> values{1, 2};
+  std::vector<size_t> shape{2};
+
+  auto proto = tensor::CreateTensorProto(values, shape);
+
+  EXPECT_EQ(proto.DebugString(),
+            "dtype: DT_INT32\n"
+            "tensor_shape {\n"
+            "  dim {\n"
+            "    size: 2\n"
+            "  }\n"
+            "}\n"
+            "int_val: 1\n"
+            "int_val: 2\n");
+}
+
+TEST(TensorProtoUtil, CreatesInt64TensorProto) {
+  std::vector<int64> values{1, 2};
+  std::vector<size_t> shape{2};
+
+  auto proto = tensor::CreateTensorProto(values, shape);
+
+  EXPECT_EQ(proto.DebugString(),
+            "dtype: DT_INT64\n"
+            "tensor_shape {\n"
+            "  dim {\n"
+            "    size: 2\n"
+            "  }\n"
+            "}\n"
+            "int64_val: 1\n"
+            "int64_val: 2\n");
+}
+
+TEST(TensorProtoUtil, CreatesUInt32TensorProto) {
+  std::vector<uint32> values{1, 2};
+  std::vector<size_t> shape{2};
+
+  auto proto = tensor::CreateTensorProto(values, shape);
+
+  EXPECT_EQ(proto.DebugString(),
+            "dtype: DT_UINT32\n"
+            "tensor_shape {\n"
+            "  dim {\n"
+            "    size: 2\n"
+            "  }\n"
+            "}\n"
+            "uint32_val: 1\n"
+            "uint32_val: 2\n");
+}
+
+TEST(TensorProtoUtil, CreatesUInt64TensorProto) {
+  std::vector<uint64> values{1, 2};
+  std::vector<size_t> shape{2};
+
+  auto proto = tensor::CreateTensorProto(values, shape);
+
+  EXPECT_EQ(proto.DebugString(),
+            "dtype: DT_UINT64\n"
+            "tensor_shape {\n"
+            "  dim {\n"
+            "    size: 2\n"
+            "  }\n"
+            "}\n"
+            "uint64_val: 1\n"
+            "uint64_val: 2\n");
+}
+
+TEST(TensorProtoUtil, CreatesFloatTensorProto) {
+  std::vector<float> values{1.1, 2.2};
+  std::vector<size_t> shape{2};
+
+  auto proto = tensor::CreateTensorProto(values, shape);
+
+  EXPECT_EQ(proto.DebugString(),
+            "dtype: DT_FLOAT\n"
+            "tensor_shape {\n"
+            "  dim {\n"
+            "    size: 2\n"
+            "  }\n"
+            "}\n"
+            "float_val: 1.1\n"
+            "float_val: 2.2\n");
+}
+
+TEST(TensorProtoUtil, CreatesDoubleTensorProto) {
+  std::vector<double> values{1.1, 2.2};
+  std::vector<size_t> shape{2};
+
+  auto proto = tensor::CreateTensorProto(values, shape);
+
+  EXPECT_EQ(proto.DebugString(),
+            "dtype: DT_DOUBLE\n"
+            "tensor_shape {\n"
+            "  dim {\n"
+            "    size: 2\n"
+            "  }\n"
+            "}\n"
+            "double_val: 1.1\n"
+            "double_val: 2.2\n");
+}
+
+TEST(TensorProtoUtil, CreatesBoolTensorProto) {
+  std::vector<bool> values{true, false};
+  std::vector<size_t> shape{2};
+
+  auto proto = tensor::CreateTensorProto(values, shape);
+
+  EXPECT_EQ(proto.DebugString(),
+            "dtype: DT_BOOL\n"
+            "tensor_shape {\n"
+            "  dim {\n"
+            "    size: 2\n"
+            "  }\n"
+            "}\n"
+            "bool_val: true\n"
+            "bool_val: false\n");
+}
+
 }  // namespace
 }  // namespace tensorflow
-- 
GitLab


From 2c75dbfd2d37a3c06d34cc4b12682a63a75503f7 Mon Sep 17 00:00:00 2001
From: Jiri Simsa <jsimsa@google.com>
Date: Tue, 29 May 2018 18:10:27 -0700
Subject: [PATCH 033/610] Making RPC op handle the case where cancellation
 manager is not initialized in OpKernelContext.

Fixes #19496

PiperOrigin-RevId: 198488860
---
 tensorflow/core/util/rpc/call_container.h | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/tensorflow/core/util/rpc/call_container.h b/tensorflow/core/util/rpc/call_container.h
index e1226a7f16..39ead10815 100644
--- a/tensorflow/core/util/rpc/call_container.h
+++ b/tensorflow/core/util/rpc/call_container.h
@@ -102,7 +102,9 @@ CallContainer<Call>::CallContainer(
     typename CallContainer<Call>::StartCallFn start_call_fn)
     : ctx_(ctx),
       done_(std::move(done)),
-      token_(ctx->cancellation_manager()->get_cancellation_token()),
+      token_(ctx->cancellation_manager() != nullptr
+                 ? ctx->cancellation_manager()->get_cancellation_token()
+                 : CancellationManager::kInvalidToken),
       fail_fast_(fail_fast),
       try_rpc_(try_rpc),
       callback_destroyed_(new Notification) {
@@ -110,7 +112,9 @@ CallContainer<Call>::CallContainer(
 
   // This will run when all RPCs are finished.
   reffed_status_callback_ = new ReffedStatusCallback([this](const Status& s) {
-    ctx_->cancellation_manager()->DeregisterCallback(token_);
+    if (token_ != CancellationManager::kInvalidToken) {
+      ctx_->cancellation_manager()->DeregisterCallback(token_);
+    }
     ctx_->SetStatus(s);
     done_();
     callback_destroyed_->WaitForNotification();
@@ -125,11 +129,14 @@ CallContainer<Call>::CallContainer(
   std::shared_ptr<internal::NotifyWhenDestroyed> notify_when_destroyed(
       new internal::NotifyWhenDestroyed(callback_destroyed_));
   std::shared_ptr<Notification> calls_started(new Notification);
-  bool is_cancelled = !ctx_->cancellation_manager()->RegisterCallback(
-      token_, [this, calls_started, notify_when_destroyed]() {
-        calls_started->WaitForNotification();
-        StartCancel();
-      });
+  bool is_cancelled = false;
+  if (token_ != CancellationManager::kInvalidToken) {
+    is_cancelled = !ctx_->cancellation_manager()->RegisterCallback(
+        token_, [this, calls_started, notify_when_destroyed]() {
+          calls_started->WaitForNotification();
+          StartCancel();
+        });
+  }
 
   for (int i = 0; i < num_calls; ++i) {
     create_call_fn(this, i);
-- 
GitLab


From 02ba49573008c22758fb90c8e26dde24406c1584 Mon Sep 17 00:00:00 2001
From: James Qin <jamesqin@google.com>
Date: Tue, 29 May 2018 18:17:19 -0700
Subject: [PATCH 034/610] Remove unnecessary shape registration fn from cudnn
 rnn ops.

The registered ones are the same as default.

PiperOrigin-RevId: 198489529
---
 tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py b/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
index ed0a26bbd8..8822a7523f 100644
--- a/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
+++ b/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
@@ -20,7 +20,6 @@ from __future__ import print_function
 import os
 from tensorflow.contrib.checkpoint.python import split_dependency
 from tensorflow.contrib.rnn.python.ops import lstm_ops
-from tensorflow.python.framework import common_shapes
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
@@ -1647,10 +1646,3 @@ class CudnnRNNRelu(_CudnnRNNNoInputC):
   # 1 set of weight and bias parameters for the recurrent input, and 1 for the
   # previous layer input.
   _NUM_PARAMS_PER_LAYER = CUDNN_RNN_RELU_PARAMS_PER_LAYER
-
-
-ops.RegisterShape("CudnnRNNParamsSize")(common_shapes.call_cpp_shape_fn)
-ops.RegisterShape("CudnnRNNParamsToCanonical")(common_shapes.call_cpp_shape_fn)
-ops.RegisterShape("CudnnRNNCanonicalToParams")(common_shapes.call_cpp_shape_fn)
-ops.RegisterShape("CudnnRNN")(common_shapes.call_cpp_shape_fn)
-ops.RegisterShape("CudnnRNNBackprop")(common_shapes.call_cpp_shape_fn)
-- 
GitLab


From 28cec60df3397ed16c9897a2d1e26eea622ad3be Mon Sep 17 00:00:00 2001
From: Justin Lebar <jlebar@google.com>
Date: Tue, 29 May 2018 19:07:32 -0700
Subject: [PATCH 035/610] [XLA] Minor HloSharding cleanups.

Delete dead code in HloSharding::ToString(), and add and use proper
hasher struct.

PiperOrigin-RevId: 198493972
---
 tensorflow/compiler/xla/service/hlo_graph_dumper.cc | 8 ++++----
 tensorflow/compiler/xla/service/hlo_sharding.cc     | 3 ---
 tensorflow/compiler/xla/service/hlo_sharding.h      | 9 +++++++++
 3 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
index a2cb21c09b..efdeb6c64f 100644
--- a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
+++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
@@ -427,7 +427,8 @@ class HloDotDumper {
 
   // When coloring by sharding information, we track the sharding string
   // representation to color association, by round-robin the color schemes.
-  std::unordered_map<string, ColorScheme> sharding_colors_;
+  std::unordered_map<HloSharding, ColorScheme, HloSharding::Hasher>
+      sharding_colors_;
   int64 next_shard_color_ = 0;
 };
 
@@ -882,14 +883,13 @@ ColorScheme HloDotDumper::GetInstructionColor(const HloInstruction* instr) {
     if (!instr->has_sharding()) {
       return kDashedBorder;
     }
-    string shard_str = instr->sharding().ToString();
-    auto it = sharding_colors_.find(shard_str);
+    auto it = sharding_colors_.find(instr->sharding());
     if (it != sharding_colors_.end()) {
       return it->second;
     }
     ColorScheme color = static_cast<ColorScheme>(
         kBlue + (next_shard_color_++ % (kDashedBorder - kBlue)));
-    sharding_colors_.emplace(shard_str, color);
+    sharding_colors_.emplace(instr->sharding(), color);
     return color;
   }
   const auto kParameterColor = kOrange;
diff --git a/tensorflow/compiler/xla/service/hlo_sharding.cc b/tensorflow/compiler/xla/service/hlo_sharding.cc
index 7f7e3f7dab..7708422ce1 100644
--- a/tensorflow/compiler/xla/service/hlo_sharding.cc
+++ b/tensorflow/compiler/xla/service/hlo_sharding.cc
@@ -49,9 +49,6 @@ string HloSharding::ToString() const {
     return StrCat("{", tensorflow::str_util::Join(parts, ", "), "}");
   }
 
-  string result = StrCat("{", (replicated_ ? " replicated" : ""),
-                         (maximal_ ? " maximal" : ""));
-
   if (replicated_) {
     return "{replicated}";
   } else if (maximal_) {
diff --git a/tensorflow/compiler/xla/service/hlo_sharding.h b/tensorflow/compiler/xla/service/hlo_sharding.h
index 2b8e757f42..e8bb06c8f7 100644
--- a/tensorflow/compiler/xla/service/hlo_sharding.h
+++ b/tensorflow/compiler/xla/service/hlo_sharding.h
@@ -99,6 +99,9 @@ class HloSharding {
   static bool IsReservedDevice(int64 device) { return device < 0; }
 
   OpSharding ToProto() const;
+
+  // Note that this string canonically has outer curly braces, e.g.
+  // "{replicated}".
   string ToString() const;
 
   // Validate that this sharding can be applied to a tensor with shape `shape`.
@@ -208,6 +211,12 @@ class HloSharding {
     return h;
   }
 
+  struct Hasher {
+    size_t operator()(const HloSharding& sharding) const {
+      return sharding.Hash();
+    }
+  };
+
   // Gets the tile shape.
   // REQUIRES: !IsTileMaximal() && !IsTuple()
   const Shape& tile_shape() const { return tile_shape_; }
-- 
GitLab


From a364bc51405c0dbebe97c723fba8f877696205cc Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 29 May 2018 19:50:19 -0700
Subject: [PATCH 036/610] Do not allow cross computation instruction lookups in
 HLO parser.

PiperOrigin-RevId: 198496653
---
 .../compiler/xla/tools/parser/hlo_parser.cc   |  1 +
 .../xla/tools/parser/hlo_parser_test.cc       | 36 +++++++++++--------
 2 files changed, 22 insertions(+), 15 deletions(-)

diff --git a/tensorflow/compiler/xla/tools/parser/hlo_parser.cc b/tensorflow/compiler/xla/tools/parser/hlo_parser.cc
index e990b6aba8..76c870bc98 100644
--- a/tensorflow/compiler/xla/tools/parser/hlo_parser.cc
+++ b/tensorflow/compiler/xla/tools/parser/hlo_parser.cc
@@ -389,6 +389,7 @@ bool HloParser::ParseComputation(HloComputation** entry_computation) {
     }
     *entry_computation = computation;
   }
+  instruction_pool_.clear();
 
   return AddComputation(name, computation, name_loc);
 }
diff --git a/tensorflow/compiler/xla/tools/parser/hlo_parser_test.cc b/tensorflow/compiler/xla/tools/parser/hlo_parser_test.cc
index 131aded95a..183b1121cd 100644
--- a/tensorflow/compiler/xla/tools/parser/hlo_parser_test.cc
+++ b/tensorflow/compiler/xla/tools/parser/hlo_parser_test.cc
@@ -1314,21 +1314,6 @@ ENTRY consts {
                   "one computation should have only one ROOT");
 }
 
-TEST_F(HloParserTest, InstructionExists) {
-  const string original = R"(HloModule comp_exists
-c1 {
-  instr = f32[1]{0} constant({12345})
-}
-c2 {
-  instr = f32[1]{0} constant({67890})
-})";
-
-  ExpectHasSubstr(Parse(original).status().error_message(),
-                  R"(was parsing 3:3: error: instruction previously defined here
-  instr = f32[1]{0} constant({12345})
-  ^)");
-}
-
 TEST_F(HloParserTest, ComputationExists) {
   const string original = R"(HloModule comp_exists
 comp {
@@ -1343,6 +1328,27 @@ comp {
 ^)");
 }
 
+TEST_F(HloParserTest, CrossComputationLookup) {
+  const string original = R"(HloModule cross_computation_lookup:
+tcalla (a: (s32[], s32[])) -> (s32[], s32[]) {
+  ROOT aparam = (s32[], s32[]) parameter(0)
+}
+
+tcallb (b: (s32[], s32[])) -> s32[] {
+  rparam = (s32[], s32[]) parameter(0)
+  ROOT gte0 = s32[] get-tuple-element(aparam), index=0
+}
+
+ENTRY entry {
+  param = (s32[], s32[]) parameter(0)
+  call0 = (s32[], s32[]) call(param), to_apply=tcalla
+  ROOT call1 = s32[] call(param), to_apply=tcallb
+})";
+  ExpectHasSubstr(
+      Parse(original).status().error_message(),
+      "was parsing 8:39: error: instruction does not exist: aparam");
+}
+
 }  // namespace
 }  // namespace tools
 }  // namespace xla
-- 
GitLab


From 9845e6ba999e623a7206914f90e702b45c4e6a7c Mon Sep 17 00:00:00 2001
From: Sami Kama <skama@nvidia.com>
Date: Tue, 29 May 2018 20:59:21 -0700
Subject: [PATCH 037/610] Fix wiring issues due to shared inputs and outputs

---
 .../contrib/tensorrt/convert/convert_graph.cc | 60 +++++++++-------
 .../contrib/tensorrt/convert/convert_nodes.cc | 69 +++++++++++++------
 2 files changed, 82 insertions(+), 47 deletions(-)

diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.cc b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
index b7b26cfb1c..5f79f6d108 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_graph.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
@@ -91,8 +91,11 @@ void GetSubGraphIncomingEdges(const tensorflow::Graph& graph,
       if (!subgraph_node_ids.count(edge->src()->id()) &&
           !edge->src()->IsSource() && !edge->IsControlEdge()) {
         incoming_edges->insert(edge);
+        VLOG(2) << "INCOMING " << edge->src()->name() << " -> " << node->name()
+                << " Y, ";
       } else {
-        VLOG(2) << node->name() << " -> " << edge->src()->name() << " N, ";
+        VLOG(2) << "INCOMING " << edge->src()->name() << " -> " << node->name()
+                << " N, ";
       }
     }
   }
@@ -106,10 +109,12 @@ void GetSubGraphOutgoingEdges(const tensorflow::Graph& graph,
     for (const tensorflow::Edge* edge : node->out_edges()) {
       if (!subgraph_node_ids.count(edge->dst()->id()) &&
           !edge->dst()->IsSink() && !edge->IsControlEdge()) {
-        VLOG(2) << node->name() << " -> " << edge->dst()->name() << " Y, ";
+        VLOG(2) << "OUTGOING " << node->name() << " -> " << edge->dst()->name()
+                << " Y, ";
         outgoing_edges->insert(edge);
       } else {
-        VLOG(2) << node->name() << " -> " << edge->dst()->name() << " N, ";
+        VLOG(2) << "OUTGOING " << node->name() << " -> " << edge->dst()->name()
+                << " N, ";
       }
     }
   }
@@ -181,29 +186,21 @@ struct ConvertGraphParams {
 static tensorflow::Status FillSubGraphEdgeSets(ConvertGraphParams* p) {
   GetSubGraphIncomingEdges(p->graph, p->subgraph_node_ids,
                            &p->subgraph_incoming_edges);
+  std::set<std::pair<int, int>> unique_tensors;
   for (const tensorflow::Edge* edge : p->subgraph_incoming_edges) {
-    p->subgraph_inputs.push_back({edge->src()->id(), edge->src_output()});
-  }
-  auto output_name_to_index_map = BuildTensorNameMap(p->output_names);
-  std::set<std::pair<int, int>> subgraph_outputs_set;
-  // Collect outputs referenced from output_names
-  for (int node_id : p->subgraph_node_ids) {
-    tensorflow::Node* node = p->graph.FindNodeId(node_id);
-    if (output_name_to_index_map.count(node->name())) {
-      for (int index : output_name_to_index_map.at(node->name())) {
-        subgraph_outputs_set.insert({node_id, index});
-      }
-    }
+    unique_tensors.insert({edge->src()->id(), edge->src_output()});
   }
+  p->subgraph_inputs.insert(p->subgraph_inputs.begin(), unique_tensors.begin(),
+                            unique_tensors.end());
   GetSubGraphOutgoingEdges(p->graph, p->subgraph_node_ids,
                            &p->subgraph_outgoing_edges);
+  unique_tensors.clear();
   for (const tensorflow::Edge* edge : p->subgraph_outgoing_edges) {
-    subgraph_outputs_set.insert({edge->src()->id(), edge->src_output()});
+    unique_tensors.insert({edge->src()->id(), edge->src_output()});
   }
-  p->subgraph_outputs.reserve(subgraph_outputs_set.size());
+  p->subgraph_outputs.reserve(unique_tensors.size());
   p->subgraph_outputs.insert(p->subgraph_outputs.begin(),
-                             subgraph_outputs_set.begin(),
-                             subgraph_outputs_set.end());
+                             unique_tensors.begin(), unique_tensors.end());
   return tensorflow::Status::OK();
 }
 
@@ -257,19 +254,24 @@ tensorflow::Status ConvertSubGraphToTensorRT(ConvertGraphParams* params) {
   for (size_t i = 0; i < params->subgraph_inputs.size(); ++i) {
     subgraph_edge_to_input_map.insert({params->subgraph_inputs.at(i), i});
   }
+  std::set<std::pair<int, int>> unique_tensors;
   for (const tensorflow::Edge* edge : params->subgraph_incoming_edges) {
     std::pair<int, int> old_src = {edge->src()->id(), edge->src_output()};
+    if (unique_tensors.count(old_src)) continue;
+    unique_tensors.insert(old_src);
     int new_src_output = subgraph_edge_to_input_map.at(old_src);
     params->graph.AddEdge(edge->src(), edge->src_output(), trt_node,
                           new_src_output);
+    VLOG(1) << "Wire " << edge->src()->name() << ":" << edge->src_output()
+            << " -> " << trt_node->name() << ":" << new_src_output;
     params->graph.RemoveEdge(edge);
   }
-
-  VLOG(2) << "new wiring edges: " << trt_node->in_edges().size();
-  for (const tensorflow::Edge* edge : trt_node->in_edges()) {
-    VLOG(2) << edge->src()->name() << " port: " << edge->src_output();
+  if (VLOG_IS_ON(2)) {
+    VLOG(2) << "new edge count: " << trt_node->in_edges().size();
+    for (const tensorflow::Edge* edge : trt_node->in_edges()) {
+      VLOG(2) << edge->src()->name() << " port: " << edge->src_output();
+    }
   }
-
   TF_RETURN_IF_ERROR(status);
 
   // Re-map outgoing edges to use the new TRT node instead of the orig subgraph
@@ -278,11 +280,14 @@ tensorflow::Status ConvertSubGraphToTensorRT(ConvertGraphParams* params) {
     subgraph_edge_to_output_map.insert({params->subgraph_outputs.at(i), i});
   }
   TF_RETURN_IF_ERROR(status);
+  unique_tensors.clear();
   for (const tensorflow::Edge* edge : params->subgraph_outgoing_edges) {
     std::pair<int, int> old_src = {edge->src()->id(), edge->src_output()};
     int new_src_output = subgraph_edge_to_output_map.at(old_src);
     TF_RETURN_IF_ERROR(params->graph.UpdateEdge(
         trt_node, new_src_output, edge->dst(), edge->dst_input()));
+    VLOG(1) << "Wire " << trt_node->name() << ":" << new_src_output << " -> "
+            << edge->dst()->name() << ":" << edge->dst_input();
   }
   // Remove the original subgraph
   for (int node_id : params->subgraph_node_ids) {
@@ -317,9 +322,12 @@ tensorflow::Status ConvertCalibGraphToInferGraph(
       tensorflow::GraphConstructorOptions(), graph_def, &graph));
   //  get calib nodes
   std::vector<tensorflow::Node*> calib_nodes;
-  for (auto node : graph.op_nodes()) {
+  std::vector<tensorflow::Node*> topo_order;
+  tensorflow::GetPostOrder(graph, &topo_order);
+  for (auto rit = topo_order.rbegin(); rit != topo_order.rend(); ++rit) {
+    auto node = *rit;
     if (node->type_string() == "TRTCalibOp") {
-      VLOG(1) << "Found Calib Node";
+      VLOG(1) << "Found Calib Node " << node->name();
       calib_nodes.push_back(node);
     }
   }
diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
index 32b211dcd1..16bfcc32a3 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
@@ -362,10 +362,11 @@ void ReorderCKtoKC(const TRT_ShapedWeights& iweights,
       break;
     }
     case tensorflow::DataType::DT_HALF: {
-      Reorder2({k, c}, static_cast<Eigen::half const*>(iweights.GetValues()),
-               istrides, static_cast<Eigen::half*>(
-                             const_cast<void*>(oweights->GetValues())),
-               ostrides);
+      Reorder2(
+          {k, c}, static_cast<Eigen::half const*>(iweights.GetValues()),
+          istrides,
+          static_cast<Eigen::half*>(const_cast<void*>(oweights->GetValues())),
+          ostrides);
       break;
     }
     default:
@@ -1179,9 +1180,9 @@ tensorflow::Status BinaryTensorOpTensor(
   CHECK_EQ_TYPE(tensor_r->getType(), dtype);
   auto op_pair = ops.find(node_def.op());
   if (op_pair == ops.end())
-    return tensorflow::errors::Unimplemented("binary op: " + node_def.op() +
-                                             " not supported at: " +
-                                             node_def.name());
+    return tensorflow::errors::Unimplemented(
+        "binary op: " + node_def.op() +
+        " not supported at: " + node_def.name());
 
   nvinfer1::IElementWiseLayer* layer = ctx.network()->addElementWise(
       *const_cast<nvinfer1::ITensor*>(tensor_l),
@@ -2138,9 +2139,7 @@ void Converter::register_op_converters() {
 }
 
 }  // namespace
-tensorflow::Status GetTensorRTGraph(tensorrt::convert::SubGraphParams& s) {
-  return tensorflow::errors::Unimplemented("Not implemented yet");
-}
+
 tensorflow::Status ConvertCalibrationNodeToEngineNode(
     tensorflow::Graph& graph, tensorflow::Node* c_node) {
   const auto ndef = c_node->def();
@@ -2164,9 +2163,23 @@ tensorflow::Status ConvertCalibrationNodeToEngineNode(
   for (auto n : graph.op_nodes()) {
     node_maps.insert({n->name(), n});
   }
-  VLOG(1) << "Output Nodes:";
+  std::set<int> subgraph_ids;
+  for (const auto internal_node : segment_nodes) {
+    subgraph_ids.insert(node_maps.at(internal_node)->id());
+  }
+  if (VLOG_IS_ON(2)) {
+    string node_names = StrCat(c_node->name(), " segment nodes= ");
+
+    for (const auto& node_name : segment_nodes) {
+      StrAppend(&node_names, node_name, ", ");
+    }
+    VLOG(2) << node_names;
+  }
+
+  VLOG(0) << "Output Nodes:";
   std::vector<tensorflow::DataType> out_types;
   std::vector<const tensorflow::Edge*> out_edges;
+
   for (auto& i : output_nodes) {
     auto node_port = tensorflow::str_util::Split(i, ":");
     VLOG(1) << " " << i << " in graph " << node_maps.count(i);
@@ -2186,9 +2199,13 @@ tensorflow::Status ConvertCalibrationNodeToEngineNode(
         out_types.push_back(out_node->output_type(0));
       }
       for (auto out_edge : out_node->out_edges()) {
+        if (subgraph_ids.count(out_edge->dst()->id()))
+          continue;  // skip internal edges;
         if (out_edge->src_output() == port) {
           out_edges.push_back(out_edge);
-          break;
+          VLOG(1) << "OUTPUT EDGE " << out_edge->src()->name() << ":"
+                  << out_edge->src_output() << " -> " << out_edge->dst()->name()
+                  << ":" << out_edge->dst_input();
         }
       }
     } else {
@@ -2255,13 +2272,18 @@ tensorflow::Status ConvertCalibrationNodeToEngineNode(
   }
   auto trt_engine_node = graph.AddNode(engine_node, &status);
   TF_RETURN_IF_ERROR(status);
-  for (size_t i = 0; i < out_edges.size(); i++) {
-    VLOG(1) << "Connecting trt_engine_node output " << i << " with "
-            << out_edges.at(i)->dst()->name() << " port "
-            << out_edges.at(i)->dst_input();
-    TF_RETURN_IF_ERROR(graph.UpdateEdge(trt_engine_node, i,
-                                        out_edges.at(i)->dst(),
-                                        out_edges.at(i)->dst_input()));
+  std::map<string, int> port_map;
+  for (size_t t = 0; t < output_nodes.size(); t++) {
+    port_map.insert({output_nodes.at(t), t});
+  }
+  for (auto& i : out_edges) {
+    string s(i->src()->name());
+    if (i->src_output()) StrAppend(&s, ":", i->src_output());
+    int out_port = port_map.at(s);
+    VLOG(1) << "Connecting " << trt_engine_node->name() << " port " << out_port
+            << " with " << i->dst()->name() << " port " << i->dst_input();
+    TF_RETURN_IF_ERROR(
+        graph.UpdateEdge(trt_engine_node, out_port, i->dst(), i->dst_input()));
   }
   VLOG(1) << "Segment nodes:";
   for (auto& i : segment_nodes) {
@@ -2332,6 +2354,7 @@ tensorflow::Status ConvertSubgraph(
     std::vector<string>* output_names,
     std::vector<tensorflow::DataType>* output_dtypes,
     const string& engine_name) {
+  std::set<string> added_tensors;
   for (const std::pair<int, int>& input : s.input_inds) {
     VLOG(2) << "parsing input. Node id= " << input.first;
     int node_id = input.first;
@@ -2374,7 +2397,6 @@ tensorflow::Status ConvertSubgraph(
 
     auto op_info = op_info_vec.at(shape_inference_output_idx);
     tensorflow::DataType tf_dtype = op_info.dtype();
-    input_dtypes->push_back(tf_dtype);
 
     nvinfer1::DataType dtype(nvinfer1::DataType::kFLOAT);
     auto type_status = ConvertDType(tf_dtype, &dtype);
@@ -2410,8 +2432,10 @@ tensorflow::Status ConvertSubgraph(
     if (output_idx != 0) {
       input_tensor_name = StrCat(node_name, ":", output_idx);
     }
-
+    if (added_tensors.count(input_tensor_name)) continue;
+    added_tensors.insert(input_tensor_name);
     input_names->push_back(input_tensor_name);
+    input_dtypes->push_back(tf_dtype);
     nvinfer1::ITensor* input_tensor = converter.network()->addInput(
         input_tensor_name.c_str(), dtype, input_dim_pseudo_chw);
 
@@ -2435,6 +2459,7 @@ tensorflow::Status ConvertSubgraph(
 
   // Gather output metadata
   int trt_engine_op_output_idx = 0;
+  added_tensors.clear();
   for (const std::pair<int, int>& output : s.output_inds) {
     int node_id = output.first;
     int output_idx = output.second;
@@ -2451,6 +2476,8 @@ tensorflow::Status ConvertSubgraph(
     if (output_idx != 0)
       tensorflow::strings::StrAppend(&tensor_name, ":", output_idx);
     VLOG(2) << "Output tensor name: " << tensor_name;
+    if (added_tensors.count(tensor_name)) continue;
+    added_tensors.insert(tensor_name);
     output_names->push_back(tensor_name);
     auto tensor_or_weights = converter.get_tensor(tensor_name);
     if (!tensor_or_weights.is_tensor()) {
-- 
GitLab


From 412a1b57d5764f0feabe2b6067273d298b6afd04 Mon Sep 17 00:00:00 2001
From: Sami Kama <skama@nvidia.com>
Date: Tue, 29 May 2018 21:00:22 -0700
Subject: [PATCH 038/610] Import tensorrt, if available, in
 import_pb_to_tensorboard.py to display TensorRT ops

---
 tensorflow/python/tools/import_pb_to_tensorboard.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/tensorflow/python/tools/import_pb_to_tensorboard.py b/tensorflow/python/tools/import_pb_to_tensorboard.py
index 00de044505..d1f9cd87b3 100755
--- a/tensorflow/python/tools/import_pb_to_tensorboard.py
+++ b/tensorflow/python/tools/import_pb_to_tensorboard.py
@@ -29,6 +29,13 @@ from tensorflow.python.platform import app
 from tensorflow.python.platform import gfile
 from tensorflow.python.summary import summary
 
+# Try importing TensorRT ops if available
+# pylint: disable=unused-import,trailing-whitespace
+try:
+  import tensorflow.contrib.tensorrt as trt 
+except ImportError:
+  pass
+# pylint: enable=unused-import,trailing-whitespace
 
 def import_to_tensorboard(model_dir, log_dir):
   """View an imported protobuf model (`.pb` file) as a graph in Tensorboard.
-- 
GitLab


From 3f2ba2edf62dc394cfcb4b2606f1638389aa92e2 Mon Sep 17 00:00:00 2001
From: Bjarke Hammersholt Roune <broune@google.com>
Date: Tue, 29 May 2018 21:10:43 -0700
Subject: [PATCH 039/610] Add features to HloRunner for running while leaving
 buffers on the device, and add an option to test_utils for generating
 more-boring data much faster.

PiperOrigin-RevId: 198502753
---
 tensorflow/compiler/xla/service/hlo_runner.cc | 137 ++++++++++++------
 tensorflow/compiler/xla/service/hlo_runner.h  |  23 ++-
 tensorflow/compiler/xla/tests/test_utils.cc   |  35 +++--
 tensorflow/compiler/xla/tests/test_utils.h    |  18 ++-
 4 files changed, 150 insertions(+), 63 deletions(-)

diff --git a/tensorflow/compiler/xla/service/hlo_runner.cc b/tensorflow/compiler/xla/service/hlo_runner.cc
index 7127adf456..31e13da0c0 100644
--- a/tensorflow/compiler/xla/service/hlo_runner.cc
+++ b/tensorflow/compiler/xla/service/hlo_runner.cc
@@ -92,53 +92,58 @@ HloRunner::HloRunner(se::Platform* platform) {
 
 HloRunner::~HloRunner() {}
 
-StatusOr<std::unique_ptr<Literal>> HloRunner::Execute(
-    std::unique_ptr<HloModule> module,
-    const tensorflow::gtl::ArraySlice<Literal*> arguments, bool run_hlo_passes,
-    ExecutionProfile* profile) {
-  TF_ASSIGN_OR_RETURN(std::unique_ptr<Executable> executable,
-                      CreateExecutable(std::move(module), run_hlo_passes));
-  se::Stream stream(backend().default_stream_executor());
-  stream.Init();
-
-  ServiceExecutableRunOptions service_run_options(GetServiceRunOptionsForDevice(
-      backend().default_device_ordinal(), &stream, nullptr));
-  const ExecutableRunOptions& run_options = service_run_options.run_options();
+StatusOr<ScopedShapedBuffer> HloRunner::TransferLiteralToDevice(
+    const Literal& literal) {
+  TF_ASSIGN_OR_RETURN(ScopedShapedBuffer buffer,
+                      backend().transfer_manager()->AllocateScopedShapedBuffer(
+                          literal.shape(), backend().memory_allocator(),
+                          backend().default_device_ordinal()));
+  TF_RETURN_IF_ERROR(backend().transfer_manager()->TransferLiteralToDevice(
+      backend().default_stream_executor(), literal, buffer));
+  return std::move(buffer);
+}
 
-  // Copy arguments to device.
-  std::vector<ScopedShapedBuffer> argument_buffers;
-  for (Literal* argument : arguments) {
-    TF_ASSIGN_OR_RETURN(
-        ScopedShapedBuffer argument_buffer,
-        backend().transfer_manager()->AllocateScopedShapedBuffer(
-            argument->shape(), run_options.allocator(),
-            run_options.device_ordinal()));
-    TF_RETURN_IF_ERROR(backend().transfer_manager()->TransferLiteralToDevice(
-        stream.parent(), *argument, argument_buffer));
-    argument_buffers.push_back(std::move(argument_buffer));
+StatusOr<std::vector<ScopedShapedBuffer>> HloRunner::TransferLiteralsToDevice(
+    const tensorflow::gtl::ArraySlice<const Literal*> literals) {
+  std::vector<ScopedShapedBuffer> buffers;
+  for (const Literal* literal : literals) {
+    CHECK(literal != nullptr);
+    TF_ASSIGN_OR_RETURN(ScopedShapedBuffer buffer,
+                        TransferLiteralToDevice(*literal));
+    buffers.push_back(std::move(buffer));
   }
+  return std::move(buffers);
+}
 
-  std::vector<const ShapedBuffer*> argument_buffer_ptrs;
-  argument_buffer_ptrs.reserve(argument_buffers.size());
-  for (const auto& buf : argument_buffers) {
-    argument_buffer_ptrs.push_back(&buf);
+StatusOr<std::vector<ScopedShapedBuffer>> HloRunner::TransferLiteralsToDevice(
+    const tensorflow::gtl::ArraySlice<std::unique_ptr<Literal>> literals) {
+  std::vector<const Literal*> literal_pointers;
+  literal_pointers.reserve(literals.size());
+  for (const auto& literal : literals) {
+    literal_pointers.push_back(literal.get());
   }
+  return TransferLiteralsToDevice(literal_pointers);
+}
 
-  TF_ASSIGN_OR_RETURN(
-      ScopedShapedBuffer result,
-      executable->ExecuteOnStreamWrapper(
-          &service_run_options, /*profile=*/profile, argument_buffer_ptrs));
-
-  auto result_literal = backend().transfer_manager()->TransferLiteralFromDevice(
-      stream.parent(), result);
-  if (result_literal.ok()) {
-    VLOG(4) << "Executed binary and got result: "
-            << result_literal.ValueOrDie()->ToString();
-  } else {
-    VLOG(4) << "Executed binary and got status: "
-            << result_literal.status().ToString();
-  }
-  return result_literal;
+StatusOr<std::unique_ptr<Literal>> HloRunner::TransferLiteralFromDevice(
+    const ShapedBuffer& buffer) {
+  return backend().transfer_manager()->TransferLiteralFromDevice(
+      backend().default_stream_executor(), buffer);
+}
+
+StatusOr<std::unique_ptr<Literal>> HloRunner::Execute(
+    std::unique_ptr<HloModule> module,
+    const tensorflow::gtl::ArraySlice<const Literal*> arguments,
+    bool run_hlo_passes, ExecutionProfile* profile) {
+  TF_ASSIGN_OR_RETURN(std::vector<ScopedShapedBuffer> argument_buffers,
+                      TransferLiteralsToDevice(arguments));
+  TF_ASSIGN_OR_RETURN(ScopedShapedBuffer result,
+                      ExecuteWithDeviceBuffers(
+                          /*module=*/std::move(module),
+                          /*arguments=*/argument_buffers,
+                          /*run_hlo_passes=*/run_hlo_passes,
+                          /*profile=*/profile));
+  return TransferLiteralFromDevice(result);
 }
 
 StatusOr<std::unique_ptr<Literal>> HloRunner::Execute(
@@ -146,11 +151,49 @@ StatusOr<std::unique_ptr<Literal>> HloRunner::Execute(
     const tensorflow::gtl::ArraySlice<std::unique_ptr<Literal>> arguments,
     bool run_hlo_passes, ExecutionProfile* profile) {
   // Construct a vector of plain pointers for the arguments.
-  std::vector<Literal*> argument_pointers;
-  c_transform(
-      arguments, std::back_inserter(argument_pointers),
-      [](const std::unique_ptr<Literal>& literal) { return literal.get(); });
-  return Execute(std::move(module), argument_pointers, run_hlo_passes, profile);
+  std::vector<const Literal*> argument_pointers;
+  argument_pointers.reserve(arguments.size());
+  for (const auto& argument : arguments) {
+    argument_pointers.push_back(argument.get());
+  }
+  return Execute(
+      /*module=*/std::move(module),
+      /*arguments=*/argument_pointers,
+      /*run_hlo_passes=*/run_hlo_passes,
+      /*profile=*/profile);
+}
+
+StatusOr<ScopedShapedBuffer> HloRunner::ExecuteWithDeviceBuffers(
+    std::unique_ptr<HloModule> module,
+    const tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
+    bool run_hlo_passes, ExecutionProfile* profile) {
+  // Get service run options.
+  se::Stream stream(backend().default_stream_executor());
+  stream.Init();
+  ServiceExecutableRunOptions service_run_options =
+      GetServiceRunOptionsForDevice(backend().default_device_ordinal(), &stream,
+                                    nullptr);
+
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<Executable> executable,
+                      CreateExecutable(std::move(module), run_hlo_passes));
+  return executable->ExecuteOnStreamWrapper(&service_run_options,
+                                            /*profile=*/profile, arguments);
+}
+
+StatusOr<ScopedShapedBuffer> HloRunner::ExecuteWithDeviceBuffers(
+    std::unique_ptr<HloModule> module,
+    const tensorflow::gtl::ArraySlice<ScopedShapedBuffer> arguments,
+    bool run_hlo_passes, ExecutionProfile* profile) {
+  std::vector<const ShapedBuffer*> argument_pointers;
+  argument_pointers.reserve(arguments.size());
+  for (const auto& argument : arguments) {
+    argument_pointers.push_back(&argument);
+  }
+  return ExecuteWithDeviceBuffers(
+      /*module=*/std::move(module),
+      /*arguments=*/argument_pointers,
+      /*run_hlo_passes=*/run_hlo_passes,
+      /*profile=*/profile);
 }
 
 StatusOr<std::vector<std::unique_ptr<Literal>>> HloRunner::ExecuteReplicated(
diff --git a/tensorflow/compiler/xla/service/hlo_runner.h b/tensorflow/compiler/xla/service/hlo_runner.h
index aa62659ac3..65537f07f5 100644
--- a/tensorflow/compiler/xla/service/hlo_runner.h
+++ b/tensorflow/compiler/xla/service/hlo_runner.h
@@ -102,6 +102,15 @@ class HloRunner {
   static StatusOr<std::unique_ptr<HloModule>> ReadModuleFromHloTextFile(
       const std::string& filename, const DebugOptions& debug_options);
 
+  // Transfers data between the host and device.
+  StatusOr<ScopedShapedBuffer> TransferLiteralToDevice(const Literal& literal);
+  StatusOr<std::vector<ScopedShapedBuffer>> TransferLiteralsToDevice(
+      const tensorflow::gtl::ArraySlice<const Literal*> literals);
+  StatusOr<std::vector<ScopedShapedBuffer>> TransferLiteralsToDevice(
+      const tensorflow::gtl::ArraySlice<std::unique_ptr<Literal>> literals);
+  StatusOr<std::unique_ptr<Literal>> TransferLiteralFromDevice(
+      const ShapedBuffer& buffer);
+
   // Executes the given module with given literals as input and returns the
   // result as a Literal.
   //
@@ -109,7 +118,7 @@ class HloRunner {
   // optimization.
   StatusOr<std::unique_ptr<Literal>> Execute(
       std::unique_ptr<HloModule> module,
-      const tensorflow::gtl::ArraySlice<Literal*> arguments,
+      const tensorflow::gtl::ArraySlice<const Literal*> arguments,
       bool run_hlo_passes = true, ExecutionProfile* profile = nullptr);
 
   StatusOr<std::unique_ptr<Literal>> Execute(
@@ -117,6 +126,18 @@ class HloRunner {
       const tensorflow::gtl::ArraySlice<std::unique_ptr<Literal>> arguments,
       bool run_hlo_passes = true, ExecutionProfile* profile = nullptr);
 
+  // As Execute(), but accepts and returns device buffers instead of host
+  // buffers.
+  StatusOr<ScopedShapedBuffer> ExecuteWithDeviceBuffers(
+      std::unique_ptr<HloModule> module,
+      const tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
+      bool run_hlo_passes = true, ExecutionProfile* profile = nullptr);
+
+  StatusOr<ScopedShapedBuffer> ExecuteWithDeviceBuffers(
+      std::unique_ptr<HloModule> module,
+      const tensorflow::gtl::ArraySlice<ScopedShapedBuffer> arguments,
+      bool run_hlo_passes = true, ExecutionProfile* profile = nullptr);
+
   // Executes a given HLO module into a set of replicas, and returns a map
   // with the replica number as key, and the corresponding returned literal as
   // value.
diff --git a/tensorflow/compiler/xla/tests/test_utils.cc b/tensorflow/compiler/xla/tests/test_utils.cc
index de18651388..dd7c541733 100644
--- a/tensorflow/compiler/xla/tests/test_utils.cc
+++ b/tensorflow/compiler/xla/tests/test_utils.cc
@@ -26,6 +26,7 @@ namespace {
 template <typename FloatT, typename GeneratorT>
 void PopulateWithRandomFloatingPointDataImpl(Literal* literal,
                                              std::minstd_rand0* engine) {
+  CHECK(engine != nullptr);
   CHECK_EQ(literal->shape().element_type(),
            primitive_util::NativeToPrimitiveType<FloatT>());
   // Create uniform numbers between 1 and 1.125 to avoid creating denormal
@@ -59,12 +60,14 @@ void PopulateWithRandomFloatingPointDataImpl(Literal* literal,
 template <typename FloatT>
 void PopulateWithRandomFloatingPointData(Literal* literal,
                                          std::minstd_rand0* engine) {
+  CHECK(engine != nullptr);
   PopulateWithRandomFloatingPointDataImpl<FloatT, FloatT>(literal, engine);
 }
 
 template <>
 void PopulateWithRandomFloatingPointData<half>(Literal* literal,
                                                std::minstd_rand0* engine) {
+  CHECK(engine != nullptr);
   PopulateWithRandomFloatingPointDataImpl<half, float>(literal, engine);
 }
 
@@ -73,6 +76,7 @@ void PopulateWithRandomFloatingPointData<half>(Literal* literal,
 template <>
 void PopulateWithRandomFloatingPointData<bfloat16>(Literal* literal,
                                                    std::minstd_rand0* engine) {
+  CHECK(engine != nullptr);
   CHECK_EQ(literal->shape().element_type(), BF16);
   std::uniform_real_distribution<float> generator(-0.9f, 1.0f);
   TF_CHECK_OK(literal->Populate<bfloat16>(
@@ -84,6 +88,7 @@ void PopulateWithRandomFloatingPointData<bfloat16>(Literal* literal,
 template <typename IntT>
 void PopulateWithRandomIntegralData(Literal* literal,
                                     std::minstd_rand0* engine) {
+  CHECK(engine != nullptr);
   CHECK_EQ(literal->shape().element_type(),
            primitive_util::NativeToPrimitiveType<IntT>());
   std::uniform_int_distribution<IntT> generator(
@@ -107,6 +112,9 @@ StatusOr<std::unique_ptr<Literal>> MakeFakeLiteralInternal(
     }
     return Literal::MakeTupleOwned(std::move(elements));
   }
+  if (engine == nullptr) {
+    return Literal::CreateFromShape(shape);
+  }
   auto literal = MakeUnique<Literal>(shape);
   switch (shape.element_type()) {
     case BF16:
@@ -201,11 +209,13 @@ std::unique_ptr<Literal> MakeRandomNonwrappingSliceIndex(
     std::minstd_rand0* engine) {
   const int64 rank = ShapeUtil::Rank(input_shape);
   std::vector<int32> start_indices(rank);
-  for (int i = 0; i < rank; ++i) {
-    const int32 upper_bound = ShapeUtil::GetDimension(input_shape, i) -
-                              ShapeUtil::GetDimension(slice_shape, i);
-    std::uniform_int_distribution<int32> generator(0, upper_bound);
-    start_indices[i] = generator(*engine);
+  if (engine != nullptr) {
+    for (int i = 0; i < rank; ++i) {
+      const int32 upper_bound = ShapeUtil::GetDimension(input_shape, i) -
+                                ShapeUtil::GetDimension(slice_shape, i);
+      std::uniform_int_distribution<int32> generator(0, upper_bound);
+      start_indices[i] = generator(*engine);
+    }
   }
   return Literal::CreateR1<int32>(start_indices);
 }
@@ -321,20 +331,21 @@ StatusOr<std::unique_ptr<Literal>> MakeConstrainedArgument(
 
 }  // namespace
 
-StatusOr<std::unique_ptr<Literal>> MakeFakeLiteral(const Shape& shape) {
-  std::minstd_rand0 engine;
-  return MakeFakeLiteralInternal(shape, &engine);
+StatusOr<std::unique_ptr<Literal>> MakeFakeLiteral(const Shape& shape,
+                                                   bool pseudo_random) {
+  auto engine = pseudo_random ? MakeUnique<std::minstd_rand0>() : nullptr;
+  return MakeFakeLiteralInternal(shape, engine.get());
 }
 
 StatusOr<std::vector<std::unique_ptr<Literal>>> MakeFakeArguments(
-    HloModule* const module) {
+    HloModule* const module, bool pseudo_random) {
   TF_ASSIGN_OR_RETURN(auto dataflow, HloDataflowAnalysis::Run(*module));
   const auto params = module->entry_computation()->parameter_instructions();
-  std::minstd_rand0 engine;
+  auto engine = pseudo_random ? MakeUnique<std::minstd_rand0>() : nullptr;
   std::vector<std::unique_ptr<Literal>> arguments(params.size());
   for (int i = 0; i < params.size(); ++i) {
-    TF_ASSIGN_OR_RETURN(
-        arguments[i], MakeConstrainedArgument(*dataflow, *params[i], &engine));
+    TF_ASSIGN_OR_RETURN(arguments[i], MakeConstrainedArgument(
+                                          *dataflow, *params[i], engine.get()));
   }
   return std::move(arguments);
 }
diff --git a/tensorflow/compiler/xla/tests/test_utils.h b/tensorflow/compiler/xla/tests/test_utils.h
index f483cdebea..a8689f6498 100644
--- a/tensorflow/compiler/xla/tests/test_utils.h
+++ b/tensorflow/compiler/xla/tests/test_utils.h
@@ -55,16 +55,28 @@ class PseudorandomGenerator {
 };
 
 // Generates fake data in a literal of the given shape, or returns an error
-// status if the element type is currently unhandled for fake data generation.
-StatusOr<std::unique_ptr<Literal>> MakeFakeLiteral(const Shape& shape);
+// status if the element type is currently unhandled for fake data
+// generation. See below for documentation of pseudo_random.
+StatusOr<std::unique_ptr<Literal>> MakeFakeLiteral(const Shape& shape,
+                                                   bool pseudo_random = true);
 
 // Generates a vector of arguments containing fake data. The number, shape and
 // layout of the arguments is appropriate for given HLO module.
 //
 // Will handle special cases such as making sure that indices used for dynamic
 // slices are bounded, reduces that call adds use 0 as an init value, etc.
+//
+// If pseudo_random is true, the generated numbers will be generated
+// deterministically in a pseudo random way unless the values are constrained to
+// be e.g. init values as above. If pseudo_random is false, the returned values
+// will be generated in a faster way that yields less interesting data, e.g. the
+// values may all be just the same value.
+//
+// TODO(b/79942829): Make interesting argument generation fast enough that
+// setting pseudo_random to false does not save any noticeable amount of time,
+// so that the parameter can be removed.
 StatusOr<std::vector<std::unique_ptr<Literal>>> MakeFakeArguments(
-    HloModule* const module);
+    HloModule* const module, bool pseudo_random = true);
 
 // Check that a given module satisfies various constraints before trying to
 // execute it.
-- 
GitLab


From 9c509eedc3888d3846b2ab5ac2879268df9ff8cd Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 29 May 2018 21:24:36 -0700
Subject: [PATCH 040/610] Introduced kDomain HLO instruction set isolation to
 bound connected sets of instructions with similar attributes (ie, sharding).
 This CL simply adds the infrastructure, but leaves the wire-on to a separate
 CL.

PiperOrigin-RevId: 198503625
---
 tensorflow/compiler/xla/service/BUILD         |  77 ++++
 .../compiler/xla/service/dfs_hlo_visitor.h    |   4 +
 .../compiler/xla/service/hlo_clone_context.h  |  97 ++++
 .../compiler/xla/service/hlo_computation.cc   |  47 +-
 .../compiler/xla/service/hlo_computation.h    |  21 +-
 tensorflow/compiler/xla/service/hlo_cse.cc    |  23 +-
 .../compiler/xla/service/hlo_cse_test.cc      |  67 ++-
 .../xla/service/hlo_domain_isolator.cc        | 104 +++++
 .../xla/service/hlo_domain_isolator.h         |  56 +++
 .../compiler/xla/service/hlo_domain_map.cc    | 168 +++++++
 .../compiler/xla/service/hlo_domain_map.h     | 108 +++++
 .../xla/service/hlo_domain_metadata.h         |  83 ++++
 .../xla/service/hlo_domain_remover.cc         | 149 ++++++
 .../compiler/xla/service/hlo_domain_remover.h |  56 +++
 .../compiler/xla/service/hlo_domain_test.cc   | 432 ++++++++++++++++++
 .../xla/service/hlo_element_type_converter.cc |  11 +-
 .../compiler/xla/service/hlo_evaluator.cc     |   3 +-
 .../compiler/xla/service/hlo_graph_dumper.cc  |   1 +
 .../compiler/xla/service/hlo_instruction.cc   |  87 +++-
 .../compiler/xla/service/hlo_instruction.h    |  58 ++-
 .../xla/service/hlo_instruction_test.cc       |  48 ++
 tensorflow/compiler/xla/service/hlo_module.cc |  74 ++-
 tensorflow/compiler/xla/service/hlo_module.h  |   7 +-
 .../xla/service/hlo_module_group_metadata.cc  |  78 +++-
 .../xla/service/hlo_module_group_metadata.h   |  12 +
 tensorflow/compiler/xla/service/hlo_opcode.h  |   1 +
 .../compiler/xla/service/hlo_sharding.cc      |  25 +-
 .../compiler/xla/service/hlo_sharding.h       |  14 +-
 .../xla/service/hlo_sharding_metadata.cc      | 401 ++++++++++++++++
 .../xla/service/hlo_sharding_metadata.h       |  67 +++
 .../compiler/xla/service/hlo_verifier.cc      |   1 +
 .../xla/service/instruction_fusion.cc         |   1 +
 .../xla/service/logical_buffer_analysis.cc    |   6 +
 .../xla/service/logical_buffer_analysis.h     |   1 +
 .../compiler/xla/service/shape_inference.cc   |   3 +-
 .../xla/service/tuple_points_to_analysis.cc   |   8 +
 .../xla/service/tuple_points_to_analysis.h    |   1 +
 tensorflow/compiler/xla/shape_tree.h          |   3 +
 tensorflow/compiler/xla/shape_util.cc         |  21 +
 tensorflow/compiler/xla/shape_util.h          |  17 +
 .../compiler/xla/tools/parser/hlo_parser.cc   |   1 +
 41 files changed, 2252 insertions(+), 190 deletions(-)
 create mode 100644 tensorflow/compiler/xla/service/hlo_clone_context.h
 create mode 100644 tensorflow/compiler/xla/service/hlo_domain_isolator.cc
 create mode 100644 tensorflow/compiler/xla/service/hlo_domain_isolator.h
 create mode 100644 tensorflow/compiler/xla/service/hlo_domain_map.cc
 create mode 100644 tensorflow/compiler/xla/service/hlo_domain_map.h
 create mode 100644 tensorflow/compiler/xla/service/hlo_domain_metadata.h
 create mode 100644 tensorflow/compiler/xla/service/hlo_domain_remover.cc
 create mode 100644 tensorflow/compiler/xla/service/hlo_domain_remover.h
 create mode 100644 tensorflow/compiler/xla/service/hlo_domain_test.cc
 create mode 100644 tensorflow/compiler/xla/service/hlo_sharding_metadata.cc
 create mode 100644 tensorflow/compiler/xla/service/hlo_sharding_metadata.h

diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index 5472f9a637..7e4a75a6e3 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -273,7 +273,9 @@ cc_library(
     hdrs = [
         "dfs_hlo_visitor.h",
         "dfs_hlo_visitor_with_default.h",
+        "hlo_clone_context.h",
         "hlo_computation.h",
+        "hlo_domain_metadata.h",
         "hlo_instruction.h",
         "hlo_module.h",
         "hlo_opcode.h",
@@ -415,6 +417,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/compiler/xla/tools/parser:hlo_parser",
     ],
 )
 
@@ -2339,6 +2342,7 @@ cc_library(
     hdrs = ["hlo_cse.h"],
     deps = [
         ":hlo",
+        ":hlo_domain_map",
         ":hlo_pass",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_util",
@@ -2403,6 +2407,79 @@ tf_cc_test(
     ],
 )
 
+cc_library(
+    name = "hlo_domain_map",
+    srcs = ["hlo_domain_map.cc"],
+    hdrs = ["hlo_domain_map.h"],
+    deps = [
+        ":hlo",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/core:lib",
+    ],
+)
+
+cc_library(
+    name = "hlo_sharding_metadata",
+    srcs = ["hlo_sharding_metadata.cc"],
+    hdrs = [
+        "hlo_sharding_metadata.h",
+    ],
+    deps = [
+        ":hlo",
+        "//tensorflow/compiler/xla:shape_tree",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/core:lib",
+    ],
+)
+
+cc_library(
+    name = "hlo_domain_isolator",
+    srcs = ["hlo_domain_isolator.cc"],
+    hdrs = ["hlo_domain_isolator.h"],
+    deps = [
+        ":hlo",
+        ":hlo_graph_dumper",
+        ":hlo_pass",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:util",
+    ],
+)
+
+cc_library(
+    name = "hlo_domain_remover",
+    srcs = ["hlo_domain_remover.cc"],
+    hdrs = ["hlo_domain_remover.h"],
+    deps = [
+        ":hlo",
+        ":hlo_domain_isolator",
+        ":hlo_domain_map",
+        ":hlo_graph_dumper",
+        ":hlo_pass",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/core:lib",
+    ],
+)
+
+tf_cc_test(
+    name = "hlo_domain_test",
+    srcs = ["hlo_domain_test.cc"],
+    deps = [
+        ":hlo",
+        ":hlo_domain_isolator",
+        ":hlo_domain_remover",
+        ":hlo_sharding_metadata",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/compiler/xla/tools/parser:hlo_parser",
+        "//tensorflow/core:test",
+    ],
+)
+
 cc_library(
     name = "hlo_element_type_converter",
     srcs = ["hlo_element_type_converter.cc"],
diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
index b9d7ec9c2e..64678d9d74 100644
--- a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
+++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
@@ -197,6 +197,10 @@ class DfsHloVisitorBase {
     return HandleElementwiseUnary(hlo);
   }
 
+  virtual Status HandleDomain(HloInstructionPtr hlo) {
+    return HandleElementwiseUnary(hlo);
+  }
+
   virtual Status HandleInfeed(HloInstructionPtr hlo) = 0;
   virtual Status HandleOutfeed(HloInstructionPtr hlo) = 0;
   virtual Status HandleHostCompute(HloInstructionPtr hlo) = 0;
diff --git a/tensorflow/compiler/xla/service/hlo_clone_context.h b/tensorflow/compiler/xla/service/hlo_clone_context.h
new file mode 100644
index 0000000000..658643b427
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_clone_context.h
@@ -0,0 +1,97 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_CLONE_CONTEXT_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_CLONE_CONTEXT_H_
+
+#include <string>
+
+#include "tensorflow/compiler/xla/map_util.h"
+#include "tensorflow/core/lib/gtl/flatmap.h"
+
+namespace xla {
+
+class HloInstruction;
+class HloComputation;
+class HloModule;
+
+// Data structure used to track the cloning of HloInstruction and HloComputation
+// objects.
+class HloCloneContext {
+ public:
+  // Creates a new HloCloneContext object to clone HloInstruction and
+  // HloComputation objects to be added to the module specified as argument.
+  // The suffix string will be appended to computation names.
+  explicit HloCloneContext(HloModule* module, const string& suffix = "")
+      : module_(module), suffix_(suffix) {}
+
+  HloModule* module() const { return module_; }
+
+  const string& suffix() const { return suffix_; }
+
+  void MapInstruction(const HloInstruction* old_instruction,
+                      HloInstruction* new_instruction) {
+    instructions_[old_instruction] = new_instruction;
+  }
+
+  void MapComputation(const HloComputation* old_computation,
+                      HloComputation* new_computation) {
+    computations_[old_computation] = new_computation;
+  }
+
+  // Finds the new instruction mapped to its old copy, or returns nullptr if it
+  // is not found.
+  HloInstruction* FindInstruction(const HloInstruction* old_instruction) const {
+    return FindOrDefault(instructions_, old_instruction, nullptr);
+  }
+
+  // Finds the new computation mapped to its old copy, or returns nullptr if it
+  // is not found.
+  HloComputation* FindComputation(const HloComputation* old_computation) const {
+    return FindOrDefault(computations_, old_computation, nullptr);
+  }
+
+  // Retrieves the new instruction mapped to its old copy, or fail if not found.
+  HloInstruction* GetInstruction(const HloInstruction* old_instruction) const {
+    return FindOrDie(instructions_, old_instruction);
+  }
+
+  // Retrieves the new computation mapped to its old copy, or fail if not found.
+  HloComputation* GetComputation(const HloComputation* old_computation) const {
+    return FindOrDie(computations_, old_computation);
+  }
+
+  const tensorflow::gtl::FlatMap<const HloInstruction*, HloInstruction*>&
+  cloned_instructions() const {
+    return instructions_;
+  }
+
+  const tensorflow::gtl::FlatMap<const HloComputation*, HloComputation*>&
+  cloned_computations() const {
+    return computations_;
+  }
+
+ private:
+  HloModule* module_;
+  string suffix_;
+  tensorflow::gtl::FlatMap<const HloInstruction*, HloInstruction*>
+      instructions_;
+  tensorflow::gtl::FlatMap<const HloComputation*, HloComputation*>
+      computations_;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_CLONE_CONTEXT_H_
diff --git a/tensorflow/compiler/xla/service/hlo_computation.cc b/tensorflow/compiler/xla/service/hlo_computation.cc
index 63c3dc4a59..b61eabbbf5 100644
--- a/tensorflow/compiler/xla/service/hlo_computation.cc
+++ b/tensorflow/compiler/xla/service/hlo_computation.cc
@@ -752,22 +752,21 @@ Status HloComputation::Accept(
 }
 
 std::unique_ptr<HloComputation> HloComputation::Clone(
-    const string& suffix, HloModule* module,
-    HloInstruction::CloneMap* clone_map) {
+    const string& suffix, HloCloneContext* context) {
   return CloneWithReplacements(
       /*replacements=*/std::unordered_map<const HloInstruction*,
                                           std::unique_ptr<HloInstruction>>(),
-      module, clone_map, suffix);
+      context, suffix);
 }
 
 std::unique_ptr<HloComputation> HloComputation::CloneWithReplacements(
     std::unordered_map<const HloInstruction*, std::unique_ptr<HloInstruction>>
         replacements,
-    HloModule* module, HloInstruction::CloneMap* clone_map,
-    const string& suffix) {
-  HloInstruction::CloneMap local_clone_map;
-  if (clone_map == nullptr) {
-    clone_map = &local_clone_map;
+    HloCloneContext* context, const string& suffix) {
+  std::unique_ptr<HloCloneContext> context_ptr;
+  if (context == nullptr) {
+    context_ptr = MakeUnique<HloCloneContext>(parent(), suffix);
+    context = context_ptr.get();
   }
 
   // Look up instr in the replacements map, and return either the replacement,
@@ -792,18 +791,18 @@ std::unique_ptr<HloComputation> HloComputation::CloneWithReplacements(
   }
 
   std::vector<std::unique_ptr<HloInstruction>> instructions;
-  std::unique_ptr<HloInstruction> new_instr = nullptr;
+  std::unique_ptr<HloInstruction> new_instr;
   for (auto instr : postorder) {
     std::vector<HloInstruction*> new_operands;
     for (auto operand : instr->operands()) {
       auto replaced_operand = replace(operand);
       CHECK_NE(replaced_operand, nullptr)
-          << "Replacements map specifies to leave out " << operand->ToString()
-          << ", but it is used by " << instr->ToString() << ".";
-      new_operands.push_back(FindOrDie(*clone_map, replaced_operand));
+          << "replacements map tried to eliminate a used instruction "
+          << operand->ToString() << ", used by " << instr->ToString();
+      new_operands.push_back(context->GetInstruction(replaced_operand));
     }
-    new_instr = instr->CloneWithNewOperands(instr->shape(), new_operands,
-                                            module, clone_map);
+    new_instr =
+        instr->CloneWithNewOperands(instr->shape(), new_operands, context);
     instructions.push_back(std::move(new_instr));
   }
   Builder builder(name() + "." + suffix);
@@ -811,22 +810,23 @@ std::unique_ptr<HloComputation> HloComputation::CloneWithReplacements(
     builder.AddInstruction(std::move(instr));
   }
   auto result = builder.Build(
-      /*root_instruction=*/FindOrDie(*clone_map, replace(root_instruction())));
+      /*root_instruction=*/context->GetInstruction(
+          replace(root_instruction())));
 
   // Clone control dependencies.
   for (auto instr : postorder) {
-    HloInstruction* new_instr = FindOrDie(*clone_map, instr);
+    HloInstruction* new_instr = context->GetInstruction(instr);
     for (auto successor : instr->control_successors()) {
       auto replaced_successor = replace(successor);
-      CHECK_NE(replaced_successor, nullptr)
-          << "Replacements map specifies to leave out " << successor->ToString()
-          << ", but it is control-depended-on by " << instr->ToString() << ".";
-
-      TF_CHECK_OK(new_instr->AddControlDependencyTo(
-          FindOrDie(*clone_map, replaced_successor)));
+      // successor may not have been remapped, because it might have been
+      // removed by the replacements map.
+      if (replaced_successor != nullptr) {
+        TF_CHECK_OK(new_instr->AddControlDependencyTo(
+            context->GetInstruction(replaced_successor)));
+      }
     }
   }
-
+  context->MapComputation(this, result.get());
   // We cloned the elements of 'replacements', so they're all going to be
   // destroyed. HloInstructions need to be detached from their operands before
   // they're destroyed, otherwise they stick around in the operands' users lists
@@ -836,7 +836,6 @@ std::unique_ptr<HloComputation> HloComputation::CloneWithReplacements(
       new_instr->DetachFromOperands();
     }
   }
-
   return result;
 }
 
diff --git a/tensorflow/compiler/xla/service/hlo_computation.h b/tensorflow/compiler/xla/service/hlo_computation.h
index 8bc97df036..0da4a305f3 100644
--- a/tensorflow/compiler/xla/service/hlo_computation.h
+++ b/tensorflow/compiler/xla/service/hlo_computation.h
@@ -29,6 +29,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor.h"
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
 #include "tensorflow/compiler/xla/service/hlo.pb.h"
+#include "tensorflow/compiler/xla/service/hlo_clone_context.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_reachability.h"
 #include "tensorflow/compiler/xla/service/name_uniquer.h"
@@ -300,17 +301,11 @@ class HloComputation {
       const std::function<Status(const HloInstruction*)>& visitor_func) const;
 
   // Returns a deep copy of this computation including all instructions.
-  //
-  // If the module pointer is not nullptr, then the cloned computations will be
-  // added to this module in order to support deep cloning. Otherwise the module
-  // of the computation is used.
-  //
-  // If clone_map is not nullptr, then each original instruction that is cloned
-  // will be inserted and map to its clone. clone_map should not already contain
-  // any of the instructions to clone.
-  std::unique_ptr<HloComputation> Clone(
-      const string& suffix = "clone", HloModule* module = nullptr,
-      HloInstruction::CloneMap* clone_map = nullptr);
+  // If the clone context is specified, it will be populated with the cloned
+  // object mappings, and its module() will be used to add new computations
+  // into.
+  std::unique_ptr<HloComputation> Clone(const string& suffix = "clone",
+                                        HloCloneContext* context = nullptr);
 
   // Like Clone(), but if an instruction is present in replacement_map, we use
   // the map's value to replace that instruction in the cloned computation.
@@ -320,9 +315,7 @@ class HloComputation {
   std::unique_ptr<HloComputation> CloneWithReplacements(
       std::unordered_map<const HloInstruction*, std::unique_ptr<HloInstruction>>
           replacements,
-      HloModule* module = nullptr,
-      HloInstruction::CloneMap* clone_map = nullptr,
-      const string& suffix = "clone");
+      HloCloneContext* context = nullptr, const string& suffix = "clone");
 
   // Returns true if the given instruction can be removed from the computation.
   // Parameter instructions cannot be removed without violating invariants of
diff --git a/tensorflow/compiler/xla/service/hlo_cse.cc b/tensorflow/compiler/xla/service/hlo_cse.cc
index c17c26c5a4..dab946a099 100644
--- a/tensorflow/compiler/xla/service/hlo_cse.cc
+++ b/tensorflow/compiler/xla/service/hlo_cse.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_domain_map.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -41,16 +42,16 @@ namespace {
 
 // Find and combine identical constants. Constants are identical if they have
 // the same type and value.
-bool CombineConstants(HloComputation* computation, bool is_layout_sensitive) {
-  bool changed = false;
-
+StatusOr<bool> CombineConstants(HloComputation* computation,
+                                bool is_layout_sensitive) {
+  TF_ASSIGN_OR_RETURN(auto domain_map, HloDomainMap::Create(computation, ""));
   // Map from ShortDebugString of the layoutless shape of the constant to the
   // set of constant instructions with that shape. Layoutless shape is used to
   // bin possible common constants together to reduce number of constant
   // comparisons. If we end up having too many constant comparisons, a more
   // precise binning might have to be used.
   std::multimap<string, HloInstruction*> constants;
-
+  int64 combined = 0;
   auto inst_it = computation->instructions().begin();
   while (inst_it != computation->instructions().end()) {
     HloInstruction* instruction = *inst_it;
@@ -70,7 +71,8 @@ bool CombineConstants(HloComputation* computation, bool is_layout_sensitive) {
       auto range = constants.equal_range(shape_string);
       HloInstruction* match = nullptr;
       for (auto it = range.first; it != range.second; ++it) {
-        if (instruction->literal() == it->second->literal()) {
+        if (instruction->literal() == it->second->literal() &&
+            domain_map->InSameDomain(it->second, instruction)) {
           match = it->second;
           break;
         }
@@ -81,12 +83,13 @@ bool CombineConstants(HloComputation* computation, bool is_layout_sensitive) {
         // Match found, replace this instruction with the one in the multimap.
         TF_CHECK_OK(instruction->ReplaceAllUsesWith(match));
         TF_CHECK_OK(computation->RemoveInstruction(instruction));
-        changed = true;
+        ++combined;
       }
     }
   }
-
-  return changed;
+  VLOG(4) << "Combined " << combined << " constants in " << computation->name()
+          << " computation";
+  return combined > 0;
 }
 
 // An instruction is considered to be equivalent to another only if they
@@ -123,7 +126,9 @@ StatusOr<bool> HloCSE::Run(HloModule* module) {
       continue;
     }
 
-    changed |= CombineConstants(computation, is_layout_sensitive_);
+    TF_ASSIGN_OR_RETURN(bool combined,
+                        CombineConstants(computation, is_layout_sensitive_));
+    changed |= combined;
 
     // HLO instructions are grouped into equivalency classes by using the
     // cse_equal predicate defined above. This set holds a representative
diff --git a/tensorflow/compiler/xla/service/hlo_cse_test.cc b/tensorflow/compiler/xla/service/hlo_cse_test.cc
index 9735764b69..e8c5ca347b 100644
--- a/tensorflow/compiler/xla/service/hlo_cse_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_cse_test.cc
@@ -142,31 +142,46 @@ TEST_F(HloCseTest, ConstantsSameValueDifferentType) {
   // Test that constants with the same value but different type are *not*
   // commoned.
   auto builder = HloComputation::Builder(TestName());
-  builder.AddInstruction(
-      HloInstruction::CreateConstant(Literal::CreateR0<uint32>(42)));
-  builder.AddInstruction(
-      HloInstruction::CreateConstant(Literal::CreateR0<int32>(42)));
-  builder.AddInstruction(
-      HloInstruction::CreateConstant(Literal::CreateR0<uint64>(42.0)));
-  builder.AddInstruction(
-      HloInstruction::CreateConstant(Literal::CreateR0<int64>(42.0)));
-  builder.AddInstruction(
-      HloInstruction::CreateConstant(Literal::CreateR0<double>(42.0)));
-  builder.AddInstruction(
-      HloInstruction::CreateConstant(Literal::CreateR0<float>(42.0f)));
+  std::vector<HloInstruction*> constants;
+  constants.push_back(builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<uint32>(42))));
+  constants.push_back(builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<int32>(42))));
+  constants.push_back(builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<uint64>(42.0))));
+  constants.push_back(builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<int64>(42.0))));
+  constants.push_back(builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<double>(42.0))));
+  constants.push_back(builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(42.0f))));
   // Duplicate the float constant to verify something happens.
-  builder.AddInstruction(
-      HloInstruction::CreateConstant(Literal::CreateR0<float>(42.0f)));
+  constants.push_back(builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(42.0f))));
+
+  const Shape shape_r0 = ShapeUtil::MakeShape(F32, {});
+  for (int64 i = 0; i < constants.size(); ++i) {
+    constants[i] = builder.AddInstruction(
+        HloInstruction::CreateConvert(shape_r0, constants[i]));
+  }
+  HloInstruction* root = builder.AddInstruction(HloInstruction::CreateBinary(
+      shape_r0, HloOpcode::kAdd, constants[0], constants[1]));
+  for (int64 i = 2; i < constants.size(); ++i) {
+    root = builder.AddInstruction(HloInstruction::CreateBinary(
+        shape_r0, HloOpcode::kAdd, root, constants[i]));
+  }
 
   auto module = CreateNewModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_EQ(7, computation->instruction_count());
+  EXPECT_EQ(20, computation->instruction_count());
 
   HloCSE cse(/*is_layout_sensitive=*/false);
   EXPECT_TRUE(cse.Run(module.get()).ValueOrDie());
 
-  EXPECT_EQ(6, computation->instruction_count());
+  // CSE will remove both the second float(42.0f) and the corresponding
+  // convert/cast.
+  EXPECT_EQ(18, computation->instruction_count());
 }
 
 TEST_F(HloCseTest, NonscalarConstants) {
@@ -501,5 +516,25 @@ TEST_F(HloCseTest, CompareComputations) {
   EXPECT_EQ(root->operand(0), root->operand(1));
 }
 
+TEST_F(HloCseTest, ConstantsSameValueInDifferentDomains) {
+  // Test that constants with the same value but in different domains (disjoint
+  // in this case) are not collapsed.
+  auto builder = HloComputation::Builder(TestName());
+  builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<uint32>(42)));
+  builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<uint32>(42)));
+
+  auto module = CreateNewModule();
+  auto computation = module->AddEntryComputation(builder.Build());
+
+  EXPECT_EQ(2, computation->instruction_count());
+
+  HloCSE cse(/*is_layout_sensitive=*/false);
+  EXPECT_FALSE(cse.Run(module.get()).ValueOrDie());
+
+  EXPECT_EQ(2, computation->instruction_count());
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_domain_isolator.cc b/tensorflow/compiler/xla/service/hlo_domain_isolator.cc
new file mode 100644
index 0000000000..78955db0da
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_domain_isolator.cc
@@ -0,0 +1,104 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/hlo_domain_isolator.h"
+
+#include "tensorflow/compiler/xla/map_util.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_graph_dumper.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/types.h"
+
+namespace xla {
+
+class HloDomainIsolator::RunContext {
+ public:
+  RunContext(HloModule* module, HloDomainIsolator* isolator)
+      : module_(module), isolator_(isolator) {}
+
+  StatusOr<bool> Run();
+
+ private:
+  // Inserts a kDomain instruction between parent and operand, in case
+  // the attribute (ie, sharding) values change between instruction and operand.
+  // Returns the newly inserted kDomain instruction, or nullptr if no kDomain
+  // instruction was necessary.
+  StatusOr<HloInstruction*> CreateDomain(HloInstruction* instruction,
+                                         HloInstruction* parent,
+                                         HloInstruction* operand);
+
+  HloModule* module_;
+  HloDomainIsolator* isolator_;
+};
+
+StatusOr<HloInstruction*> HloDomainIsolator::RunContext::CreateDomain(
+    HloInstruction* instruction, HloInstruction* parent,
+    HloInstruction* operand) {
+  HloInstruction* domain = nullptr;
+  std::unique_ptr<HloInstruction> domain_instruction =
+      isolator_->creator_(instruction, operand);
+  if (domain_instruction != nullptr) {
+    domain = operand->parent()->AddInstruction(std::move(domain_instruction));
+    TF_RETURN_IF_ERROR(operand->ReplaceUseWith(parent, domain));
+  }
+  return domain;
+}
+
+StatusOr<bool> HloDomainIsolator::RunContext::Run() {
+  hlo_graph_dumper::MaybeDumpHloModule(*module_, "Before Domain Isolator");
+
+  int64 added_domains = 0;
+  for (HloComputation* computation : module_->computations()) {
+    // Walk in post order and place all the required kDomain instructions.
+    for (HloInstruction* instruction :
+         computation->MakeInstructionPostOrder()) {
+      if (instruction->opcode() == HloOpcode::kDomain) {
+        continue;
+      }
+      for (HloInstruction* operand : instruction->unique_operands()) {
+        // When applying multiple domains, we could end up stacking more than
+        // one in one edge, so here we want to build the effective
+        // (kDomain-less) instruction->operand edge.
+        HloInstruction* parent = instruction;
+        while (operand->opcode() == HloOpcode::kDomain) {
+          parent = operand;
+          operand = operand->mutable_operand(0);
+        }
+        // Check whether a kDomain is necessary between instruction and operand.
+        TF_ASSIGN_OR_RETURN(HloInstruction * domain,
+                            CreateDomain(instruction, parent, operand));
+        if (domain != nullptr) {
+          VLOG(4) << "New domain: " << domain->ToString();
+          ++added_domains;
+        }
+      }
+    }
+  }
+  VLOG(3) << "Added " << added_domains << " kDomain instructions";
+  if (added_domains > 0) {
+    hlo_graph_dumper::MaybeDumpHloModule(*module_, "After Domain Isolator");
+  }
+  return added_domains > 0;
+}
+
+HloDomainIsolator::HloDomainIsolator(DomainCreator creator)
+    : creator_(std::move(creator)) {}
+
+StatusOr<bool> HloDomainIsolator::Run(HloModule* module) {
+  RunContext run_context(module, this);
+  return run_context.Run();
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_domain_isolator.h b/tensorflow/compiler/xla/service/hlo_domain_isolator.h
new file mode 100644
index 0000000000..e0c5718509
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_domain_isolator.h
@@ -0,0 +1,56 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_DOMAIN_ISOLATOR_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_DOMAIN_ISOLATOR_H_
+
+#include <memory>
+#include <vector>
+
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+
+namespace xla {
+
+// Domain isolation is the task of placing kDomain instructions between HLO
+// instructions having different sharding. A kDomain instruction is essentially
+// used to break an HLO graph edge connecting two instructions with different
+// sharding. If a set of connected instructions all have the same sharding, no
+// kDomain instruction will be placed.
+class HloDomainIsolator : public HloPassInterface {
+ public:
+  // Creates a new kDomain instruction for the edge between the use instruction
+  // (the first HloInstruction argument), and the operand instruction (the
+  // second HloInstruction argument).
+  // Returns nullptr in case no domain separation is necessary.
+  using DomainCreator = std::function<std::unique_ptr<HloInstruction>(
+      HloInstruction*, HloInstruction*)>;
+
+  explicit HloDomainIsolator(DomainCreator creator);
+
+  tensorflow::StringPiece name() const override { return "domain_isolator"; }
+
+  StatusOr<bool> Run(HloModule* module) override;
+
+ private:
+  class RunContext;
+
+  DomainCreator creator_;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_DOMAIN_ISOLATOR_H_
diff --git a/tensorflow/compiler/xla/service/hlo_domain_map.cc b/tensorflow/compiler/xla/service/hlo_domain_map.cc
new file mode 100644
index 0000000000..acb54c260c
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_domain_map.cc
@@ -0,0 +1,168 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/hlo_domain_map.h"
+
+#include <algorithm>
+
+#include "tensorflow/compiler/xla/map_util.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/types.h"
+
+namespace xla {
+
+/* static */ StatusOr<std::unique_ptr<HloDomainMap>> HloDomainMap::Create(
+    HloComputation* computation, string domain_kind) {
+  auto domain_map = WrapUnique(new HloDomainMap(std::move(domain_kind)));
+  TF_RETURN_IF_ERROR(domain_map->Populate(computation));
+  return std::move(domain_map);
+}
+
+/* static */ StatusOr<std::unique_ptr<HloDomainMap>> HloDomainMap::Create(
+    HloModule* module, string domain_kind) {
+  auto domain_map = WrapUnique(new HloDomainMap(std::move(domain_kind)));
+  for (HloComputation* computation : module->computations()) {
+    TF_RETURN_IF_ERROR(domain_map->Populate(computation));
+  }
+  return std::move(domain_map);
+}
+
+bool HloDomainMap::InSameDomain(HloInstruction* instruction1,
+                                HloInstruction* instruction2) const {
+  int64 domain_id1 = FindOrDefault(instruction_to_domain_, instruction1, -1);
+  int64 domain_id2 = FindOrDefault(instruction_to_domain_, instruction2, -1);
+  return domain_id1 >= 0 && domain_id1 == domain_id2;
+}
+
+Status HloDomainMap::TryProcessEmptyDomain(HloInstruction* instruction) {
+  TF_RET_CHECK(instruction->opcode() == HloOpcode::kDomain);
+  // We only check operands, so we are sure to not process the empty domain from
+  // both sides.
+  for (HloInstruction* operand : instruction->unique_operands()) {
+    if (IsDomainInstruction(operand)) {
+      auto domain = MakeUnique<DomainMetadata::Domain>();
+      domain->enter_domains.insert(operand);
+      domain->exit_domains.insert(instruction);
+      TF_RETURN_IF_ERROR(InsertDomain(std::move(domain)));
+    }
+  }
+  return Status::OK();
+}
+
+Status HloDomainMap::Populate(HloComputation* computation) {
+  for (HloInstruction* instruction : computation->instructions()) {
+    if (IsDomainInstruction(instruction)) {
+      // If this is a kDomain of the kind we are currently processing, check
+      // whether this is an "empty domain".
+      TF_RETURN_IF_ERROR(TryProcessEmptyDomain(instruction));
+      continue;
+    }
+    int64 domain_id = FindOrDefault(instruction_to_domain_, instruction, -1);
+    if (domain_id >= 0) {
+      // We have already processed this instruction.
+      continue;
+    }
+    TF_ASSIGN_OR_RETURN(std::unique_ptr<DomainMetadata::Domain> domain,
+                        CreateDomain(instruction));
+    TF_RETURN_IF_ERROR(InsertDomain(std::move(domain)));
+  }
+  return Status::OK();
+}
+
+Status HloDomainMap::InsertDomain(
+    std::unique_ptr<DomainMetadata::Domain> domain) {
+  int64 domain_id = instruction_domains_.size();
+  instruction_domains_.push_back(std::move(domain));
+  for (HloInstruction* instruction : instruction_domains_.back()->reach_set) {
+    instruction_to_domain_[instruction] = domain_id;
+  }
+  return Status::OK();
+}
+
+Status HloDomainMap::ExpandDomain(HloInstruction* instruction,
+                                  DomainMetadata::Domain* domain) const {
+  if (domain->reach_set.insert(instruction).second) {
+    // We should not be finding instructions with assigned domain here.
+    // If we assigned a domain to the instruction, it means that all the
+    // instructions reached by it, should have a domain as well.
+    int64 domain_id = FindOrDefault(instruction_to_domain_, instruction, -1);
+    TF_RET_CHECK(domain_id < 0) << "Instruction " << instruction->ToString()
+                                << " already has domain " << domain_id;
+    for (HloInstruction* operand : instruction->operands()) {
+      if (IsDomainInstruction(operand)) {
+        // The reach set instruction is a user of the domain instruction
+        // (the instruction sees the kDomain as operand).
+        // IOW the dataflow enters the domain through the kDomain instruction.
+        domain->enter_domains.insert(operand);
+      } else {
+        TF_RETURN_IF_ERROR(ExpandDomain(operand, domain));
+      }
+    }
+    for (HloInstruction* user : instruction->users()) {
+      if (IsDomainInstruction(user)) {
+        // The reach set instruction is an operand of the domain instruction
+        // (the instruction sees the kDomain as user).
+        // IOW the dataflow exits the domain through the kDomain instruction.
+        domain->exit_domains.insert(user);
+      } else {
+        TF_RETURN_IF_ERROR(ExpandDomain(user, domain));
+      }
+    }
+  }
+  return Status::OK();
+}
+
+StatusOr<std::unique_ptr<DomainMetadata::Domain>> HloDomainMap::CreateDomain(
+    HloInstruction* instruction) const {
+  auto domain = MakeUnique<DomainMetadata::Domain>();
+  TF_RETURN_IF_ERROR(ExpandDomain(instruction, domain.get()));
+  domain->instructions = MakeNonDomainInstructions(domain->reach_set);
+  return std::move(domain);
+}
+
+bool HloDomainMap::IsDomainInstruction(HloInstruction* instruction) const {
+  if (instruction->opcode() != HloOpcode::kDomain) {
+    return false;
+  }
+  if (!domain_kind_.empty()) {
+    if (instruction->user_side_metadata().Kind() != domain_kind_) {
+      return false;
+    }
+    // Both user and operand side of the metadata must be of the same kind.
+    CHECK(instruction->operand_side_metadata().Kind() == domain_kind_)
+        << "Instruction " << instruction->ToString()
+        << " has mismatching metadata kinds";
+  }
+  return true;
+}
+
+/* static */ std::vector<HloInstruction*>
+HloDomainMap::MakeNonDomainInstructions(
+    const tensorflow::gtl::FlatSet<HloInstruction*>& instruction_set) {
+  std::vector<HloInstruction*> instructions;
+  instructions.reserve(instruction_set.size());
+  for (HloInstruction* instruction : instruction_set) {
+    if (instruction->opcode() != HloOpcode::kDomain) {
+      instructions.push_back(instruction);
+    }
+  }
+  std::sort(instructions.begin(), instructions.end(),
+            [](HloInstruction* a, HloInstruction* b) {
+              return a->unique_id() < b->unique_id();
+            });
+  return instructions;
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_domain_map.h b/tensorflow/compiler/xla/service/hlo_domain_map.h
new file mode 100644
index 0000000000..e62ef763fb
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_domain_map.h
@@ -0,0 +1,108 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_DOMAIN_MAP_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_DOMAIN_MAP_H_
+
+#include <memory>
+#include <vector>
+
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_domain_metadata.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/gtl/flatmap.h"
+#include "tensorflow/core/lib/gtl/flatset.h"
+
+namespace xla {
+
+// The HloDomainMap splits a set of instructions within a module or computation,
+// into different domains, separated by kDomain instructions.
+// A domain is composed of a set of instructions which can reach each other via
+// operand/user edges, without crossing a kDomain instruction of a given kind.
+// A domain never crosses computation boundaries.
+class HloDomainMap {
+ public:
+  // Creates a new HloDomainMap, creating all the domains within the input
+  // computation, of the given kind. If domain_kind is not empty, only the
+  // kDomain instructions of domain_kind will be considered as separators.
+  // Otherwise every kDomain instruction will be splitting domains.
+  static StatusOr<std::unique_ptr<HloDomainMap>> Create(
+      HloComputation* computation, string domain_kind);
+
+  // Creates a new HloDomainMap, creating all the domains within the input
+  // module, of the given kind. If domain_kind is not empty, only the
+  // kDomain instructions of domain_kind will be considered as separators.
+  // Otherwise every kDomain instruction will be splitting domains.
+  static StatusOr<std::unique_ptr<HloDomainMap>> Create(HloModule* module,
+                                                        string domain_kind);
+
+  // Retrieves all the domains the input module or computation is composed of.
+  const std::vector<std::unique_ptr<DomainMetadata::Domain>>& GetDomains()
+      const {
+    return instruction_domains_;
+  }
+
+  // Checks whether two instructions are within the same domain.
+  bool InSameDomain(HloInstruction* instruction1,
+                    HloInstruction* instruction2) const;
+
+  // Checks whether instruction is a kDomain instruction of the kind we are
+  // currently processing.
+  bool IsDomainInstruction(HloInstruction* instruction) const;
+
+ private:
+  HloDomainMap(string domain_kind) : domain_kind_(std::move(domain_kind)) {}
+
+  // Check if the kDomain instruction is facing (via its operand link) another
+  // kDomain instruction of the same kind, hence defining an empty domain.
+  // If that is the case, create the empty domain and register it with this
+  // map (via InsertDomain()).
+  Status TryProcessEmptyDomain(HloInstruction* instruction);
+
+  Status Populate(HloComputation* computation);
+
+  // Inserts the provided domain into the ones tracked by this object,
+  // creating a new domain ID.
+  Status InsertDomain(std::unique_ptr<DomainMetadata::Domain> domain);
+
+  // From the given instruction, expands operand and user wise, the set of
+  // instructions which can be reached without crossing a kDomain instruction
+  // of the kind specified by domain_kind_.
+  // The domain data structure will be populated with all the reached
+  // instructions, and the boundaries of the domain, with the kDomain
+  // instructions encountered while expanding the reach.
+  Status ExpandDomain(HloInstruction* instruction,
+                      DomainMetadata::Domain* domain) const;
+
+  // Creates a domain data structure using the ExpandDomain() API.
+  StatusOr<std::unique_ptr<DomainMetadata::Domain>> CreateDomain(
+      HloInstruction* instruction) const;
+
+  // Out of an instruction set, returns a vector of all the ones which are not
+  // a kDomain kind.
+  static std::vector<HloInstruction*> MakeNonDomainInstructions(
+      const tensorflow::gtl::FlatSet<HloInstruction*>& instruction_set);
+
+  string domain_kind_;
+  std::vector<std::unique_ptr<DomainMetadata::Domain>> instruction_domains_;
+  tensorflow::gtl::FlatMap<HloInstruction*, int64> instruction_to_domain_;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_DOMAIN_MAP_H_
diff --git a/tensorflow/compiler/xla/service/hlo_domain_metadata.h b/tensorflow/compiler/xla/service/hlo_domain_metadata.h
new file mode 100644
index 0000000000..9853bd39cd
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_domain_metadata.h
@@ -0,0 +1,83 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_DOMAIN_METADATA_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_DOMAIN_METADATA_H_
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/lib/gtl/flatset.h"
+
+namespace xla {
+
+// Cannot include hlo_instruction.h as this file is included from there.
+class HloInstruction;
+
+// The DomainMetadata represents the base class for metadata which can be
+// attached to kDomain HLO instructions.
+class DomainMetadata {
+ public:
+  // A Domain data structure captures all the information about a kDomain
+  // bounded instruction set.
+  struct Domain {
+    // The set of instructions which are reachable from each other via
+    // operand/user pathways, without crossing a kDomain instruction of a given
+    // kind. The reach_set can contain kDomain instructions of other kinds, if
+    // two domains of different kind intersect each other.
+    tensorflow::gtl::FlatSet<HloInstruction*> reach_set;
+
+    // The same instructions in reach_set, but purged of kDomain instructions.
+    std::vector<HloInstruction*> instructions;
+
+    // If we consider a graph edge as an arrow oriented from the operand to the
+    // user, the enter_domains will contain the set of kDomain instructions
+    // whose dataflow enters the reach set (domain), while the exit_domains
+    // contains the set of kDomain instructions whose dataflow exits the reach
+    // set.
+    tensorflow::gtl::FlatSet<HloInstruction*> enter_domains;
+    tensorflow::gtl::FlatSet<HloInstruction*> exit_domains;
+  };
+
+  virtual ~DomainMetadata() = default;
+
+  // Clones the metadata object.
+  virtual std::unique_ptr<DomainMetadata> Clone() const = 0;
+
+  // Returns the metadata type. A unique identifier which describes the real
+  // metadata type.
+  virtual tensorflow::StringPiece Kind() const = 0;
+
+  // Compares the metadata object with another one and returns true if the
+  // two match.
+  virtual bool Matches(const DomainMetadata& other) const = 0;
+
+  // Returns a string representation of the metadata.
+  virtual string ToString() const = 0;
+
+  // Given a reachable set (the set of instructions which are reachable from
+  // each other via user/operand pathways, without crossing a kDomain
+  // instruction), makes sure that all of them have metadata attributes which
+  // are coherent with this metadata object.
+  virtual Status NormalizeInstructions(const Domain& domain) const = 0;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_DOMAIN_METADATA_H_
diff --git a/tensorflow/compiler/xla/service/hlo_domain_remover.cc b/tensorflow/compiler/xla/service/hlo_domain_remover.cc
new file mode 100644
index 0000000000..1d06040b0e
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_domain_remover.cc
@@ -0,0 +1,149 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/hlo_domain_remover.h"
+
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_domain_isolator.h"
+#include "tensorflow/compiler/xla/service/hlo_domain_map.h"
+#include "tensorflow/compiler/xla/service/hlo_graph_dumper.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/types.h"
+
+namespace xla {
+
+class HloDomainRemover::RunContext {
+ public:
+  RunContext(HloModule* module, HloDomainRemover* remover)
+      : module_(module), remover_(remover) {}
+
+  StatusOr<bool> Run();
+
+ private:
+  // Verifies the consistency of the domain, and normalizes the instructions
+  // within it.
+  Status VerifyAndNormalizeDomain(const DomainMetadata::Domain& domain);
+
+  HloModule* module_;
+  HloDomainRemover* remover_;
+};
+
+Status HloDomainRemover::RunContext::VerifyAndNormalizeDomain(
+    const DomainMetadata::Domain& domain) {
+  // Verify that the whole kDomain frontier bounding the instruction reach set,
+  // has matching metadata.
+  // A kDomain instruction has two sides of metadata, a user facing and an
+  // operand facing.
+  // A reachable instruction set can make contact with a kDomain instruction on
+  // a user facing side (the kDomain is operand of the instruction), or on an
+  // operand facing side (the kDomain is user of the instruction).
+  // And depending on the contact side, the proper metadata object
+  // (user_side_metadata() vs. operand_side_metadata()) needs to be used for
+  // consistency checks.
+  const DomainMetadata* ref_metadata = nullptr;
+  VLOG(4) << "Reach set:";
+  for (HloInstruction* instruction : domain.instructions) {
+    VLOG(4) << "  " << instruction->name();
+  }
+  VLOG(4) << "  Domains:";
+  for (HloInstruction* instruction : domain.enter_domains) {
+    const DomainMetadata& meta = instruction->user_side_metadata();
+    VLOG(4) << "    User side: " << instruction->name();
+    VLOG(4) << "      " << meta.ToString();
+    if (ref_metadata == nullptr) {
+      ref_metadata = &meta;
+    } else {
+      TF_RET_CHECK(meta.Matches(*ref_metadata))
+          << "Metadata mismatch at instruction " << instruction->name() << " : "
+          << meta.ToString() << " vs " << ref_metadata->ToString();
+    }
+  }
+  for (HloInstruction* instruction : domain.exit_domains) {
+    const DomainMetadata& meta = instruction->operand_side_metadata();
+    VLOG(4) << "    Operand side: " << instruction->name();
+    VLOG(4) << "      " << meta.ToString();
+    if (ref_metadata == nullptr) {
+      ref_metadata = &meta;
+    } else {
+      TF_RET_CHECK(meta.Matches(*ref_metadata))
+          << "Metadata mismatch at instruction " << instruction->name() << " : "
+          << meta.ToString() << " vs " << ref_metadata->ToString();
+    }
+  }
+  if (ref_metadata != nullptr) {
+    VLOG(4) << "Applying domain normalization: " << ref_metadata->ToString();
+    TF_RETURN_IF_ERROR(ref_metadata->NormalizeInstructions(domain));
+  } else {
+    // No kDomain instruction was present within this domain, so call the
+    // generic normalization functions and have them apply their heuristic.
+    VLOG(2) << "Applying domain-less normalization";
+    TF_RETURN_IF_ERROR(remover_->normalizer_(domain));
+  }
+  return Status::OK();
+}
+
+StatusOr<bool> HloDomainRemover::RunContext::Run() {
+  VLOG(4) << "Processing metadata domain: '" << remover_->kind_ << "'";
+  hlo_graph_dumper::MaybeDumpHloModule(*module_, "Before Domain Remover");
+
+  int64 removed_domains = 0;
+  for (HloComputation* computation : module_->computations()) {
+    // First create the domain instruction sets. A domain instruction set is
+    // the set of instructions whose edges never cross a kDomain instruction.
+    TF_ASSIGN_OR_RETURN(std::unique_ptr<HloDomainMap> domain_map,
+                        HloDomainMap::Create(computation, remover_->kind_));
+    // Verify and normalize every domain populated within the map.
+    for (auto& domain : domain_map->GetDomains()) {
+      TF_RETURN_IF_ERROR(VerifyAndNormalizeDomain(*domain));
+    }
+
+    // Now remove all the kDomain instructions of the kind specified by the
+    // remover, that are within the currently processed computation from the
+    // graph.
+    for (HloInstruction* instruction :
+         computation->MakeInstructionPostOrder()) {
+      for (HloInstruction* operand : instruction->unique_operands()) {
+        if (domain_map->IsDomainInstruction(operand)) {
+          VLOG(5) << "Removing " << operand->name();
+          TF_RETURN_IF_ERROR(
+              operand->ReplaceAllUsesWith(operand->mutable_operand(0)));
+          TF_RETURN_IF_ERROR(computation->RemoveInstruction(operand));
+          ++removed_domains;
+        }
+      }
+    }
+    HloInstruction* root = computation->root_instruction();
+    if (root != nullptr && domain_map->IsDomainInstruction(root)) {
+      VLOG(5) << "Removing " << root->name();
+      computation->set_root_instruction(root->mutable_operand(0));
+      TF_RETURN_IF_ERROR(computation->RemoveInstruction(root));
+      ++removed_domains;
+    }
+  }
+  VLOG(3) << "Removed " << removed_domains << " kDomain instructions of '"
+          << remover_->kind_ << "' kind";
+  if (removed_domains > 0) {
+    hlo_graph_dumper::MaybeDumpHloModule(*module_, "After Domain Remover");
+  }
+  return removed_domains > 0;
+}
+
+StatusOr<bool> HloDomainRemover::Run(HloModule* module) {
+  RunContext run_context(module, this);
+  return run_context.Run();
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_domain_remover.h b/tensorflow/compiler/xla/service/hlo_domain_remover.h
new file mode 100644
index 0000000000..0c71dd34fd
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_domain_remover.h
@@ -0,0 +1,56 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_DOMAIN_REMOVER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_DOMAIN_REMOVER_H_
+
+#include "tensorflow/compiler/xla/service/hlo_domain_metadata.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace xla {
+
+// Removes all the kDomain instructions of a given kind from the input module,
+// and calls the normalizer to propagate the properties to any newly created
+// instructions.
+class HloDomainRemover : public HloPassInterface {
+ public:
+  // Creates a new HloDomainRemover object tasked with removing all the kDomain
+  // instructions of a given kind.
+  // In case a reachable set (the set of instructions within a computation,
+  // which are mutually reachable via operand/user pathways) has all the
+  // instructions in it with the same attributes (i.e., sharding), a normalizer
+  // function is tasked with applying attribute normalization on the
+  // instructions within such a domain.
+  HloDomainRemover(
+      tensorflow::StringPiece kind,
+      std::function<Status(const DomainMetadata::Domain&)> normalizer)
+      : kind_(kind.ToString()), normalizer_(std::move(normalizer)) {}
+
+  tensorflow::StringPiece name() const override { return "domain_remover"; }
+
+  StatusOr<bool> Run(HloModule* module) override;
+
+ private:
+  class RunContext;
+
+  string kind_;
+  std::function<Status(const DomainMetadata::Domain&)> normalizer_;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_DOMAIN_REMOVER_H_
diff --git a/tensorflow/compiler/xla/service/hlo_domain_test.cc b/tensorflow/compiler/xla/service/hlo_domain_test.cc
new file mode 100644
index 0000000000..f29aac29c0
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_domain_test.cc
@@ -0,0 +1,432 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
+#include "tensorflow/compiler/xla/service/hlo_domain_isolator.h"
+#include "tensorflow/compiler/xla/service/hlo_domain_metadata.h"
+#include "tensorflow/compiler/xla/service/hlo_domain_remover.h"
+#include "tensorflow/compiler/xla/service/hlo_sharding_metadata.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+
+namespace xla {
+namespace {
+
+class HloDomainTest : public HloTestBase {
+ protected:
+  bool FindUserViaDomainPath(HloInstruction* instruction,
+                             HloInstruction* operand) const {
+    for (HloInstruction* user : operand->users()) {
+      if (user == instruction) {
+        return true;
+      }
+      if (user->opcode() == HloOpcode::kDomain &&  // Recurse via kDomain only.
+          FindUserViaDomainPath(instruction, user)) {
+        return true;
+      }
+    }
+    return false;
+  }
+
+  // Checks whether there is a kDomain instruction in the edge between the
+  // instruction and the operand.
+  bool HasDomainEdge(HloModule* module,
+                     tensorflow::StringPiece instruction_name,
+                     tensorflow::StringPiece operand_name) {
+    HloInstruction* instruction = FindInstruction(module, instruction_name);
+    HloInstruction* operand = FindInstruction(module, operand_name);
+    CHECK_NE(instruction, nullptr);
+    CHECK_NE(operand, nullptr);
+    if (!instruction->IsUserOf(operand)) {
+      // If instruction is not an immediate user, we must find a path from
+      // operand to instruction anyway, otherwise there is a corruption.
+      if (FindUserViaDomainPath(instruction, operand)) {
+        return true;
+      }
+      LOG(FATAL) << "Bad HLO module generated across the '" << instruction_name
+                 << "' and '" << operand_name << "' instructions:\n"
+                 << module->ToString();
+    }
+    return false;  // Immediate user: the direct edge holds no kDomain.
+  }
+
+  StatusOr<std::unique_ptr<HloModule>> ParseModule(
+      tensorflow::StringPiece hlo_string) {
+    HloModuleConfig config;
+    config.set_debug_options(legacy_flags::GetDebugOptionsFromFlags());
+    return tools::Parse(hlo_string, config);
+  }
+};
+
+// Dummy DomainMetadata implementation which creates kDomain boundaries around
+// HLO instructions with the same metadata().op_name() values.
+class OpNameMetadata : public DomainMetadata {
+ public:
+  explicit OpNameMetadata(string opname) : opname_(std::move(opname)) {}
+
+  std::unique_ptr<DomainMetadata> Clone() const override {
+    return MakeUnique<OpNameMetadata>(opname_);
+  }
+
+  tensorflow::StringPiece Kind() const override { return KindName(); }
+
+  bool Matches(const DomainMetadata& other) const override {
+    const OpNameMetadata* other_ptr =
+        dynamic_cast<const OpNameMetadata*>(&other);
+    if (other_ptr == nullptr) {
+      // If other is not an OpNameMetadata, then it is clearly not a match.
+      return false;
+    }
+    return opname_ == other_ptr->opname_;
+  }
+
+  string ToString() const override { return opname_; }
+
+  Status NormalizeInstructions(
+      const DomainMetadata::Domain& domain) const override {
+    // For the purposes of this test, nothing to do.
+    return Status::OK();
+  }
+
+  static tensorflow::StringPiece KindName() { return "opname"; }
+
+ private:
+  string opname_;
+};
+
+// Returns a kDomain over operand, or nullptr when both op_names are equal.
+std::unique_ptr<HloInstruction> OpNameDomainCreator(HloInstruction* instruction,
+                                                    HloInstruction* operand) {
+  if (instruction->metadata().op_name() == operand->metadata().op_name()) {
+    return nullptr;
+  }
+  std::unique_ptr<DomainMetadata> operand_side_metadata =
+      MakeUnique<OpNameMetadata>(operand->metadata().op_name());
+  std::unique_ptr<DomainMetadata> user_side_metadata =
+      MakeUnique<OpNameMetadata>(instruction->metadata().op_name());
+  return HloInstruction::CreateDomain(operand->shape(), operand,
+                                      std::move(operand_side_metadata),
+                                      std::move(user_side_metadata));
+}
+
+Status OpNameDomainNormalizer(const DomainMetadata::Domain& domain) {
+  // Nothing to do for the way this test uses the OpName domains.
+  return Status::OK();
+}
+
+TEST_F(HloDomainTest, CheckDomainLinks) {
+  const char* const hlo_string = R"(
+HloModule Module
+
+ENTRY entry {
+  p0 = (f32[4], f32[4]) parameter(0)
+  a = f32[4] get-tuple-element(p0), index=0
+  b = f32[4] get-tuple-element(p0), index=1
+  c = f32[4] add(f32[4] a, f32[4] b), sharding={maximal device=1}
+  d = f32[4] subtract(a, b), sharding={maximal device=1}
+  e = f32[4] multiply(c, d), sharding={maximal device=1}
+  ROOT f = (f32[4], f32[4], f32[4]) tuple(c, d, e)
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseModule(hlo_string));
+  LOG(INFO) << "Original module:\n" << module->ToString();
+
+  HloDomainIsolator isolator(CreateShardingDomain);
+  TF_ASSERT_OK_AND_ASSIGN(bool isolator_changed, isolator.Run(module.get()));
+  EXPECT_TRUE(isolator_changed);
+
+  EXPECT_TRUE(HasDomainEdge(module.get(), "c", "a"));
+  EXPECT_TRUE(HasDomainEdge(module.get(), "c", "b"));
+  EXPECT_TRUE(HasDomainEdge(module.get(), "d", "a"));
+  EXPECT_TRUE(HasDomainEdge(module.get(), "d", "b"));
+  EXPECT_FALSE(HasDomainEdge(module.get(), "e", "c"));
+  EXPECT_FALSE(HasDomainEdge(module.get(), "e", "d"));
+
+  HloDomainRemover remover(ShardingMetadata::KindName(),
+                           NormalizeShardingDomain);
+  TF_ASSERT_OK_AND_ASSIGN(bool remover_changed, remover.Run(module.get()));
+  EXPECT_TRUE(remover_changed);
+
+  EXPECT_FALSE(HasDomainEdge(module.get(), "c", "a"));
+  EXPECT_FALSE(HasDomainEdge(module.get(), "c", "b"));
+  EXPECT_FALSE(HasDomainEdge(module.get(), "d", "a"));
+  EXPECT_FALSE(HasDomainEdge(module.get(), "d", "b"));
+  EXPECT_FALSE(HasDomainEdge(module.get(), "e", "c"));
+  EXPECT_FALSE(HasDomainEdge(module.get(), "e", "d"));
+}
+
+TEST_F(HloDomainTest, CheckNoDomainAddedIfNoSharding) {
+  const char* const hlo_string = R"(
+HloModule Module
+
+ENTRY entry {
+  p0 = (f32[4], f32[4]) parameter(0)
+  a = f32[4] get-tuple-element(p0), index=0
+  b = f32[4] get-tuple-element(p0), index=1
+  c = f32[4] add(f32[4] a, f32[4] b)
+  d = f32[4] subtract(a, b)
+  e = f32[4] multiply(c, d)
+  ROOT f = (f32[4], f32[4], f32[4]) tuple(c, d, e)
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseModule(hlo_string));
+  LOG(INFO) << "Original module:\n" << module->ToString();
+
+  HloDomainIsolator isolator(CreateShardingDomain);
+  TF_ASSERT_OK_AND_ASSIGN(bool isolator_changed, isolator.Run(module.get()));
+  EXPECT_FALSE(isolator_changed);  // No sharding in the module: no change.
+}
+
+TEST_F(HloDomainTest, CheckDomainAroundIO) {
+  const char* const hlo_string = R"(
+HloModule Module
+
+ENTRY entry {
+  p0 = (f32[4]) parameter(0)
+  a = f32[4] get-tuple-element(p0), index=0
+  b = (f32[4], u32[]) send(a), channel_id=1, sharding={maximal device=0}
+  c = () send-done(b), channel_id=1, sharding={maximal device=0}
+  d = (f32[4], u32[]) recv(), channel_id=2, sharding={maximal device=0}
+  e = f32[4] recv-done(d), channel_id=2, sharding={maximal device=0}
+  f = f32[4] add(a, e)
+  g = f32[4] subtract(a, e)
+  ROOT h = (f32[4], f32[4]) tuple(f, g)
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseModule(hlo_string));
+  LOG(INFO) << "Original module:\n" << module->ToString();
+
+  HloDomainIsolator isolator(CreateShardingDomain);
+  TF_ASSERT_OK_AND_ASSIGN(bool isolator_changed, isolator.Run(module.get()));
+  EXPECT_TRUE(isolator_changed);
+
+  EXPECT_TRUE(HasDomainEdge(module.get(), "b", "a"));
+  EXPECT_TRUE(HasDomainEdge(module.get(), "f", "e"));
+  EXPECT_FALSE(HasDomainEdge(module.get(), "a", "p0"));
+  EXPECT_FALSE(HasDomainEdge(module.get(), "c", "b"));
+  EXPECT_FALSE(HasDomainEdge(module.get(), "e", "d"));
+
+  HloDomainRemover remover(ShardingMetadata::KindName(),
+                           NormalizeShardingDomain);
+  TF_ASSERT_OK_AND_ASSIGN(bool remover_changed, remover.Run(module.get()));
+  EXPECT_TRUE(remover_changed);
+
+  EXPECT_FALSE(HasDomainEdge(module.get(), "b", "a"));
+  EXPECT_FALSE(HasDomainEdge(module.get(), "f", "e"));
+}
+
+TEST_F(HloDomainTest, CheckNoDomainAddedOnPureIOComputation) {
+  const char* const hlo_string = R"(
+HloModule Module
+
+ENTRY entry {
+  a = (f32[4], u32[]) recv(), channel_id=1, sharding={maximal device=-1}
+  b = f32[4] recv-done(a), channel_id=1, sharding={maximal device=-1}
+  c = f32[4] add(b, b), sharding={maximal device=-1}
+  d = (f32[4], u32[]) send(c), channel_id=2, sharding={maximal device=-1}
+  ROOT e = () send-done(d), channel_id=2, sharding={maximal device=-1}
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseModule(hlo_string));
+  LOG(INFO) << "Original module:\n" << module->ToString();
+
+  HloDomainIsolator isolator(CreateShardingDomain);
+  TF_ASSERT_OK_AND_ASSIGN(bool isolator_changed, isolator.Run(module.get()));
+  EXPECT_FALSE(isolator_changed);
+}
+
+TEST_F(HloDomainTest, CheckNormalizationOnPureIOComputation) {
+  const char* const hlo_string = R"(
+HloModule Module
+
+ENTRY entry {
+  a = (f32[4], u32[]) recv(), channel_id=1, sharding={maximal device=0}
+  b = f32[4] recv-done(a), channel_id=1, sharding={maximal device=0}
+  c = f32[4] add(b, b)
+  d = (f32[4], u32[]) send(c), channel_id=2, sharding={maximal device=0}
+  ROOT e = () send-done(d), channel_id=2, sharding={maximal device=0}
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseModule(hlo_string));
+  LOG(INFO) << "Original module:\n" << module->ToString();
+
+  HloDomainRemover remover(ShardingMetadata::KindName(),
+                           NormalizeShardingDomain);
+  TF_ASSERT_OK_AND_ASSIGN(bool remover_changed, remover.Run(module.get()));
+  EXPECT_FALSE(remover_changed);
+
+  HloInstruction* add = FindInstruction(module.get(), "c");
+  ASSERT_NE(add, nullptr);
+  auto device = add->sharding_unique_device();
+  ASSERT_TRUE(device.has_value());  // Fatal: *device is read right below.
+  EXPECT_EQ(*device, 0);
+}
+
+TEST_F(HloDomainTest, CheckMultiDomainLinks) {
+  const char* const hlo_string = R"(
+HloModule Module
+
+ENTRY entry {
+  p0 = (f32[4], f32[4]) parameter(0)
+  a = f32[4] get-tuple-element(p0), index=0
+  b = f32[4] get-tuple-element(p0), index=1
+  c = f32[4] add(a, b), sharding={maximal device=1}
+  d = f32[4] subtract(a, c), sharding={maximal device=1}, metadata={op_name="D"}
+  e = f32[4] multiply(c, d), sharding={maximal device=1}, metadata={op_name="D"}
+  f = f32[4] add(e, c), sharding={maximal device=1}
+  ROOT g = (f32[4], f32[4], f32[4]) tuple(c, d, f)
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseModule(hlo_string));
+  LOG(INFO) << "Original module:\n" << module->ToString();
+
+  HloDomainIsolator sharding_isolator(CreateShardingDomain);
+  TF_ASSERT_OK_AND_ASSIGN(bool sharding_isolator_changed,
+                          sharding_isolator.Run(module.get()));
+  EXPECT_TRUE(sharding_isolator_changed);
+
+  HloDomainIsolator opname_isolator(OpNameDomainCreator);
+  TF_ASSERT_OK_AND_ASSIGN(bool opname_isolator_changed,
+                          opname_isolator.Run(module.get()));
+  EXPECT_TRUE(opname_isolator_changed);
+
+  EXPECT_TRUE(HasDomainEdge(module.get(), "c", "a"));
+  EXPECT_TRUE(HasDomainEdge(module.get(), "c", "b"));
+  EXPECT_TRUE(HasDomainEdge(module.get(), "d", "a"));
+  EXPECT_TRUE(HasDomainEdge(module.get(), "d", "c"));
+  EXPECT_FALSE(HasDomainEdge(module.get(), "e", "d"));
+
+  HloDomainRemover sharding_remover(ShardingMetadata::KindName(),
+                                    NormalizeShardingDomain);
+  TF_ASSERT_OK_AND_ASSIGN(bool sharding_remover_changed,
+                          sharding_remover.Run(module.get()));
+  EXPECT_TRUE(sharding_remover_changed);
+
+  HloDomainRemover opname_remover(OpNameMetadata::KindName(),
+                                  OpNameDomainNormalizer);
+  TF_ASSERT_OK_AND_ASSIGN(bool opname_remover_changed,
+                          opname_remover.Run(module.get()));
+  EXPECT_TRUE(opname_remover_changed);
+
+  EXPECT_FALSE(HasDomainEdge(module.get(), "c", "a"));
+  EXPECT_FALSE(HasDomainEdge(module.get(), "c", "b"));
+  EXPECT_FALSE(HasDomainEdge(module.get(), "d", "a"));
+  EXPECT_FALSE(HasDomainEdge(module.get(), "d", "c"));
+}
+
+TEST_F(HloDomainTest, CheckNormalizationOnInfeedTuple) {
+  const char* const hlo_string = R"(
+HloModule Module
+
+ENTRY entry {
+  infeed = (f32[4], f32[4]) infeed(),
+    sharding={{maximal device=1}, {maximal device=0}}
+  gte0 = f32[4] get-tuple-element(infeed), index=0
+  gte1 = f32[4] get-tuple-element(infeed), index=1
+  copy0 = f32[4] copy(gte0)
+  copy1 = f32[4] copy(gte1)
+  ROOT add = f32[4] add(copy0, copy1)
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseModule(hlo_string));
+  LOG(INFO) << "Original module:\n" << module->ToString();
+
+  HloDomainIsolator isolator(CreateShardingDomain);
+  TF_ASSERT_OK_AND_ASSIGN(bool isolator_changed, isolator.Run(module.get()));
+  EXPECT_TRUE(isolator_changed);
+
+  EXPECT_TRUE(HasDomainEdge(module.get(), "gte0", "infeed"));
+  EXPECT_TRUE(HasDomainEdge(module.get(), "gte1", "infeed"));
+  EXPECT_FALSE(HasDomainEdge(module.get(), "copy0", "gte0"));
+  EXPECT_FALSE(HasDomainEdge(module.get(), "copy1", "gte1"));
+
+  // Inject unassigned tuple/gte within the infeed domain, to simulate the
+  // HLO passes adding unexpected instructions.
+  //
+  //            infeed
+  //           /      \
+  //         GTE0    GTE1
+  //         /          \
+  //       COPY0       COPY1
+  //          \         /
+  //           \       /
+  //             TUPLE
+  //               |
+  //             DOMAIN
+  HloInstruction* infeed = FindInstruction(module.get(), "infeed");
+  ASSERT_NE(infeed, nullptr);
+  auto infeed_users = infeed->users();
+  HloInstruction* new_gte0 =
+      infeed->parent()->AddInstruction(HloInstruction::CreateGetTupleElement(
+          ShapeUtil::GetTupleElementShape(infeed->shape(), 0), infeed, 0));
+  HloInstruction* new_copy0 =
+      infeed->parent()->AddInstruction(HloInstruction::CreateUnary(
+          new_gte0->shape(), HloOpcode::kCopy, new_gte0));
+  HloInstruction* new_gte1 =
+      infeed->parent()->AddInstruction(HloInstruction::CreateGetTupleElement(
+          ShapeUtil::GetTupleElementShape(infeed->shape(), 1), infeed, 1));
+  HloInstruction* new_copy1 =
+      infeed->parent()->AddInstruction(HloInstruction::CreateUnary(
+          new_gte1->shape(), HloOpcode::kCopy, new_gte1));
+  HloInstruction* new_tuple = infeed->parent()->AddInstruction(
+      HloInstruction::CreateTuple({new_copy0, new_copy1}));
+  for (HloInstruction* user : infeed_users) {
+    TF_EXPECT_OK(infeed->ReplaceUseWith(user, new_tuple));
+  }
+
+  HloDomainRemover remover(ShardingMetadata::KindName(),
+                           NormalizeShardingDomain);
+  TF_ASSERT_OK_AND_ASSIGN(bool remover_changed, remover.Run(module.get()));
+  EXPECT_TRUE(remover_changed);
+
+  struct Assignment {
+    HloInstruction* instruction;
+    int64 device;
+  } assignments[] = {
+      {new_gte0, 1},
+      {new_copy0, 1},
+      {new_gte1, 0},
+      {new_copy1, 0},
+  };
+  for (auto& assignment : assignments) {
+    auto device = assignment.instruction->sharding_unique_device();
+    ASSERT_TRUE(device.has_value());  // Fatal: *device is read right below.
+    EXPECT_EQ(*device, assignment.device);
+  }
+  ASSERT_TRUE(new_tuple->has_sharding());  // Fatal: sharding() is read below.
+  EXPECT_EQ(
+      new_tuple->sharding(),
+      HloSharding::Tuple(new_tuple->shape(), {HloSharding::AssignDevice(1),
+                                              HloSharding::AssignDevice(0)}));
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_element_type_converter.cc b/tensorflow/compiler/xla/service/hlo_element_type_converter.cc
index d236f83aeb..abec29df43 100644
--- a/tensorflow/compiler/xla/service/hlo_element_type_converter.cc
+++ b/tensorflow/compiler/xla/service/hlo_element_type_converter.cc
@@ -119,6 +119,7 @@ StatusOr<bool> HloElementTypeConverter::Run(HloModule* module) {
     return false;
   }
 
+  HloCloneContext context(module);
   bool changed = false;
   for (auto* computation : module->computations()) {
     for (auto* hlo : computation->MakeInstructionPostOrder()) {
@@ -180,7 +181,7 @@ StatusOr<bool> HloElementTypeConverter::Run(HloModule* module) {
             ShapeUtil::ChangeElementType(hlo->shape(), replace_with_type_);
 
         new_hlo = computation->AddInstruction(
-            hlo->CloneWithNewOperands(shape, new_operands, hlo->GetModule()));
+            hlo->CloneWithNewOperands(shape, new_operands, &context));
         TF_RETURN_IF_ERROR(new_hlo->CopyAllControlDepsFrom(hlo));
 
         new_hlo = ToElementType(new_hlo, eliminate_type_);
@@ -189,16 +190,16 @@ StatusOr<bool> HloElementTypeConverter::Run(HloModule* module) {
         Shape new_shape = GetConvertedTupleShape(hlo->shape(), eliminate_type_,
                                                  replace_with_type_);
 
-        new_hlo = computation->AddInstruction(hlo->CloneWithNewOperands(
-            new_shape, new_operands, hlo->GetModule()));
+        new_hlo = computation->AddInstruction(
+            hlo->CloneWithNewOperands(new_shape, new_operands, &context));
         TF_RETURN_IF_ERROR(new_hlo->CopyAllControlDepsFrom(hlo));
 
         // Convert the elements of the result of `new_hlo` to produce a new
         // tuple with shape `old_shape`.
         new_hlo = ConvertTupleElements(new_hlo, old_shape);
       } else {
-        new_hlo = computation->AddInstruction(hlo->CloneWithNewOperands(
-            hlo->shape(), new_operands, hlo->GetModule()));
+        new_hlo = computation->AddInstruction(
+            hlo->CloneWithNewOperands(hlo->shape(), new_operands, &context));
         TF_RETURN_IF_ERROR(new_hlo->CopyAllControlDepsFrom(hlo));
       }
 
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.cc b/tensorflow/compiler/xla/service/hlo_evaluator.cc
index e90eb0669d..1e78d775c8 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator.cc
+++ b/tensorflow/compiler/xla/service/hlo_evaluator.cc
@@ -965,9 +965,10 @@ Status HloEvaluator::HandleFusion(HloInstruction* fusion) {
   // Attach cloned computation to an empty HLO module so the existing ones are
   // not modified.
   HloModule empty_hlo_module("EmptyModuleForFusion", config);
+  HloCloneContext context(&empty_hlo_module);
   auto cloned_fused_computation =
       fusion->fused_instructions_computation()->Clone(
-          /*suffix=*/"clone_with_layout", &empty_hlo_module);
+          /*suffix=*/"clone_with_layout", &context);
   for (auto* instruction : cloned_fused_computation->instructions()) {
     LayoutUtil::SetToDefaultLayout(instruction->mutable_shape());
   }
diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
index efdeb6c64f..672b1c017a 100644
--- a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
+++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
@@ -1010,6 +1010,7 @@ ColorScheme HloDotDumper::GetInstructionColor(const HloInstruction* instr) {
     case HloOpcode::kReduceWindow:
     case HloOpcode::kSelectAndScatter:
       return kPurple;
+    case HloOpcode::kDomain:
     case HloOpcode::kFusion:
     case HloOpcode::kMap:
       return kGray;
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc
index db1c33e2f0..dc351e9968 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction.cc
@@ -37,6 +37,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/window_util.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/flatmap.h"
+#include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
@@ -256,6 +257,7 @@ HloInstruction::CreateGetTupleElement(const Shape& shape,
     case HloOpcode::kCopy:
     case HloOpcode::kCos:
     case HloOpcode::kClz:
+    case HloOpcode::kDomain:
     case HloOpcode::kExp:
     case HloOpcode::kExpm1:
     case HloOpcode::kFloor:
@@ -821,6 +823,15 @@ HloInstruction::CreateBroadcastSequence(
   return instruction;
 }
 
+void HloInstruction::set_device_sharding(int64 device) {
+  HloSharding assigned = HloSharding::AssignDevice(device);
+  if (!ShapeUtil::IsTuple(shape())) {
+    set_sharding(assigned);  // Non-tuple shape: use the sharding as is.
+    return;
+  }
+  set_sharding(HloSharding::Tuple(assigned.GetAsShapeTree(shape())));
+}
+
 void HloInstruction::SetupDerivedInstruction(
     HloInstruction* derived_instruction) const {
   if (sharding_ != nullptr) {
@@ -1225,21 +1236,28 @@ bool HloInstruction::HasSideEffect() const {
   return gather_dim_numbers;
 }
 
+/* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateDomain(
+    const Shape& shape, HloInstruction* operand,
+    std::unique_ptr<DomainMetadata> operand_side_metadata,
+    std::unique_ptr<DomainMetadata> user_side_metadata) {
+  auto domain = WrapUnique(new HloInstruction(HloOpcode::kDomain, shape));
+  domain->AppendOperand(operand);  // The single operand being wrapped.
+  domain->user_side_metadata_ = std::move(user_side_metadata);
+  domain->operand_side_metadata_ = std::move(operand_side_metadata);
+  return domain;
+}
+
 std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
     const Shape& shape,
     tensorflow::gtl::ArraySlice<HloInstruction*> new_operands,
-    HloModule* module, CloneMap* clone_map) const {
+    HloCloneContext* context) const {
   VLOG(3) << "CloneWithNewOperands:\n  " << ToString();
   VLOG(3) << "  new operands:";
   for (const HloInstruction* new_operand : new_operands) {
     VLOG(3) << "    %" << new_operand->name();
   }
-  if (module == nullptr) {
-    module = GetModule();
-  }
 
   std::unique_ptr<HloInstruction> clone;
-
   // Explicitly call the factory for the instruction type. This is more robust
   // in the face of code changes than copying fields explicitly. This also
   // properly sets the user fields of the operands.
@@ -1419,9 +1437,16 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
       clone = CreateConstant(literal_->CloneToUnique());
       break;
     case HloOpcode::kFusion: {
-      CHECK_NE(module, nullptr);
-      auto new_fused_computation = module->AddEmbeddedComputation(
-          fused_instructions_computation()->Clone("clone", module, clone_map));
+      HloModule* module = context != nullptr ? context->module() : GetModule();
+      HloComputation* new_fused_computation = nullptr;
+      if (context != nullptr) {
+        new_fused_computation =
+            context->FindComputation(fused_instructions_computation());
+      }
+      if (new_fused_computation == nullptr) {
+        new_fused_computation = module->AddEmbeddedComputation(
+            fused_instructions_computation()->Clone("clone", context));
+      }
       clone = CreateFusion(/*shape=*/shape, /*fusion_kind=*/fusion_kind(),
                            /*operands=*/new_operands,
                            /*fusion_computation=*/new_fused_computation);
@@ -1485,14 +1510,25 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
       clone = CreateGather(shape, new_operands[0], new_operands[1],
                            *gather_dimension_numbers_, gather_window_bounds_);
       break;
+    case HloOpcode::kDomain:
+      CHECK_EQ(new_operands.size(), 1);
+      clone =
+          CreateDomain(shape, new_operands[0], operand_side_metadata_->Clone(),
+                       user_side_metadata_->Clone());
+      break;
     case HloOpcode::kTrace:
       LOG(FATAL) << "Not yet implemented, clone: " << HloOpcodeString(opcode_);
   }
   SetupDerivedInstruction(clone.get());
   clone->set_parent(parent_);
   clone->set_backend_config(backend_config());
-  if (clone_map != nullptr) {
-    InsertOrDie(clone_map, this, clone.get());
+  if (context != nullptr) {
+    context->MapInstruction(this, clone.get());
+    clone->ReplaceCalledComputations([&](HloComputation* callee) {
+      return callee->parent() != context->module()
+                 ? context->module()->DeepCloneComputation(callee, context)
+                 : callee;
+    });
   }
   return clone;
 }
@@ -1500,9 +1536,9 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
 HloInstruction::~HloInstruction() {}
 
 std::unique_ptr<HloInstruction> HloInstruction::Clone(
-    const string& suffix, HloModule* module, CloneMap* clone_map) const {
+    const string& suffix, HloCloneContext* context) const {
   std::unique_ptr<HloInstruction> clone =
-      CloneWithNewOperands(shape_, operands_, module, clone_map);
+      CloneWithNewOperands(shape_, operands_, context);
   if (suffix.empty()) {
     clone->name_ = name();
   } else {
@@ -1614,6 +1650,17 @@ int64 HloInstruction::operand_index(const HloInstruction* target) const {
   LOG(FATAL) << "target was not an operand: " << target->ToString();
 }
 
+HloInstruction::InstructionVector HloInstruction::unique_operands() const {
+  // Keeps the first occurrence of each operand, in operand order.
+  tensorflow::gtl::FlatSet<const HloInstruction*> visited;
+  InstructionVector deduped;
+  for (HloInstruction* candidate : operands()) {
+    const bool first_time = visited.insert(candidate).second;
+    if (first_time) deduped.push_back(candidate);
+  }
+  return deduped;
+}
+
 Status HloInstruction::AddControlDependencyTo(HloInstruction* instruction) {
   TF_RET_CHECK(instruction->parent() == parent());
   if (std::find(control_successors_.begin(), control_successors_.end(),
@@ -1758,6 +1805,7 @@ bool HloInstruction::IdenticalSlowPath(
                              other.fused_instructions_computation());
 
     // These opcodes have complex or special behavior so just return false.
+    case HloOpcode::kDomain:
     case HloOpcode::kRng:
     case HloOpcode::kTrace:
     case HloOpcode::kWhile:
@@ -2369,7 +2417,13 @@ std::vector<string> HloInstruction::ExtraAttributesToString(
     extra.push_back(StrCat("exponent_bits=", exponent_bits_));
     extra.push_back(StrCat("mantissa_bits=", mantissa_bits_));
   }
-
+  if (operand_side_metadata_ != nullptr) {
+    extra.push_back(
+        StrCat("operand_side=", operand_side_metadata_->ToString()));
+  }
+  if (user_side_metadata_ != nullptr) {
+    extra.push_back(StrCat("user_side=", user_side_metadata_->ToString()));
+  }
   // By contract, we print the custom call target even if
   // options.print_subcomputation_mode() == kOff, because the call target is not
   // an HloComputation.
@@ -2546,6 +2600,7 @@ bool HloInstruction::IsFusable() const {
   }
   // Some kinds of instructions don't make sense to fuse.
   switch (opcode_) {
+    case HloOpcode::kDomain:
     case HloOpcode::kParameter:
       return false;
     // Side effecting instrutions cannot be fused.
@@ -2558,7 +2613,9 @@ HloComputation* HloInstruction::fused_instructions_computation() const {
   CHECK_EQ(opcode_, HloOpcode::kFusion);
   CHECK(!called_computations_.empty());
   auto* fused_instructions_computation = called_computations_.front();
-  CHECK(fused_instructions_computation->IsFusionComputation());
+  CHECK(fused_instructions_computation->IsFusionComputation())
+      << "Computation " << fused_instructions_computation->name()
+      << " is not a fusion kind";
   return fused_instructions_computation;
 }
 
@@ -2773,6 +2830,8 @@ Status HloInstruction::Visit(DfsHloVisitorBase<HloInstructionPtr>* visitor) {
       return visitor->HandleSendDone(this);
     case HloOpcode::kGather:
       return visitor->HandleGather(this);
+    case HloOpcode::kDomain:
+      return visitor->HandleDomain(this);
 
     // These opcodes are not handled here.
     case HloOpcode::kTrace:
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h
index 234dbc8399..6df97c40ba 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.h
+++ b/tensorflow/compiler/xla/service/hlo_instruction.h
@@ -37,6 +37,8 @@ limitations under the License.
 #include "tensorflow/compiler/xla/map_util.h"
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor.h"
 #include "tensorflow/compiler/xla/service/hlo.pb.h"
+#include "tensorflow/compiler/xla/service/hlo_clone_context.h"
+#include "tensorflow/compiler/xla/service/hlo_domain_metadata.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/hlo_sharding.h"
 #include "tensorflow/compiler/xla/service/name_uniquer.h"
@@ -597,6 +599,13 @@ class HloInstruction {
       const GatherDimensionNumbers& gather_dim_numbers,
       tensorflow::gtl::ArraySlice<int64> window_bounds);
 
+  // Creates a kDomain instruction which delimits an HLO domain which has
+  // the provided user and operand side metadata.
+  static std::unique_ptr<HloInstruction> CreateDomain(
+      const Shape& shape, HloInstruction* operand,
+      std::unique_ptr<DomainMetadata> operand_side_metadata,
+      std::unique_ptr<DomainMetadata> user_side_metadata);
+
   // Creates a fusion instruction. A fusion instruction contains one or more
   // fused instructions forming an expression with a single root
   // "fused_root". Additional instructions can be added to the fusion
@@ -676,6 +685,10 @@ class HloInstruction {
   using InstructionVector = tensorflow::gtl::InlinedVector<HloInstruction*, 2>;
   const InstructionVector& operands() const { return operands_; }
 
+  // Returns the vector of unique operands, in the same order they are found
+  // within the operand vector.
+  InstructionVector unique_operands() const;
+
   // Returns the index of 'target' in the operands sequence.
   // Precondition: target must be an operand (or a fatal error will occur).
   int64 operand_index(const HloInstruction* target) const;
@@ -1094,16 +1107,20 @@ class HloInstruction {
   }
   // Returns the sharding unique device, if any.
   tensorflow::gtl::optional<int64> sharding_unique_device() const {
-    if (sharding_ == nullptr || !sharding_->HasUniqueDevice()) {
+    if (sharding_ == nullptr) {
       return tensorflow::gtl::optional<int64>();
     }
-    return sharding_->UniqueDevice().ValueOrDie();
+    auto device = sharding_->UniqueDevice();
+    return device.ok() ? device.ValueOrDie()
+                       : tensorflow::gtl::optional<int64>();
   }
   // Sets the sharding of this operator. Should only be called by HloModule or
   // HloComputation methods.
   void set_sharding(const HloSharding& sharding) {
     sharding_ = MakeUnique<HloSharding>(sharding);
   }
+  // Sets a sharding that assigns the current instruction to device.
+  void set_device_sharding(int64 device);
   // Remove any sharding from this operator.
   void clear_sharding() { sharding_ = nullptr; }
   // Return true if this operator has a sharding assigned.
@@ -1117,6 +1134,15 @@ class HloInstruction {
     return other->has_sharding() ? sharding() == other->sharding() : false;
   }
 
+  // Retrieves the operand side metadata of a kDomain instruction.
+  const DomainMetadata& operand_side_metadata() const {
+    return *operand_side_metadata_;
+  }
+  // Retrieves the user side metadata of a kDomain instruction.
+  const DomainMetadata& user_side_metadata() const {
+    return *user_side_metadata_;
+  }
+
   // When creating a new instruction which either replaces, or shifts up (kCopy
   // insertion case), another instruction, we need to make sure the certain
   // properties of the new instruction are copied into the derived one. As of
@@ -1317,30 +1343,18 @@ class HloInstruction {
   // Precondition: opcode() == HloOpcode::kRng
   RandomDistribution random_distribution() const;
 
-  // See documentation for Clone().
-  using CloneMap = std::unordered_map<const HloInstruction*, HloInstruction*>;
-
   // Clones the HLO instruction. The clone will have the same opcode, shape, and
   // operands. After creation the clone has no uses. "this" (the instruction
   // cloned from) is not changed. Suffix is the string to append to the name of
-  // the instruction to form the name of the cloned instruction. Ignores the
-  // control predecessors and successors of this HLO instruction.
-  //
-  // If the module pointer is not nullptr, then any cloned computations will be
-  // added to this module in order to support deep cloning. Otherwise the module
-  // of the instruction is used.
-  //
-  // If clone_map is not nullptr, then each original instruction that is cloned
-  // will be inserted and map to its clone. clone_map should not already contain
-  // any of the instructions to clone.
-  std::unique_ptr<HloInstruction> Clone(const string& suffix = "clone",
-                                        HloModule* module = nullptr,
-                                        CloneMap* clone_map = nullptr) const;
+  // the instruction to form the name of the cloned instruction.
+  // Ignores the control predecessors and successors of this HLO instruction.
+  std::unique_ptr<HloInstruction> Clone(
+      const string& suffix = "clone", HloCloneContext* context = nullptr) const;
 
   // Clones the HLO instruction as above but with new shape and operands.
   std::unique_ptr<HloInstruction> CloneWithNewOperands(
       const Shape& shape, tensorflow::gtl::ArraySlice<HloInstruction*> operands,
-      HloModule* module = nullptr, CloneMap* clone_map = nullptr) const;
+      HloCloneContext* context = nullptr) const;
 
   // Returns the computations this instruction directly calls (if any).
   const std::vector<HloComputation*>& called_computations() const {
@@ -1553,7 +1567,7 @@ class HloInstruction {
   // Clones a fusion instruction with a new shape and operands.
   std::unique_ptr<HloInstruction> CloneFusionWithNewOperands(
       const Shape& shape, tensorflow::gtl::ArraySlice<HloInstruction*> operands,
-      HloModule* module = nullptr) const;
+      HloCloneContext* context = nullptr) const;
 
   // Returns true if this instruction can legally have the dimensions field
   // set. Used for checking precondition of dimensions field accessors.
@@ -1646,6 +1660,10 @@ class HloInstruction {
   // The sharding, if one exists.
   std::unique_ptr<HloSharding> sharding_;
 
+  // Fields used by the kDomain instruction.
+  std::unique_ptr<DomainMetadata> operand_side_metadata_;
+  std::unique_ptr<DomainMetadata> user_side_metadata_;
+
   // For parameter instructions this field holds the parameter number.
   int64 parameter_number_ = 0;
 
diff --git a/tensorflow/compiler/xla/service/hlo_instruction_test.cc b/tensorflow/compiler/xla/service/hlo_instruction_test.cc
index a61c472c72..e91cf2076f 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction_test.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
 #include "tensorflow/compiler/xla/util.h"
 
 namespace xla {
@@ -1494,5 +1495,52 @@ TEST_F(HloInstructionTest, CanonnicalStringificationConditional) {
 })");
 }
 
+TEST_F(HloInstructionTest, CheckDeepClone) {
+  const char* const hlo_string = R"(
+HloModule Module
+
+addy (lhs: s32[], rhs: s32[]) -> s32[] {
+  lhs = s32[] parameter(0)
+  rhs = s32[] parameter(1)
+  ROOT zadd = s32[] add(lhs, rhs)
+}
+
+calla (x: s32[]) -> s32[] {
+  x = s32[] parameter(0)
+  reduce = s32[] reduce-window(x, x), to_apply=addy
+  ROOT xadd = s32[] add(x, reduce)
+}
+
+body (bparam: s32[]) -> s32[] {
+  constant = s32[] constant(1)
+  bparam = s32[] parameter(0)
+  v = s32[] call(bparam), to_apply=calla
+  ROOT add = s32[] add(constant, bparam)
+}
+
+condition (cparam: s32[]) -> pred[] {
+  xconstant = s32[] constant(5)
+  cparam = s32[] parameter(0)
+  ROOT greater-than = pred[] greater-than(xconstant, cparam)
+}
+
+ENTRY entry (param: s32[]) -> s32[] {
+  eparam = s32[] parameter(0)
+  ROOT while = s32[] while(eparam), condition=condition, body=body
+ }
+)";
+  // Check that cloning the module deep clones every computation and
+  // instruction, without leaving dangling pointers to the old module.
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          tools::Parse(hlo_string));
+  std::unique_ptr<HloModule> clone = module->Clone();
+  for (HloComputation* computation : clone->computations()) {
+    EXPECT_EQ(computation->parent(), clone.get());
+    for (HloInstruction* instruction : computation->instructions()) {
+      EXPECT_EQ(instruction->parent()->parent(), clone.get());
+    }
+  }
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_module.cc b/tensorflow/compiler/xla/service/hlo_module.cc
index fbf1d58007..e63424c2df 100644
--- a/tensorflow/compiler/xla/service/hlo_module.cc
+++ b/tensorflow/compiler/xla/service/hlo_module.cc
@@ -496,7 +496,18 @@ std::list<HloComputation*> HloModule::MakeComputationPostOrder() const {
       added_computations.insert(computation.get());
     }
   }
-  CHECK_EQ(post_order.size(), computations_.size());
+  if (post_order.size() != computations_.size()) {
+    for (HloComputation* computation : post_order) {
+      LOG(ERROR) << "Post Order: " << computation->name() << " ("
+                 << computation->parent()->name() << ")";
+    }
+    for (auto& computation : computations_) {
+      LOG(ERROR) << "Computations: " << computation->name() << " ("
+                 << computation->parent()->name() << ")";
+    }
+    LOG(FATAL) << "Mismatch computation count: post_order=" << post_order.size()
+               << " computation_count=" << computations_.size();
+  }
   return post_order;
 }
 
@@ -517,54 +528,25 @@ std::unique_ptr<HloModule> HloModule::Clone(const string& suffix) const {
   module->entry_computation_handle_ = entry_computation_handle_;
   module->has_entry_computation_handle_ = has_entry_computation_handle_;
 
-  std::unordered_map<HloComputation*, HloComputation*> clone_map;
-  for (auto& computation : computations_) {
-    if (computation->IsFusionComputation()) {
-      // Cloning of a fused computation is handled by its fusion instruction.
-      continue;
-    }
-
-    // When cloning a computation, pass in the new module, so that for any
-    // fusion instruction in this computation, the fused computation will be
-    // deep cloned to the new module.
-    auto cloned_computation = computation->Clone(suffix, module.get());
-    InsertOrDie(&clone_map, computation.get(), cloned_computation.get());
-
-    if (entry_computation_ == computation.get()) {
-      module->AddEntryComputation(std::move(cloned_computation));
-    } else {
-      module->AddEmbeddedComputation(std::move(cloned_computation));
-    }
-  }
-
-  for (auto& cloned_computation : module->computations_) {
-    for (auto* instruction : cloned_computation->instructions()) {
-      // Rewrite instruction's called_computation to point to the cloned
-      // computations.
-      instruction->ReplaceCalledComputations([&](HloComputation* hlo) {
-        if (hlo->IsFusionComputation()) {
-          // Cloning of a fused computation has already been handled when its
-          // fusion instruction is cloned. So this hlo computation is already
-          // the cloned one.
-          return hlo;
-        }
-        return FindOrDie(clone_map, hlo);
-      });
-    }
-  }
+  HloCloneContext context(module.get(), suffix);
+  auto cloned_computation = entry_computation_->Clone(suffix, &context);
+  module->AddEntryComputation(std::move(cloned_computation));
   return module;
 }
 
-HloComputation* HloModule::DeepCloneComputation(HloComputation* computation) {
-  HloComputation* clone = AddEmbeddedComputation(computation->Clone("", this));
-  TF_CHECK_OK(
-      clone->root_instruction()->Accept([this](HloInstruction* instruction) {
-        instruction->ReplaceCalledComputations([this](HloComputation* callee) {
-          return DeepCloneComputation(callee);
-        });
-        return Status::OK();
-      }));
-  return clone;
+HloComputation* HloModule::DeepCloneComputation(HloComputation* computation,
+                                                HloCloneContext* context) {
+  // Without a context there is no clone mapping to consult or to populate.
+  if (context == nullptr) {
+    return AddEmbeddedComputation(computation->Clone(""));
+  }
+  // Hand back the computation the context already holds for this one, if any.
+  HloComputation* known_clone = context->FindComputation(computation);
+  if (known_clone != nullptr) {
+    return known_clone;
+  }
+  return AddEmbeddedComputation(
+      computation->Clone(context->suffix(), context));
 }
 
 uint64 HloModule::RandomNew64() const {
diff --git a/tensorflow/compiler/xla/service/hlo_module.h b/tensorflow/compiler/xla/service/hlo_module.h
index 02918c3777..c93c74d34a 100644
--- a/tensorflow/compiler/xla/service/hlo_module.h
+++ b/tensorflow/compiler/xla/service/hlo_module.h
@@ -26,6 +26,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/iterator_util.h"
 #include "tensorflow/compiler/xla/service/hlo.pb.h"
+#include "tensorflow/compiler/xla/service/hlo_clone_context.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module_config.h"
@@ -94,8 +95,10 @@ class HloModule {
   std::unique_ptr<HloModule> Clone(const string& suffix = "clone") const;
 
   // Performs a deep clone of the computation, by recursively cloning all
-  // the called computations as well.
-  HloComputation* DeepCloneComputation(HloComputation* computation);
+  // the called computations as well. If the clone context is specified, it
+  // will be populated with the cloned object mappings.
+  HloComputation* DeepCloneComputation(HloComputation* computation,
+                                       HloCloneContext* context = nullptr);
 
   // Return a pointer to the entry computation of the module..
   const HloComputation* entry_computation() const {
diff --git a/tensorflow/compiler/xla/service/hlo_module_group_metadata.cc b/tensorflow/compiler/xla/service/hlo_module_group_metadata.cc
index b4cd3c730e..7d706b5fd0 100644
--- a/tensorflow/compiler/xla/service/hlo_module_group_metadata.cc
+++ b/tensorflow/compiler/xla/service/hlo_module_group_metadata.cc
@@ -87,6 +87,7 @@ Status HloModuleGroupMetadata::Build() {
           << "Peer instruction does not match the computation kind";
       TF_RETURN_IF_ERROR(
           AddCompanion(tracked->instruction(), peer_tracked->instruction()));
+      tracked_instructions_comms_[tracked->instruction()].push_back(hlo);
     }
 
     // Add the parents of companion instructions (they must be all of the same
@@ -116,23 +117,36 @@ Status HloModuleGroupMetadata::Build() {
 }
 
 Status HloModuleGroupMetadata::VerifyCompanionSets() const {
-  // TODO(dlibenzi): Migrate this to use the device instead of module ID, once
-  // the kDomain CL goes in.
   for (const auto& companions : companion_sets_) {
     // A companion set must be composed at most of an instruction per
     // device/module.
     std::unordered_set<int64> devices;
     for (HloInstruction* instruction : *companions) {
-      int64 device = GetModuleId(instruction->parent()->parent());
-      if (!devices.insert(device).second) {
-        std::stringstream ss;
-        ss << "Companion set:" << std::endl;
-        for (HloInstruction* hlo : *companions) {
-          ss << "  " << hlo->name() << " ("
-             << GetModuleId(hlo->parent()->parent()) << ")" << std::endl;
+      // Go through all the communicating instructions (send, recv) of the given
+      // companion, and record their device.
+      auto comms_it = tracked_instructions_comms_.find(instruction);
+      if (comms_it == tracked_instructions_comms_.end()) {
+        // A companion may have no communicating instructions recorded (e.g.
+        // one added only as the parent of other companions); skip it.
+        continue;
+      }
+      std::unordered_set<int64> comm_devices;
+      for (HloInstruction* comm_instruction : comms_it->second) {
+        auto device = GetInstructionDevice(*comm_instruction);
+        TF_RET_CHECK(device) << "Instruction " << comm_instruction->ToString()
+                             << " does not have a device";
+        comm_devices.insert(*device);
+      }
+      for (int64 device : comm_devices) {
+        if (!devices.insert(device).second) {
+          std::stringstream ss;
+          ss << "Companion set:" << std::endl;
+          for (HloInstruction* hlo : *companions) {
+            ss << "  " << hlo->name() << std::endl;
+          }
+          ss << "has multiple instructions on the same device";
+          return FailedPrecondition("%s", ss.str().c_str());
         }
-        ss << "has multiple instructions on the same device";
-        return FailedPrecondition("%s", ss.str().c_str());
       }
     }
   }
@@ -223,6 +232,21 @@ int64 HloModuleGroupMetadata::GetModuleId(const HloModule* module) const {
   LOG(FATAL) << "unknown module";
 }
 
+tensorflow::gtl::optional<int64> HloModuleGroupMetadata::GetInstructionDevice(
+    const HloInstruction& instruction) const {
+  // Module group metadata is built either for a single module spanning
+  // several devices, or for several modules with no explicit devices. Prefer
+  // the instruction's unique sharding device and fall back to the ID of its
+  // module. An optional is returned, although a value is always produced
+  // today, to account for cases where no device can be guessed; those are
+  // meant to be reported as errors by VerifyChannelInstructions().
+  if (auto sharding_device = instruction.sharding_unique_device()) {
+    return sharding_device;
+  }
+  return tensorflow::gtl::optional<int64>(
+      GetModuleId(instruction.parent()->parent()));
+}
+
 Status HloModuleGroupMetadata::RecordInstructions() {
   const auto visitor = [this](HloInstruction* hlo) -> Status {
     if (hlo->opcode() == HloOpcode::kWhile) {
@@ -346,26 +370,43 @@ Status HloModuleGroupMetadata::VerifyChannelInstructions() {
     if (!ShapeUtil::Compatible(send_shape, recv_shape)) {
       return FailedPrecondition("send/recv shapes do not match");
     }
-    const HloModule* send_module = channel.send->parent()->parent();
-    const HloModule* send_done_module = channel.send_done->parent()->parent();
-    if (send_module != send_done_module) {
+    // Every endpoint must resolve to a device before devices are compared.
+    auto send_device = GetInstructionDevice(*channel.send);
+    auto send_done_device = GetInstructionDevice(*channel.send_done);
+    if (!send_device) {
+      return FailedPrecondition("send instruction must have a device: %s",
+                                channel.send->ToString().c_str());
+    }
+    if (!send_done_device) {
+      return FailedPrecondition("send_done instruction must have a device: %s",
+                                channel.send_done->ToString().c_str());
+    }
+    if (*send_device != *send_done_device) {
       return FailedPrecondition(
           "send and send-done (channel=%lld) must be on the same device: %lld "
           "vs. %lld",
-          channel.id, GetModuleId(send_module), GetModuleId(send_done_module));
+          channel.id, *send_device, *send_done_device);
+    }
+    auto recv_device = GetInstructionDevice(*channel.recv);
+    auto recv_done_device = GetInstructionDevice(*channel.recv_done);
+    if (!recv_device) {
+      return FailedPrecondition("recv instruction must have a device: %s",
+                                channel.recv->ToString().c_str());
+    }
+    if (!recv_done_device) {
+      return FailedPrecondition("recv_done instruction must have a device: %s",
+                                channel.recv_done->ToString().c_str());
     }
-    const HloModule* recv_module = channel.recv->parent()->parent();
-    const HloModule* recv_done_module = channel.recv_done->parent()->parent();
-    if (recv_module != recv_done_module) {
+    if (*recv_device != *recv_done_device) {
       return FailedPrecondition(
           "recv and recv-done (channel=%lld) must be on the same device: %lld "
           "vs. %lld",
-          channel.id, GetModuleId(recv_module), GetModuleId(recv_done_module));
+          channel.id, *recv_device, *recv_done_device);
     }
-    if (send_module == recv_module) {
+    if (*send_device == *recv_device) {
       return FailedPrecondition(
           "send and recv (channel=%lld) must be on different devices: %lld",
-          channel.id, GetModuleId(send_module));
+          channel.id, *send_device);
     }
   }
 
diff --git a/tensorflow/compiler/xla/service/hlo_module_group_metadata.h b/tensorflow/compiler/xla/service/hlo_module_group_metadata.h
index 3ef4542f91..5f5bf27479 100644
--- a/tensorflow/compiler/xla/service/hlo_module_group_metadata.h
+++ b/tensorflow/compiler/xla/service/hlo_module_group_metadata.h
@@ -29,6 +29,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/flatmap.h"
+#include "tensorflow/core/lib/gtl/optional.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace xla {
@@ -148,6 +149,12 @@ class HloModuleGroupMetadata {
   // the module in the module vector.
   int64 GetModuleId(const HloModule* module) const;
 
+  // Retrieves the device an instruction is assigned to. Either from the
+  // sharding information, or from the ordinal of the module the instruction
+  // is in.
+  tensorflow::gtl::optional<int64> GetInstructionDevice(
+      const HloInstruction& instruction) const;
+
   // Returns the companion instructions for the given instruction.
   //
   // Precondition: IsCompanionWhile(instruction) is true.
@@ -231,6 +238,11 @@ class HloModuleGroupMetadata {
   tensorflow::gtl::FlatMap<const HloComputation*, TrackedInstruction>
       tracked_instructions_;
 
+  // Maps tracked instructions (kWhile, kConditional, kCall, ...) to the set of
+  // communicating instructions within the proper called computation(s).
+  tensorflow::gtl::FlatMap<HloInstruction*, std::vector<HloInstruction*>>
+      tracked_instructions_comms_;
+
   // All channels in the module.
   std::vector<Channel> channels_;
 
diff --git a/tensorflow/compiler/xla/service/hlo_opcode.h b/tensorflow/compiler/xla/service/hlo_opcode.h
index ac7cd2f2f5..1fe06ee0c0 100644
--- a/tensorflow/compiler/xla/service/hlo_opcode.h
+++ b/tensorflow/compiler/xla/service/hlo_opcode.h
@@ -69,6 +69,7 @@ namespace xla {
   V(kCrossReplicaSum, "cross-replica-sum")                   \
   V(kCustomCall, "custom-call")                              \
   V(kDivide, "divide")                                       \
+  V(kDomain, "domain")                                       \
   V(kDot, "dot")                                             \
   V(kDynamicSlice, "dynamic-slice")                          \
   V(kDynamicUpdateSlice, "dynamic-update-slice")             \
diff --git a/tensorflow/compiler/xla/service/hlo_sharding.cc b/tensorflow/compiler/xla/service/hlo_sharding.cc
index 7708422ce1..58224ef870 100644
--- a/tensorflow/compiler/xla/service/hlo_sharding.cc
+++ b/tensorflow/compiler/xla/service/hlo_sharding.cc
@@ -123,6 +123,24 @@ std::vector<int64> HloSharding::TileLimitForDevice(int64 device) const {
   return index;
 }
 
+StatusOr<ShapeTree<HloSharding>> HloSharding::AsShapeTree(
+    const Shape& shape) const {
+  if (!IsTuple()) {
+    return ShapeTree<HloSharding>(shape, *this);
+  }
+  ShapeTree<HloSharding> tree(shape, HloSharding::Replicate());
+  int64 num_leaves = tree.leaf_count();
+  TF_RET_CHECK(num_leaves == tuple_elements_.size())
+      << "Shape " << ShapeUtil::HumanString(shape) << " has " << num_leaves
+      << " leaf nodes while this sharding has " << tuple_elements_.size();
+  // Copy the tuple element shardings onto the leaves, in leaf order.
+  auto element = tuple_elements_.begin();
+  for (auto& leaf : tree.leaves()) {
+    leaf.second = *element++;
+  }
+  return std::move(tree);
+}
+
 StatusOr<int64> HloSharding::UniqueDevice() const {
   if (IsTuple()) {
     if (tuple_elements_.empty()) {
@@ -367,11 +385,8 @@ HloSharding HloSharding::GetSubSharding(const Shape& shape,
   Shape sub_shape = ShapeUtil::GetSubshape(shape, index);
   ShapeTree<HloSharding> sub_shape_tree(sub_shape, Replicate());
   sub_shape_tree.CopySubtreeFrom(GetAsShapeTree(shape), index, {});
-  if (ShapeUtil::IsTuple(sub_shape)) {
-    return Tuple(sub_shape_tree);
-  } else {
-    return sub_shape_tree.element({});
-  }
+  return ShapeUtil::IsTuple(sub_shape) ? Tuple(sub_shape_tree)
+                                       : sub_shape_tree.element(ShapeIndex({}));
 }
 
 std::ostream& operator<<(std::ostream& out, const HloSharding& sharding) {
diff --git a/tensorflow/compiler/xla/service/hlo_sharding.h b/tensorflow/compiler/xla/service/hlo_sharding.h
index e8bb06c8f7..f4a0fb626f 100644
--- a/tensorflow/compiler/xla/service/hlo_sharding.h
+++ b/tensorflow/compiler/xla/service/hlo_sharding.h
@@ -163,19 +163,9 @@ class HloSharding {
   // tuple, if IsTuple, or a ShapeTree with a single element containing this
   // sharding. Only the leaf elements are populated. This creates a new
   // ShapeTree object so is not cheap.
+  StatusOr<ShapeTree<HloSharding>> AsShapeTree(const Shape& shape) const;
   ShapeTree<HloSharding> GetAsShapeTree(const Shape& shape) const {
-    if (IsTuple()) {
-      ShapeTree<HloSharding> result(shape, HloSharding::Replicate());
-      CHECK_EQ(std::distance(result.leaf_begin(), result.leaf_end()),
-               tuple_elements_.size());
-      auto it = tuple_elements_.begin();
-      for (auto& index_to_sharding : result.leaves()) {
-        index_to_sharding.second = *it++;
-      }
-      return result;
-    } else {
-      return ShapeTree<HloSharding>(shape, *this);
-    }
+    return AsShapeTree(shape).ValueOrDie();
   }
 
   // Retrieves the sub sharding at a given index, out of a tuple sharding.
diff --git a/tensorflow/compiler/xla/service/hlo_sharding_metadata.cc b/tensorflow/compiler/xla/service/hlo_sharding_metadata.cc
new file mode 100644
index 0000000000..82cff2a4b7
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_sharding_metadata.cc
@@ -0,0 +1,401 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/hlo_sharding_metadata.h"
+
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/shape_tree.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+
+namespace xla {
+
+namespace {
+
+struct PassThrough {
+  PassThrough(HloInstruction* user, HloInstruction* operand)
+      : user(user), operand(operand) {}
+
+  HloInstruction* user = nullptr;
+  HloInstruction* operand = nullptr;
+};
+
+void SetDeviceSharding(HloInstruction* instruction, int64 device) {
+  VLOG(4) << "  " << instruction->name() << " to device " << device;
+  instruction->set_device_sharding(device);
+}
+
+tensorflow::gtl::optional<int64> ShardingUniqueDevice(
+    const HloSharding& sharding) {
+  // Only tile maximal shardings are asked for their unique device.
+  if (!sharding.IsTileMaximal()) {
+    return tensorflow::gtl::optional<int64>();
+  }
+  auto unique_device = sharding.UniqueDevice();
+  return unique_device.ok() ? unique_device.ValueOrDie()
+                            : tensorflow::gtl::optional<int64>();
+}
+
+bool ShardingMatches(const HloSharding& sharding1,
+                     const HloSharding& sharding2) {
+  // When both shardings resolve to a unique device, comparing the devices is
+  // enough.
+  const auto lhs_device = ShardingUniqueDevice(sharding1);
+  const auto rhs_device = lhs_device ? ShardingUniqueDevice(sharding2)
+                                     : tensorflow::gtl::optional<int64>();
+  if (lhs_device && rhs_device) {
+    return *lhs_device == *rhs_device;
+  }
+  // Anything else gets a full sharding compare.
+  return sharding1 == sharding2;
+}
+
+// Domains are never "empty" when they are first created, where an empty
+// domain is one in which a kDomain instruction has another kDomain instruction
+// of the same kind as its operand.
+// HLO optimization passes, however, can produce empty domains.
+// For example:
+//
+//  Domain(device=None, device=0) ->
+//    Tuple(device=0) ->
+//      GTE(device=0) ->
+//        Domain(device=0, device=None)
+//
+// In that case the tuple simplifier could create something like:
+//
+//  Domain(device=None, device=0) -> Domain(device=0, device=None)
+//
+// which is a so-called empty domain.
+// In the case above, the empty domain was transiting through device 0, so
+// the normalization phase has to fix it up by adding back a Tuple+GTE pair
+// with the proper device.
+// One case where empty domains are a problem is the result of the entry
+// computation, where the GTE assignments are used by TF to tell XLA where
+// the results should be sent.
+std::vector<PassThrough> LocatePassThroughDomainLinks(
+    const DomainMetadata::Domain& domain) {
+  std::vector<PassThrough> pass_through;
+  for (HloInstruction* instruction : domain.enter_domains) {
+    CHECK(instruction->opcode() == HloOpcode::kDomain)
+        << "Instruction is not a kDomain: " << instruction->ToString();
+    for (HloInstruction* user : instruction->users()) {
+      if (user->opcode() == HloOpcode::kDomain &&
+          domain.exit_domains.count(user) != 0) {
+        pass_through.emplace_back(user, instruction);
+        VLOG(2) << "Found passthrough domain link:";
+        VLOG(2) << "  " << user->ToString();
+        VLOG(2) << "  " << instruction->ToString();
+      }
+    }
+  }
+  return pass_through;
+}
+
+Status FixupPassThroughDomainLinks(const DomainMetadata::Domain& domain,
+                                   const HloSharding& sharding) {
+  for (auto& link : LocatePassThroughDomainLinks(domain)) {
+    HloInstruction* operand = link.operand;
+    HloComputation* computation = operand->parent();
+    HloInstruction* tuple =
+        computation->AddInstruction(HloInstruction::CreateTuple({operand}));
+    HloInstruction* gte = computation->AddInstruction(
+        HloInstruction::CreateGetTupleElement(operand->shape(), tuple, 0));
+    gte->set_sharding(sharding);
+    TF_RETURN_IF_ERROR(operand->ReplaceUseWith(link.user, gte));
+  }
+  return Status::OK();
+}
+
+std::unique_ptr<HloSharding> CloneShardingForDomain(
+    const HloSharding& sharding) {
+  // A unique device sharding is rebuilt with HloSharding::AssignDevice().
+  if (auto unique_device = ShardingUniqueDevice(sharding)) {
+    return MakeUnique<HloSharding>(HloSharding::AssignDevice(*unique_device));
+  }
+  return MakeUnique<HloSharding>(sharding);
+}
+
+Status ApplyDomainDeviceSharding(const DomainMetadata::Domain& domain,
+                                 int64 device) {
+  VLOG(4) << "Applying device " << device << " sharding";
+  for (HloInstruction* hlo : domain.instructions) {
+    // Instructions that already carry a sharding are left untouched, since
+    // an HLO pass which knows about that sharding could otherwise be upset.
+    if (hlo->has_sharding()) {
+      VLOG(4) << "  " << hlo->name() << " already has sharding "
+              << hlo->sharding();
+      continue;
+    }
+    SetDeviceSharding(hlo, device);
+  }
+  return Status::OK();
+}
+
+// Returns the sharding of a tuple shaped instruction as a ShapeTree. When the
+// instruction has no sharding, a ShapeTree built with
+// HloSharding::Replicate() is returned instead.
+ShapeTree<HloSharding> GetTupleSharding(HloInstruction* tuple) {
+  if (!tuple->has_sharding()) {
+    return ShapeTree<HloSharding>(tuple->shape(), HloSharding::Replicate());
+  }
+  return tuple->sharding().GetAsShapeTree(tuple->shape());
+}
+
+// Returns the sharding to use for operand when asked by a user instruction
+// within domain: the sharding argument when operand is one of the domain's
+// enter_domains kDomain instructions, else the operand's own sharding if any.
+const HloSharding* GetOperandSharding(const HloInstruction* operand,
+                                      const DomainMetadata::Domain& domain,
+                                      const HloSharding& sharding) {
+  DCHECK_EQ(domain.reach_set.count(const_cast<HloInstruction*>(operand)), 1);
+  // The user of operand is within the domain instruction set, so a kDomain
+  // operand that belongs to this domain is found in the enter_domains set.
+  const bool is_enter_domain =
+      operand->opcode() == HloOpcode::kDomain &&
+      domain.enter_domains.count(const_cast<HloInstruction*>(operand)) != 0;
+  if (is_enter_domain) {
+    // operand is a kDomain of the currently processed domain, so sharding
+    // can be referred to as the domain sharding.
+    return &sharding;
+  }
+  return operand->has_sharding() ? &operand->sharding() : nullptr;
+}
+
+// Tries to propagate sharding into the instructions of the domain, in a post
+// order manner (operands to users). Returns how many instructions got one.
+StatusOr<int64> ApplyDomainShardingPass(const DomainMetadata::Domain& domain,
+                                        const HloSharding& sharding) {
+  int64 assigned = 0;
+  for (HloInstruction* instruction : domain.instructions) {
+    if (instruction->has_sharding()) {
+      continue;
+    }
+    if (instruction->opcode() == HloOpcode::kGetTupleElement) {
+      HloInstruction* tuple = instruction->mutable_operand(0);
+      const HloSharding* tuple_sharding =
+          GetOperandSharding(tuple, domain, sharding);
+      if (tuple_sharding != nullptr) {
+        TF_RET_CHECK(tuple_sharding->IsTuple()) << tuple->ToString();
+        HloSharding sub_sharding = tuple_sharding->GetSubSharding(
+            tuple->shape(), {instruction->tuple_index()});
+        VLOG(4) << "  " << instruction->name() << " to sharding "
+                << sub_sharding;
+        instruction->set_sharding(sub_sharding);
+        ++assigned;
+      }
+    } else if (instruction->opcode() == HloOpcode::kTuple) {
+      int64 tuple_assigned = 0;
+      ShapeTree<HloSharding> shape_tree = GetTupleSharding(instruction);
+      for (int64 i = 0; i < instruction->operand_count(); ++i) {
+        const HloSharding* operand_sharding =
+            GetOperandSharding(instruction->operand(i), domain, sharding);
+        if (operand_sharding != nullptr &&
+            shape_tree.element({i}) != *operand_sharding) {
+          *shape_tree.mutable_element({i}) = *operand_sharding;
+          ++tuple_assigned;
+        }
+      }
+      if (tuple_assigned > 0) {
+        HloSharding tuple_sharding = HloSharding::Tuple(shape_tree);
+        VLOG(4) << "  " << instruction->name() << " to sharding "
+                << tuple_sharding;
+        instruction->set_sharding(tuple_sharding);
+        ++assigned;
+      }
+    } else {
+      // If all the operands that have a sharding agree on it, assign that
+      // same sharding to this instruction as well.
+      const HloSharding* common_sharding = nullptr;
+      for (const HloInstruction* operand : instruction->operands()) {
+        const HloSharding* operand_sharding =
+            GetOperandSharding(operand, domain, sharding);
+        if (operand_sharding != nullptr) {
+          if (common_sharding != nullptr &&
+              *common_sharding != *operand_sharding) {
+            common_sharding = nullptr;
+            break;
+          }
+          common_sharding = operand_sharding;
+        }
+      }
+      if (common_sharding != nullptr) {
+        VLOG(4) << "  " << instruction->name() << " to sharding "
+                << *common_sharding;
+        instruction->set_sharding(*common_sharding);
+        ++assigned;
+      }
+    }
+  }
+  return assigned;
+}
+
+Status ApplyDomainSharding(const DomainMetadata::Domain& domain,
+                           const HloSharding& sharding) {
+  auto device = ShardingUniqueDevice(sharding);
+  if (device) {
+    // Shortcut the simple case. We have a unique device sharding, so we call
+    // the ApplyDomainDeviceSharding() API which will apply array or tuple
+    // shaped device sharding to the domain instructions.
+    return ApplyDomainDeviceSharding(domain, *device);
+  }
+  VLOG(1) << "Assigning non-trivial sharding " << sharding;
+  for (;;) {
+    TF_ASSIGN_OR_RETURN(int64 assigned,
+                        ApplyDomainShardingPass(domain, sharding));
+    if (assigned == 0) {
+      break;
+    }
+  }
+  int64 unassigned = 0;
+  for (HloInstruction* instruction : domain.instructions) {
+    if (!instruction->has_sharding()) {
+      LOG(WARNING) << "Unassigned instruction: " << instruction->ToString();
+      ++unassigned;
+    }
+  }
+  // Should we error out if unassigned > 0?
+  return Status::OK();
+}
+
+// Creates a kDomain instruction to be placed between instruction and operand.
+// The kDomain instruction will be created only if the sharding differs between
+// the instruction and the operand.
+std::unique_ptr<HloInstruction> CreateDomain(HloInstruction* instruction,
+                                             HloInstruction* operand) {
+  const HloSharding* instruction_sharding =
+      instruction->has_sharding() ? &instruction->sharding() : nullptr;
+  const HloSharding* operand_sharding =
+      operand->has_sharding() ? &operand->sharding() : nullptr;
+  // No need for domain if they both have no sharding.
+  if (instruction_sharding == nullptr && operand_sharding == nullptr) {
+    return nullptr;
+  }
+  // No need for domain if they match.
+  if (instruction_sharding != nullptr && operand_sharding != nullptr &&
+      ShardingMatches(*instruction_sharding, *operand_sharding)) {
+    return nullptr;
+  }
+  std::unique_ptr<HloSharding> real_instruction_sharding;
+  std::unique_ptr<HloSharding> real_operand_sharding;
+  if (instruction_sharding != nullptr) {
+    real_instruction_sharding = CloneShardingForDomain(*instruction_sharding);
+  }
+  if (operand_sharding != nullptr) {
+    real_operand_sharding = CloneShardingForDomain(*operand_sharding);
+  }
+  VLOG(3) << "Creating domain:";
+  VLOG(3) << "  Instruction: " << instruction->name();
+  VLOG(3) << "  Operand: " << operand->name();
+  VLOG(3) << "    User side sharding: "
+          << (real_instruction_sharding != nullptr
+                  ? real_instruction_sharding->ToString()
+                  : "None");
+  VLOG(3) << "    Operand side sharding: "
+          << (real_operand_sharding != nullptr
+                  ? real_operand_sharding->ToString()
+                  : "None");
+
+  std::unique_ptr<DomainMetadata> operand_side_metadata =
+      MakeUnique<ShardingMetadata>(std::move(real_operand_sharding));
+  std::unique_ptr<DomainMetadata> user_side_metadata =
+      MakeUnique<ShardingMetadata>(std::move(real_instruction_sharding));
+  return HloInstruction::CreateDomain(operand->shape(), operand,
+                                      std::move(operand_side_metadata),
+                                      std::move(user_side_metadata));
+}
+
+StatusOr<std::unique_ptr<HloSharding>> ExtractOriginalCommonSharding(
+    tensorflow::gtl::ArraySlice<HloInstruction*> instructions) {
+  // If we are here, all the instructions being passed had the same sharding
+  // (or no sharding), by means of the ShardingMatches() API.
+  // As such, no kDomain was inserted, and here we are asked to extract the
+  // original common sharding.
+  // All the instructions passed to this API are part of the same computation.
+  const HloSharding* sharding = nullptr;
+  for (HloInstruction* instruction : instructions) {
+    if (instruction->has_sharding()) {
+      if (sharding == nullptr) {
+        sharding = &instruction->sharding();
+      } else {
+        TF_RET_CHECK(ShardingMatches(*sharding, instruction->sharding()))
+            << "Sharding " << *sharding << " does not match the one in "
+            << instruction->ToString();
+      }
+    }
+  }
+  if (sharding == nullptr) {
+    return std::unique_ptr<HloSharding>();
+  }
+  VLOG(4) << "Extracted sharding is " << *sharding;
+  return CloneShardingForDomain(*sharding);
+}
+
+}  // namespace
+
+std::unique_ptr<DomainMetadata> ShardingMetadata::Clone() const {
+  std::unique_ptr<HloSharding> sharding;
+  if (sharding_ != nullptr) {
+    sharding = MakeUnique<HloSharding>(*sharding_);
+  }
+  return MakeUnique<ShardingMetadata>(std::move(sharding));
+}
+
+bool ShardingMetadata::Matches(const DomainMetadata& other) const {
+  const ShardingMetadata* other_ptr =
+      dynamic_cast<const ShardingMetadata*>(&other);
+  if (other_ptr == nullptr) {
+    // If other is not a ShardingMetadata, then it is clearly not a match.
+    return false;
+  }
+  if (sharding_ == nullptr) {
+    return other_ptr->sharding_ == nullptr;
+  }
+  return other_ptr->sharding_ != nullptr
+             ? ShardingMatches(*sharding_, *other_ptr->sharding_)
+             : false;
+}
+
+string ShardingMetadata::ToString() const {
+  return sharding_ != nullptr ? sharding_->ToString() : "None";
+}
+
+Status ShardingMetadata::NormalizeInstructions(
+    const DomainMetadata::Domain& domain) const {
+  if (sharding_ != nullptr) {
+    VLOG(4) << "Normalizing sharding to " << sharding_->ToString() << ":";
+    TF_RETURN_IF_ERROR(ApplyDomainSharding(domain, *sharding_));
+    TF_RETURN_IF_ERROR(FixupPassThroughDomainLinks(domain, *sharding_));
+  }
+  return Status::OK();
+}
+
+Status NormalizeShardingDomain(const DomainMetadata::Domain& domain) {
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<HloSharding> sharding,
+                      ExtractOriginalCommonSharding(domain.instructions));
+  if (sharding != nullptr) {
+    VLOG(4) << "Normalizing sharding-less domain to " << sharding->ToString()
+            << ":";
+    TF_RETURN_IF_ERROR(ApplyDomainSharding(domain, *sharding));
+  } else {
+    VLOG(1) << "Unable to find common sharding";
+  }
+  return Status::OK();
+}
+
+std::unique_ptr<HloInstruction> CreateShardingDomain(
+    HloInstruction* instruction, HloInstruction* operand) {
+  return CreateDomain(instruction, operand);
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_sharding_metadata.h b/tensorflow/compiler/xla/service/hlo_sharding_metadata.h
new file mode 100644
index 0000000000..ec162c3490
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_sharding_metadata.h
@@ -0,0 +1,67 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_SHARDING_METADATA_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_SHARDING_METADATA_H_
+
+#include "tensorflow/compiler/xla/service/hlo_domain_metadata.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_sharding.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+
+namespace xla {
+
+// A DomainMetadata implementation that internally wraps a sharding attribute.
+class ShardingMetadata : public DomainMetadata {
+ public:
+  explicit ShardingMetadata(std::unique_ptr<HloSharding> sharding)
+      : sharding_(std::move(sharding)) {}
+
+  std::unique_ptr<DomainMetadata> Clone() const override;
+
+  tensorflow::StringPiece Kind() const override { return KindName(); }
+
+  bool Matches(const DomainMetadata& other) const override;
+
+  string ToString() const override;
+
+  Status NormalizeInstructions(
+      const DomainMetadata::Domain& domain) const override;
+
+  static tensorflow::StringPiece KindName() { return "sharding"; }
+
+ private:
+  std::unique_ptr<HloSharding> sharding_;
+};
+
+// Within a set of instructions which had common sharding attributes before
+// entering the HLO passes pipeline, apply sharding heuristics and normalize the
+// instructions whose sharding deviates from the one which is inferred to be
+// the original one.
+// Policy wise, HLO passes are allowed to create new unassigned instructions,
+// but if they do create assigned ones, they have to conform to the ones around.
+Status NormalizeShardingDomain(const DomainMetadata::Domain& domain);
+
+// Given an HLO graph edge between instruction and one of its operands, creates
+// a ShardingMetadata based kDomain instruction if the sharding between
+// instruction and operand changes. Returns nullptr if there is no need for a
+// domain separation.
+std::unique_ptr<HloInstruction> CreateShardingDomain(
+    HloInstruction* instruction, HloInstruction* operand);
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_SHARDING_METADATA_H_
diff --git a/tensorflow/compiler/xla/service/hlo_verifier.cc b/tensorflow/compiler/xla/service/hlo_verifier.cc
index 7d6d0d9eaf..9cfd8a9bf7 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier.cc
+++ b/tensorflow/compiler/xla/service/hlo_verifier.cc
@@ -376,6 +376,7 @@ Status CheckMixedPrecisionOperands(const HloInstruction* instruction) {
     case HloOpcode::kConstant:
     case HloOpcode::kCrossReplicaSum:
     case HloOpcode::kCustomCall:
+    case HloOpcode::kDomain:
     case HloOpcode::kFusion:
     case HloOpcode::kGetTupleElement:
     case HloOpcode::kInfeed:
diff --git a/tensorflow/compiler/xla/service/instruction_fusion.cc b/tensorflow/compiler/xla/service/instruction_fusion.cc
index 1912b8f2c7..429c850343 100644
--- a/tensorflow/compiler/xla/service/instruction_fusion.cc
+++ b/tensorflow/compiler/xla/service/instruction_fusion.cc
@@ -118,6 +118,7 @@ bool IsAlwaysDuplicable(const HloInstruction& instruction) {
     case HloOpcode::kCrossReplicaSum:
     case HloOpcode::kCustomCall:
     case HloOpcode::kDivide:
+    case HloOpcode::kDomain:
     case HloOpcode::kDot:
     case HloOpcode::kExp:
     case HloOpcode::kExpm1:
diff --git a/tensorflow/compiler/xla/service/logical_buffer_analysis.cc b/tensorflow/compiler/xla/service/logical_buffer_analysis.cc
index 6aca6ba385..f410921b4b 100644
--- a/tensorflow/compiler/xla/service/logical_buffer_analysis.cc
+++ b/tensorflow/compiler/xla/service/logical_buffer_analysis.cc
@@ -125,6 +125,12 @@ Status LogicalBufferAnalysis::HandleBitcast(HloInstruction*) {
   return Status::OK();
 }
 
+Status LogicalBufferAnalysis::HandleDomain(HloInstruction*) {
+  // A kDomain instruction aliases its operand. That is, the buffer of its
+  // result *is* the buffer of its operand.
+  return Status::OK();
+}
+
 Status LogicalBufferAnalysis::HandleRecvDone(HloInstruction*) {
   // RecvDone doesn't create a new buffer but rather aliases its input (Recv)
   // tuple element at {0} to its output.
diff --git a/tensorflow/compiler/xla/service/logical_buffer_analysis.h b/tensorflow/compiler/xla/service/logical_buffer_analysis.h
index f4c63dd86b..b5ef396787 100644
--- a/tensorflow/compiler/xla/service/logical_buffer_analysis.h
+++ b/tensorflow/compiler/xla/service/logical_buffer_analysis.h
@@ -59,6 +59,7 @@ class LogicalBufferAnalysis : public DfsHloVisitorWithDefault {
   Status HandleTuple(HloInstruction* tuple) override;
   Status HandleGetTupleElement(HloInstruction* get_tuple_element) override;
   Status HandleBitcast(HloInstruction* bitcast) override;
+  Status HandleDomain(HloInstruction* domain) override;
   Status HandleCopy(HloInstruction* copy) override;
   Status HandleRecvDone(HloInstruction* recv_done) override;
   Status HandleSend(HloInstruction* send) override;
diff --git a/tensorflow/compiler/xla/service/shape_inference.cc b/tensorflow/compiler/xla/service/shape_inference.cc
index 3500978bdd..d624f548b1 100644
--- a/tensorflow/compiler/xla/service/shape_inference.cc
+++ b/tensorflow/compiler/xla/service/shape_inference.cc
@@ -316,7 +316,8 @@ StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
 /* static */ StatusOr<Shape> ShapeInference::InferUnaryOpShape(
     HloOpcode opcode, const Shape& shape) {
   // There is no copy operation at the proto level, so handle copy explicitly.
-  if (opcode == HloOpcode::kCopy) {
+  // A domain shape is the same as the input one.
+  if (opcode == HloOpcode::kCopy || opcode == HloOpcode::kDomain) {
     return shape;
   }
 
diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc b/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc
index 8cb654493c..bb634e6573 100644
--- a/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc
+++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc
@@ -273,6 +273,14 @@ Status TuplePointsToAnalysis::HandleBitcast(HloInstruction* bitcast) {
   return Status::OK();
 }
 
+Status TuplePointsToAnalysis::HandleDomain(HloInstruction* domain) {
+  // A kDomain instruction aliases its operand. That is, the buffer of its
+  // result *is* the buffer of its operand, so just copy the operand's points-to
+  // set.
+  CreateCopiedPointsToSet(domain, domain->operand(0));
+  return Status::OK();
+}
+
 Status TuplePointsToAnalysis::HandleSlice(HloInstruction* slice) {
   // A kSlice instruction aliases its operand if the backend lowers it to an
   // in-place implementation.
diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis.h b/tensorflow/compiler/xla/service/tuple_points_to_analysis.h
index 1ac7130136..c0d8241480 100644
--- a/tensorflow/compiler/xla/service/tuple_points_to_analysis.h
+++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis.h
@@ -248,6 +248,7 @@ class TuplePointsToAnalysis : public DfsHloVisitorWithDefault {
   Status HandleTuple(HloInstruction* tuple) override;
   Status HandleGetTupleElement(HloInstruction* get_tuple_element) override;
   Status HandleBitcast(HloInstruction* bitcast) override;
+  Status HandleDomain(HloInstruction* domain) override;
   Status HandleSlice(HloInstruction* slice) override;
   Status HandleCopy(HloInstruction* copy) override;
   Status HandleRecvDone(HloInstruction* recv_done) override;
diff --git a/tensorflow/compiler/xla/shape_tree.h b/tensorflow/compiler/xla/shape_tree.h
index 37c94ac543..5b14953ebb 100644
--- a/tensorflow/compiler/xla/shape_tree.h
+++ b/tensorflow/compiler/xla/shape_tree.h
@@ -222,6 +222,9 @@ class ShapeTree {
                     /*iterate_leaves_only=*/false);
   }
 
+  // Returns the number of leaf nodes in the tree.
+  int64 leaf_count() const { return std::distance(leaf_begin(), leaf_end()); }
+
   // Recursively traverses the shape and calls the given function at each
   // element. The function has the following arguments:
   //
diff --git a/tensorflow/compiler/xla/shape_util.cc b/tensorflow/compiler/xla/shape_util.cc
index 2cdee30340..e8a28d76e9 100644
--- a/tensorflow/compiler/xla/shape_util.cc
+++ b/tensorflow/compiler/xla/shape_util.cc
@@ -880,6 +880,27 @@ bool ShapeUtil::IsLeafIndex(const Shape& shape, const ShapeIndex& index) {
   return !IsTuple(GetSubshape(shape, index));
 }
 
+/* static */ int64 ShapeUtil::GetLeafCount(const Shape& shape) {
+  int64 count = 0;
+  ForEachSubshape(shape, [&](const Shape&, const ShapeIndex& index) {
+    if (IsLeafIndex(shape, index)) {
+      ++count;
+    }
+  });
+  return count;
+}
+
+/* static */ std::vector<ShapeUtil::IndexedShape> ShapeUtil::GetLeafShapes(
+    const Shape& shape) {
+  std::vector<IndexedShape> leaves;
+  ForEachSubshape(shape, [&](const Shape& sub_shape, const ShapeIndex& index) {
+    if (IsLeafIndex(shape, index)) {
+      leaves.emplace_back(index, sub_shape);
+    }
+  });
+  return leaves;
+}
+
 /* static */ Shape ShapeUtil::StripDegenerateDimensions(const Shape& shape) {
   std::vector<int64> dimension_sizes;
   std::vector<int64> degenerate_dimensions;
diff --git a/tensorflow/compiler/xla/shape_util.h b/tensorflow/compiler/xla/shape_util.h
index cf40068b33..9df31d5d21 100644
--- a/tensorflow/compiler/xla/shape_util.h
+++ b/tensorflow/compiler/xla/shape_util.h
@@ -154,6 +154,16 @@ std::ostream& operator<<(std::ostream& out, const ShapeIndexView& shape_index);
 // properties, which do invariant checks before / after the operation.
 class ShapeUtil {
  public:
+  // Data structure which describes the coordinates and the shape of a tuple
+  // shaped sub-shape.
+  struct IndexedShape {
+    IndexedShape() = default;
+    IndexedShape(ShapeIndex index, Shape shape)
+        : index(std::move(index)), shape(std::move(shape)) {}
+    ShapeIndex index;
+    Shape shape;
+  };
+
   // Returns the number of elements are contained within the provided shape;
   // e.g. for rank 0 (scalars) the result is always 1. Note that sparse shapes
   // may not actually be able to store this number of elements. See
@@ -465,6 +475,13 @@ class ShapeUtil {
   // shape.
   static bool IsLeafIndex(const Shape& shape, const ShapeIndex& index);
 
+  // Returns the number of leaves in the shape.
+  static int64 GetLeafCount(const Shape& shape);
+
+  // Retrieves all the leaf shapes and their indexes, in the order walked by
+  // the ForEachSubshape() API.
+  static std::vector<IndexedShape> GetLeafShapes(const Shape& shape);
+
   // Calls the given visitor function for each subshape of the given shape.
   // Subshapes are visited in DFS pre-order starting with the entire shape
   // (index {}).
diff --git a/tensorflow/compiler/xla/tools/parser/hlo_parser.cc b/tensorflow/compiler/xla/tools/parser/hlo_parser.cc
index 76c870bc98..134978d21f 100644
--- a/tensorflow/compiler/xla/tools/parser/hlo_parser.cc
+++ b/tensorflow/compiler/xla/tools/parser/hlo_parser.cc
@@ -486,6 +486,7 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
     case HloOpcode::kClz:
     case HloOpcode::kCopy:
     case HloOpcode::kCos:
+    case HloOpcode::kDomain:
     case HloOpcode::kExp:
     case HloOpcode::kExpm1:
     case HloOpcode::kImag:
-- 
GitLab


From 38aef1315cb5bf1936e979a59cd5977c1eacd9df Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 29 May 2018 21:39:20 -0700
Subject: [PATCH 041/610] internal cleanup

PiperOrigin-RevId: 198504528
---
 tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc b/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc
index 73d941e5e9..98cc31f18d 100644
--- a/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc
+++ b/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc
@@ -38,6 +38,7 @@ namespace {
 using ::tensorflow::io::JoinPath;
 using ::tensorflow::protobuf::util::JsonOptions;
 using ::tensorflow::protobuf::util::MessageToJsonString;
+using ::tensorflow::str_util::EndsWith;
 using ::tensorflow::strings::StrCat;
 
 constexpr char kGraphRunPrefix[] = "tpu_profiler.hlo_graph.";
@@ -46,6 +47,9 @@ constexpr char kJsonTraceFileName[] = "trace.json.gz";
 constexpr char kProfilePluginDirectory[] = "plugins/profile/";
 constexpr char kProtoTraceFileName[] = "trace";
 
+constexpr char kFlatProfilerFileName[] = "flat_profiler.pb";
+constexpr char kTfStatsHelperSuffix[] = "tf_stats_helper_result";
+
 Status WriteGzippedDataToFile(const string& filename, const string& data) {
   std::unique_ptr<WritableFile> file;
   TF_RETURN_IF_ERROR(Env::Default()->NewWritableFile(filename, &file));
@@ -107,6 +111,10 @@ Status DumpToolDataToLogDirectory(StringPiece run_dir,
                                   const string& host_prefix,
                                   const tensorflow::ProfileToolData& tool,
                                   std::ostream* os) {
+  // Don't save the intermediate results for combining the per host tool data.
+  if (EndsWith(tool.name(), kFlatProfilerFileName) ||
+      EndsWith(tool.name(), kTfStatsHelperSuffix))
+    return Status::OK();
   string path = JoinPath(run_dir, StrCat(host_prefix, tool.name()));
   TF_RETURN_IF_ERROR(WriteStringToFile(Env::Default(), path, tool.data()));
   if (os) {
-- 
GitLab


From 73026bf564407c3f28607eb3e0c73e0b60eaf69c Mon Sep 17 00:00:00 2001
From: Sami Kama <skama@nvidia.com>
Date: Tue, 29 May 2018 22:22:25 -0700
Subject: [PATCH 042/610] Improve log messages and fix input ordering

---
 .../contrib/tensorrt/convert/convert_nodes.cc | 34 +++++++++++++++----
 1 file changed, 27 insertions(+), 7 deletions(-)

diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
index 16bfcc32a3..4026ad75fa 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
@@ -2212,9 +2212,11 @@ tensorflow::Status ConvertCalibrationNodeToEngineNode(
       LOG(WARNING) << " couldn't find output node " << out_node_name;
     }
   }
-  VLOG(1) << "Input Nodes:";
-  for (auto& i : input_names) {
-    VLOG(1) << " " << i << " in graph " << node_maps.count(i);
+  if (VLOG_IS_ON(1)) {
+    VLOG(1) << c_node->name() << " Input Nodes:";
+    for (auto& i : input_names) {
+      VLOG(1) << " Input " << i << " in graph " << node_maps.count(i);
+    }
   }
   auto trt_rm = tensorflow::tensorrt::TRTResourceManager::instance();
   auto resmgr = trt_rm->getManager("TRTCalibOps");
@@ -2248,14 +2250,24 @@ tensorflow::Status ConvertCalibrationNodeToEngineNode(
   calib_res->builder_ = nullptr;
   tensorflow::NodeDefBuilder op_builder(engine_name, "TRTEngineOp");
   std::vector<tensorflow::NodeDefBuilder::NodeOut> income_edges;
+  income_edges.resize(c_node->num_inputs());
   for (const auto in_edge : c_node->in_edges()) {
     auto src = in_edge->src();
     int dest_port = in_edge->dst_input();
-    income_edges.emplace_back(src->name(), in_edge->src_output(),
-                              c_node->input_type(dest_port));
+    VLOG(1) << "Incoming connection " << src->name() << ":"
+            << in_edge->src_output() << " -> " << c_node->name() << ":"
+            << dest_port;
+    income_edges.at(dest_port) = {src->name(), in_edge->src_output(),
+                                  c_node->input_type(dest_port)};
   }
   tensorflow::gtl::ArraySlice<tensorflow::NodeDefBuilder::NodeOut> input_list(
       income_edges);
+  if (VLOG_IS_ON(2)) {
+    for (const auto& inp : input_list) {
+      VLOG(2) << " Input from inputlist " << inp.node << ":" << inp.index << " "
+              << tensorflow::DataTypeString(inp.data_type);
+    }
+  }
   op_builder.Input(input_list);
   tensorflow::NodeDef engine_node;
   const char* engine_plan_data = static_cast<const char*>(engine_plan->data());
@@ -2280,11 +2292,19 @@ tensorflow::Status ConvertCalibrationNodeToEngineNode(
     string s(i->src()->name());
     if (i->src_output()) StrAppend(&s, ":", i->src_output());
     int out_port = port_map.at(s);
-    VLOG(1) << "Connecting " << trt_engine_node->name() << " port " << out_port
-            << " with " << i->dst()->name() << " port " << i->dst_input();
+    VLOG(1) << "Connecting " << trt_engine_node->name() << ":" << out_port
+            << " -> " << i->dst()->name() << ":" << i->dst_input();
     TF_RETURN_IF_ERROR(
         graph.UpdateEdge(trt_engine_node, out_port, i->dst(), i->dst_input()));
   }
+  for (const auto ed : trt_engine_node->in_edges()) {
+    VLOG(0) << "In Edge  " << ed->src()->name() << ":" << ed->src_output()
+            << " -> " << ed->dst()->name() << ":" << ed->dst_input();
+  }
+  for (const auto ed : trt_engine_node->out_edges()) {
+    VLOG(0) << "Out Edge " << ed->src()->name() << ":" << ed->src_output()
+            << " -> " << ed->dst()->name() << ":" << ed->dst_input();
+  }
   VLOG(1) << "Segment nodes:";
   for (auto& i : segment_nodes) {
     VLOG(1) << " " << i << " in graph " << node_maps.count(i);
-- 
GitLab


From 94898251aa7116774f788b5b6c9c9a618c13cea0 Mon Sep 17 00:00:00 2001
From: Smit Hinsu <hinsu@google.com>
Date: Tue, 29 May 2018 23:52:59 -0700
Subject: [PATCH 043/610] Fix GPU build on windows

PiperOrigin-RevId: 198513480
---
 tensorflow/stream_executor/cuda/cuda_driver.cc | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tensorflow/stream_executor/cuda/cuda_driver.cc b/tensorflow/stream_executor/cuda/cuda_driver.cc
index 09e9f9f758..d508f6594a 100644
--- a/tensorflow/stream_executor/cuda/cuda_driver.cc
+++ b/tensorflow/stream_executor/cuda/cuda_driver.cc
@@ -21,7 +21,6 @@ limitations under the License.
 #include <set>
 #include <utility>
 
-#include "cuda/include/cuda_runtime.h"
 #include "tensorflow/stream_executor/cuda/cuda_diagnostics.h"
 #include "tensorflow/stream_executor/lib/casts.h"
 #include "tensorflow/stream_executor/lib/env.h"
-- 
GitLab


From 28e694db5b549e1ec1e6a7c38fda053c31a87ccb Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 30 May 2018 00:06:26 -0700
Subject: [PATCH 044/610] Improve error message when a missing feature name is
 passed as a unicode string.

PiperOrigin-RevId: 198514621
---
 tensorflow/python/feature_column/feature_column.py      | 2 +-
 tensorflow/python/feature_column/feature_column_test.py | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/feature_column/feature_column.py b/tensorflow/python/feature_column/feature_column.py
index ffcb9990d5..7aa46af828 100644
--- a/tensorflow/python/feature_column/feature_column.py
+++ b/tensorflow/python/feature_column/feature_column.py
@@ -2163,7 +2163,7 @@ class _LazyBuilder(object):
       self._feature_tensors[key] = feature_tensor
       return feature_tensor
 
-    if isinstance(key, str):
+    if isinstance(key, six.string_types):
       raise ValueError('Feature {} is not in features dictionary.'.format(key))
 
     if not isinstance(key, _FeatureColumn):
diff --git a/tensorflow/python/feature_column/feature_column_test.py b/tensorflow/python/feature_column/feature_column_test.py
index f9206f4f38..0af7b9baa9 100644
--- a/tensorflow/python/feature_column/feature_column_test.py
+++ b/tensorflow/python/feature_column/feature_column_test.py
@@ -137,6 +137,9 @@ class LazyColumnTest(test.TestCase):
     with self.assertRaisesRegexp(ValueError,
                                  'bbb is not in features dictionary'):
       builder.get('bbb')
+    with self.assertRaisesRegexp(ValueError,
+                                 'bbb is not in features dictionary'):
+      builder.get(u'bbb')
 
   def test_not_supported_feature_column(self):
 
-- 
GitLab


From bca9ebc670544ea169651200b34f9dc3cda44eb8 Mon Sep 17 00:00:00 2001
From: Brian Patton <bjp@google.com>
Date: Wed, 30 May 2018 00:14:37 -0700
Subject: [PATCH 045/610] Adds GPU kernel registration for igamma, igammac.

Switches use_gpu=True to force_gpu=True for cwise_ops_test.

PiperOrigin-RevId: 198515293
---
 tensorflow/core/kernels/cwise_op_igammas.cc   |  4 ++
 .../python/kernel_tests/cwise_ops_test.py     | 46 ++++++++++++-------
 2 files changed, 34 insertions(+), 16 deletions(-)

diff --git a/tensorflow/core/kernels/cwise_op_igammas.cc b/tensorflow/core/kernels/cwise_op_igammas.cc
index a1d7f4dad4..4b5f888bc1 100644
--- a/tensorflow/core/kernels/cwise_op_igammas.cc
+++ b/tensorflow/core/kernels/cwise_op_igammas.cc
@@ -18,4 +18,8 @@ limitations under the License.
 namespace tensorflow {
 REGISTER2(BinaryOp, CPU, "Igamma", functor::igamma, float, double);
 REGISTER2(BinaryOp, CPU, "Igammac", functor::igammac, float, double);
+#if GOOGLE_CUDA
+REGISTER2(BinaryOp, GPU, "Igamma", functor::igamma, float, double);
+REGISTER2(BinaryOp, GPU, "Igammac", functor::igammac, float, double);
+#endif
 }  // namespace tensorflow
diff --git a/tensorflow/python/kernel_tests/cwise_ops_test.py b/tensorflow/python/kernel_tests/cwise_ops_test.py
index 87da89831c..1128cd7a63 100644
--- a/tensorflow/python/kernel_tests/cwise_ops_test.py
+++ b/tensorflow/python/kernel_tests/cwise_ops_test.py
@@ -27,6 +27,7 @@ from tensorflow.python.framework import dtypes as dtypes_lib
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_math_ops
 from tensorflow.python.ops import gradient_checker
@@ -152,7 +153,7 @@ class UnaryOpTest(test.TestCase):
 
   def _compareGpu(self, x, np_func, tf_func):
     np_ans = np_func(x)
-    with self.test_session(use_gpu=True):
+    with self.test_session(force_gpu=test_util.is_gpu_available()):
       result = tf_func(ops.convert_to_tensor(x))
       tf_gpu = result.eval()
     if x.dtype == np.float16:
@@ -164,7 +165,7 @@ class UnaryOpTest(test.TestCase):
   def _compareSparseGpu(self, x, np_func, tf_func, tol):
     x_sp, x_sp_vals = _sparsify(x)
     res_np = np_func(x_sp_vals)
-    with self.test_session(use_gpu=True):
+    with self.test_session(force_gpu=test_util.is_gpu_available()):
       self._check(tf_func(x_sp), res_np, x_sp, tol)
 
   def _compareBoth(self, x, np_func, tf_func):
@@ -630,7 +631,7 @@ class BinaryOpTest(test.TestCase):
 
   def _compareGpu(self, x, y, np_func, tf_func):
     np_ans = np_func(x, y)
-    with self.test_session(use_gpu=True):
+    with self.test_session(force_gpu=test_util.is_gpu_available()):
       inx = ops.convert_to_tensor(x)
       iny = ops.convert_to_tensor(y)
       out = tf_func(inx, iny)
@@ -1203,7 +1204,7 @@ class BinaryOpTest(test.TestCase):
 class ComparisonOpTest(test.TestCase):
 
   def _compareScalar(self, func, x, y, dtype):
-    with self.test_session(use_gpu=True):
+    with self.test_session(force_gpu=test_util.is_gpu_available()):
       out = func(
           ops.convert_to_tensor(np.array([x]).astype(dtype)),
           ops.convert_to_tensor(np.array([y]).astype(dtype)))
@@ -1236,7 +1237,7 @@ class ComparisonOpTest(test.TestCase):
 
   def _compare(self, x, y, np_func, tf_func):
     np_ans = np_func(x, y)
-    with self.test_session(use_gpu=True):
+    with self.test_session(force_gpu=test_util.is_gpu_available()):
       out = tf_func(ops.convert_to_tensor(x), ops.convert_to_tensor(y))
       tf_ans = out.eval()
     self.assertAllEqual(np_ans, tf_ans)
@@ -1337,7 +1338,8 @@ class LogicalOpTest(test.TestCase):
 
   def _compareBinary(self, x, y, np_func, tf_func, use_gpu=False):
     np_ans = np_func(x, y)
-    with self.test_session(use_gpu=use_gpu):
+    with self.test_session(use_gpu=use_gpu,
+                           force_gpu=use_gpu and test_util.is_gpu_available()):
       inx = ops.convert_to_tensor(x)
       iny = ops.convert_to_tensor(y)
       out = tf_func(inx, iny)
@@ -1348,7 +1350,8 @@ class LogicalOpTest(test.TestCase):
 
   def _not(self, x, use_gpu=False):
     np_ans = np.logical_not(x)
-    with self.test_session(use_gpu=use_gpu):
+    with self.test_session(use_gpu=use_gpu,
+                           force_gpu=use_gpu and test_util.is_gpu_available()):
       out = math_ops.logical_not(ops.convert_to_tensor(x))
       tf_val = out.eval()
     self.assertEqual(out.dtype, dtypes_lib.bool)
@@ -1433,7 +1436,8 @@ class SelectOpTest(test.TestCase):
 
   def _compare(self, c, x, y, use_gpu):
     np_ans = np.where(c, x, y)
-    with self.test_session(use_gpu=use_gpu):
+    with self.test_session(use_gpu=use_gpu,
+                           force_gpu=use_gpu and test_util.is_gpu_available()):
       out = array_ops.where(c, x, y)
       tf_ans = out.eval()
     self.assertAllEqual(np_ans, tf_ans)
@@ -1576,7 +1580,8 @@ class BatchSelectOpTest(test.TestCase):
     np_ans = np.dstack(
         [x_i if c_i else y_i for c_i, x_i, y_i in zip(c, x, y)]).transpose(
             [2, 0, 1])
-    with self.test_session(use_gpu=use_gpu):
+    with self.test_session(use_gpu=use_gpu,
+                           force_gpu=use_gpu and test_util.is_gpu_available()):
       out = array_ops.where(c, x, y)
       tf_ans = out.eval()
     self.assertAllEqual(np_ans, tf_ans)
@@ -1681,7 +1686,9 @@ class MinMaxOpTest(test.TestCase):
 
   def _compare(self, x, y, use_gpu):
     np_min, np_max = np.minimum(x, y), np.maximum(x, y)
-    with self.test_session(use_gpu=use_gpu) as sess:
+    with self.test_session(
+        use_gpu=use_gpu,
+        force_gpu=use_gpu and test_util.is_gpu_available()) as sess:
       inx = ops.convert_to_tensor(x)
       iny = ops.convert_to_tensor(y)
       omin, omax = math_ops.minimum(inx, iny), math_ops.maximum(inx, iny)
@@ -1843,7 +1850,9 @@ class IsFiniteInfNanTest(test.TestCase):
 
   def _compare(self, x, use_gpu):
     np_finite, np_inf, np_nan = np.isfinite(x), np.isinf(x), np.isnan(x)
-    with self.test_session(use_gpu=use_gpu) as sess:
+    with self.test_session(
+        use_gpu=use_gpu,
+        force_gpu=use_gpu and test_util.is_gpu_available()) as sess:
       inx = ops.convert_to_tensor(x)
       ofinite, oinf, onan = math_ops.is_finite(inx), math_ops.is_inf(
           inx), math_ops.is_nan(inx)
@@ -1884,7 +1893,7 @@ class IsFiniteInfNanTest(test.TestCase):
           x = np.full((size,), value, dtype=dtype)
           np_y = np.sqrt(x)
           np_nan = np.isnan(np_y)
-          with self.test_session(use_gpu=True):
+          with self.test_session(force_gpu=test_util.is_gpu_available()):
             tf_y = math_ops.sqrt(x)
             tf_nan = math_ops.is_nan(tf_y)
             if value < 0:
@@ -1939,7 +1948,8 @@ class ComplexMakeRealImagTest(test.TestCase):
 
   def _compareMake(self, real, imag, use_gpu):
     np_ans = real + (1j) * imag
-    with self.test_session(use_gpu=use_gpu):
+    with self.test_session(use_gpu=use_gpu,
+                           force_gpu=use_gpu and test_util.is_gpu_available()):
       real = ops.convert_to_tensor(real)
       imag = ops.convert_to_tensor(imag)
       tf_ans = math_ops.complex(real, imag)
@@ -1958,7 +1968,8 @@ class ComplexMakeRealImagTest(test.TestCase):
   def _compareRealImag(self, cplx, use_gpu):
     np_real, np_imag = np.real(cplx), np.imag(cplx)
     np_zeros = np_real * 0
-    with self.test_session(use_gpu=use_gpu):
+    with self.test_session(use_gpu=use_gpu,
+                           force_gpu=use_gpu and test_util.is_gpu_available()):
       inx = ops.convert_to_tensor(cplx)
       tf_real = math_ops.real(inx)
       tf_imag = math_ops.imag(inx)
@@ -1985,7 +1996,9 @@ class ComplexMakeRealImagTest(test.TestCase):
 
   def _compareAngle(self, cplx, use_gpu):
     np_angle = np.angle(cplx)
-    with self.test_session(use_gpu=use_gpu) as sess:
+    with self.test_session(
+        use_gpu=use_gpu,
+        force_gpu=use_gpu and test_util.is_gpu_available()) as sess:
       inx = ops.convert_to_tensor(cplx)
       tf_angle = math_ops.angle(inx)
       tf_angle_val = sess.run(tf_angle)
@@ -2019,7 +2032,8 @@ class ComplexMakeRealImagTest(test.TestCase):
 
   def _compareConj(self, cplx, use_gpu):
     np_ans = np.conj(cplx)
-    with self.test_session(use_gpu=use_gpu):
+    with self.test_session(use_gpu=use_gpu,
+                           force_gpu=use_gpu and test_util.is_gpu_available()):
       inx = ops.convert_to_tensor(cplx)
       tf_conj = math_ops.conj(inx)
       tf_ans = tf_conj.eval()
-- 
GitLab


From 786ad688b7378aac40be8c785f7e69a0b0fb0223 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 30 May 2018 00:58:29 -0700
Subject: [PATCH 046/610] Remove unused Make variables from tf_py_wrap_cc.

PiperOrigin-RevId: 198518885
---
 tensorflow/tensorflow.bzl | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl
index d71fd71bbd..522965990b 100644
--- a/tensorflow/tensorflow.bzl
+++ b/tensorflow/tensorflow.bzl
@@ -1353,12 +1353,6 @@ register_extension_info(
     label_regex_for_dep = "{extension_name}",
 )
 
-def tf_extension_linkopts():
-  return []  # No extension link opts
-
-def tf_extension_copts():
-  return []  # No extension c opts
-
 # In tf_py_wrap_cc generated libraries
 # module init functions are not exported unless
 # they contain one of the keywords in the version file
@@ -1459,10 +1453,10 @@ def tf_py_wrap_cc(name,
   tf_cc_shared_object(
       name=cc_library_name,
       srcs=[module_name + ".cc"],
-      copts=(copts + if_not_windows([
+      copts=copts + if_not_windows([
           "-Wno-self-assign", "-Wno-sign-compare", "-Wno-write-strings"
-      ]) + tf_extension_copts()),
-      linkopts=tf_extension_linkopts() + extra_linkopts,
+      ]),
+      linkopts=extra_linkopts,
       linkstatic=1,
       deps=deps + extra_deps,
       **kwargs)
-- 
GitLab


From 1d2b40c2fd00acc2262554d3bf6e7368125db25b Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 30 May 2018 07:13:33 -0700
Subject: [PATCH 047/610] beautify test output file name.

PiperOrigin-RevId: 198555383
---
 tensorflow/contrib/lite/testing/generate_examples.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/lite/testing/generate_examples.py b/tensorflow/contrib/lite/testing/generate_examples.py
index 0e036bda92..13fafebd1d 100644
--- a/tensorflow/contrib/lite/testing/generate_examples.py
+++ b/tensorflow/contrib/lite/testing/generate_examples.py
@@ -385,7 +385,7 @@ def make_zip_of_tests(zip_path,
   for parameters in test_parameters:
     keys = parameters.keys()
     for curr in itertools.product(*parameters.values()):
-      label = zip_path.replace(".zip", "") + (",".join(
+      label = zip_path.replace(".zip", "_") + (",".join(
           "%s=%r" % z for z in sorted(zip(keys, curr))).replace(" ", ""))
       if label[0] == "/":
         label = label[1:]
-- 
GitLab


From bc8bc83b593754bf3c56c67d4cf972386b7a2937 Mon Sep 17 00:00:00 2001
From: Mustafa Ispir <ispir@google.com>
Date: Wed, 30 May 2018 08:00:34 -0700
Subject: [PATCH 048/610] internal

PiperOrigin-RevId: 198560342
---
 tensorflow/contrib/estimator/BUILD | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/contrib/estimator/BUILD b/tensorflow/contrib/estimator/BUILD
index d5d2abf8c4..47c7b7fc19 100644
--- a/tensorflow/contrib/estimator/BUILD
+++ b/tensorflow/contrib/estimator/BUILD
@@ -340,6 +340,7 @@ py_test(
     size = "medium",
     srcs = ["python/estimator/hooks_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["notsan"],
     deps = [
         ":hooks",
         "//tensorflow/python:client_testlib",
-- 
GitLab


From 7a002241a81925dca83e3447e766e2b60fabe77e Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Wed, 30 May 2018 15:36:40 +0000
Subject: [PATCH 049/610] Add normalizer_fn support for sequence_numeric_column

This fix tries to address the issue raised in 19628,
where there was no normalizer_fn support for sequence_numeric_column
(unlike numeric_column). This fix adds the normalizer_fn support.

This fix fixes 19628.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 .../feature_column/sequence_feature_column.py     | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py
index 555beddeaa..ec16b461af 100644
--- a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py
+++ b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py
@@ -346,7 +346,8 @@ def sequence_numeric_column(
     key,
     shape=(1,),
     default_value=0.,
-    dtype=dtypes.float32):
+    dtype=dtypes.float32,
+    normalizer_fn=None):
   """Returns a feature column that represents sequences of numeric data.
 
   Example:
@@ -383,12 +384,15 @@ def sequence_numeric_column(
   if not (dtype.is_integer or dtype.is_floating):
     raise ValueError('dtype must be convertible to float. '
                      'dtype: {}, key: {}'.format(dtype, key))
+  if normalizer_fn is not None and not callable(normalizer_fn):
+    raise TypeError('normalizer_fn must be a callable. Given: {}'.format(normalizer_fn))
 
   return _SequenceNumericColumn(
       key,
       shape=shape,
       default_value=default_value,
-      dtype=dtype)
+      dtype=dtype,
+      normalizer_fn=normalizer_fn)
 
 
 def _assert_all_equal_and_return(tensors, name=None):
@@ -407,7 +411,7 @@ class _SequenceNumericColumn(
     fc._SequenceDenseColumn,
     collections.namedtuple(
         '_SequenceNumericColumn',
-        ['key', 'shape', 'default_value', 'dtype'])):
+        ['key', 'shape', 'default_value', 'dtype', 'normalizer_fn'])):
   """Represents sequences of numeric data."""
 
   @property
@@ -419,7 +423,10 @@ class _SequenceNumericColumn(
     return {self.key: parsing_ops.VarLenFeature(self.dtype)}
 
   def _transform_feature(self, inputs):
-    return inputs.get(self.key)
+    input_tensor = inputs.get(self.key)
+    if self.normalizer_fn is not None:
+      input_tensor = self.normalizer_fn(input_tensor)
+    return input_tensor
 
   @property
   def _variable_shape(self):
-- 
GitLab


From 2469ba8003194f92829f4119718f9ce2efd9eae9 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Wed, 30 May 2018 15:39:21 +0000
Subject: [PATCH 050/610] Update docstring for sequence_feature_column

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 .../python/feature_column/sequence_feature_column.py        | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py
index ec16b461af..2bca906b7f 100644
--- a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py
+++ b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py
@@ -371,6 +371,12 @@ def sequence_numeric_column(
     default_value: A single value compatible with `dtype` that is used for
       padding the sparse data into a dense `Tensor`.
     dtype: The type of values.
+    normalizer_fn: If not `None`, a function that can be used to normalize the
+      value of the tensor after `default_value` is applied for parsing.
+      Normalizer function takes the input `Tensor` as its argument, and returns
+      the output `Tensor`. (e.g. lambda x: (x - 3.0) / 4.2). Please note that
+      even though the most common use case of this function is normalization, it
+      can be used for any kind of Tensorflow transformations.
 
   Returns:
     A `_SequenceNumericColumn`.
-- 
GitLab


From a8873e090ef42e20be925821d4942b2cbba44382 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Wed, 30 May 2018 15:39:41 +0000
Subject: [PATCH 051/610] Add test case for normalizer_fn support with
 sequence_feature_column

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 .../sequence_feature_column_test.py           | 40 +++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_test.py b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_test.py
index 88f5d53516..57682c488e 100644
--- a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_test.py
+++ b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_test.py
@@ -28,6 +28,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.ops import sparse_ops
 from tensorflow.python.platform import test
 from tensorflow.python.training import monitored_session
 
@@ -670,6 +671,7 @@ class SequenceNumericColumnTest(test.TestCase):
     self.assertEqual((1,), a.shape)
     self.assertEqual(0., a.default_value)
     self.assertEqual(dtypes.float32, a.dtype)
+    self.assertIsNone(a.normalizer_fn)
 
   def test_shape_saved_as_tuple(self):
     a = sfc.sequence_numeric_column('aaa', shape=[1, 2])
@@ -688,6 +690,10 @@ class SequenceNumericColumnTest(test.TestCase):
         ValueError, 'dtype must be convertible to float'):
       sfc.sequence_numeric_column('aaa', dtype=dtypes.string)
 
+  def test_normalizer_fn_must_be_callable(self):
+    with self.assertRaisesRegexp(TypeError, 'must be a callable'):
+      sfc.sequence_numeric_column('aaa', normalizer_fn='NotACallable')
+
   def test_get_sequence_dense_tensor(self):
     sparse_input = sparse_tensor.SparseTensorValue(
         # example 0, values [[0.], [1]]
@@ -708,6 +714,40 @@ class SequenceNumericColumnTest(test.TestCase):
       self.assertAllEqual(
           expected_dense_tensor, dense_tensor.eval(session=sess))
 
+  def test_get_sequence_dense_tensor_with_normalizer_fn(self):
+
+    def _increment_two(input_sparse_tensor):
+      return sparse_ops.sparse_add(
+          input_sparse_tensor,
+          sparse_tensor.SparseTensor(((0, 0), (1, 1)), (2.0, 2.0), (2, 2))
+      )
+
+    sparse_input = sparse_tensor.SparseTensorValue(
+        # example 0, values [[0.], [1]]
+        # example 1, [[10.]]
+        indices=((0, 0), (0, 1), (1, 0)),
+        values=(0., 1., 10.),
+        dense_shape=(2, 2))
+
+    # Before _increment_two:
+    #   [[0.], [1.]],
+    #   [[10.], [0.]],
+    # After _increment_two:
+    #   [[2.], [1.]],
+    #   [[10.], [2.]],
+    expected_dense_tensor = [
+        [[2.], [1.]],
+        [[10.], [2.]],
+    ]
+    numeric_column = sfc.sequence_numeric_column('aaa', normalizer_fn=_increment_two)
+
+    dense_tensor, _ = numeric_column._get_sequence_dense_tensor(
+        _LazyBuilder({'aaa': sparse_input}))
+
+    with monitored_session.MonitoredSession() as sess:
+      self.assertAllEqual(
+          expected_dense_tensor, dense_tensor.eval(session=sess))
+
   def test_get_sequence_dense_tensor_with_shape(self):
     """Tests get_sequence_dense_tensor with shape !=(1,)."""
     sparse_input = sparse_tensor.SparseTensorValue(
-- 
GitLab


From e87cfa2600bf5117befb16a72f05642d967eb77d Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Wed, 30 May 2018 15:41:41 +0000
Subject: [PATCH 052/610] Pylint fix

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 .../python/feature_column/sequence_feature_column.py           | 3 ++-
 .../python/feature_column/sequence_feature_column_test.py      | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py
index 2bca906b7f..b588f75efe 100644
--- a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py
+++ b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py
@@ -391,7 +391,8 @@ def sequence_numeric_column(
     raise ValueError('dtype must be convertible to float. '
                      'dtype: {}, key: {}'.format(dtype, key))
   if normalizer_fn is not None and not callable(normalizer_fn):
-    raise TypeError('normalizer_fn must be a callable. Given: {}'.format(normalizer_fn))
+    raise TypeError(
+        'normalizer_fn must be a callable. Given: {}'.format(normalizer_fn))
 
   return _SequenceNumericColumn(
       key,
diff --git a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_test.py b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_test.py
index 57682c488e..89b5f4c413 100644
--- a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_test.py
+++ b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_test.py
@@ -739,7 +739,8 @@ class SequenceNumericColumnTest(test.TestCase):
         [[2.], [1.]],
         [[10.], [2.]],
     ]
-    numeric_column = sfc.sequence_numeric_column('aaa', normalizer_fn=_increment_two)
+    numeric_column = sfc.sequence_numeric_column(
+        'aaa', normalizer_fn=_increment_two)
 
     dense_tensor, _ = numeric_column._get_sequence_dense_tensor(
         _LazyBuilder({'aaa': sparse_input}))
-- 
GitLab


From 34635a4d461657f1aa7c38f6f6db080c9af84b3b Mon Sep 17 00:00:00 2001
From: Jiri Simsa <jsimsa@google.com>
Date: Wed, 30 May 2018 08:53:16 -0700
Subject: [PATCH 053/610] [tf.data] Adding a concurrency stress test for
 `map_and_batch`.

PiperOrigin-RevId: 198566777
---
 .../kernel_tests/batch_dataset_op_test.py     | 38 +++++++++++++++++++
 1 file changed, 38 insertions(+)

diff --git a/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py
index 2568b899d7..e309d611e1 100644
--- a/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py
@@ -552,6 +552,44 @@ class BatchDatasetTest(test.TestCase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(next_element)
 
+  def testMapAndBatchParallelGetNext(self):
+    iterator = (dataset_ops.Dataset.range(500000)
+                .apply(batching.map_and_batch(lambda x: x, batch_size=100))
+                .make_one_shot_iterator())
+    elements = []
+    for _ in range(100):
+      elements.append(iterator.get_next())
+    with self.test_session() as sess:
+      for i in range(50):
+        got = sess.run(elements)
+        got.sort(key=lambda x: x[0])
+        expected = []
+        for j in range(100):
+          expected.append(range(i*10000+j*100, i*10000+(j+1)*100))
+        self.assertAllEqual(got, expected)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(elements)
+
+  def testMapAndBatchParallelGetNextDropRemainder(self):
+    iterator = (
+        dataset_ops.Dataset.range(499999).apply(
+            batching.map_and_batch(
+                lambda x: x, batch_size=100, drop_remainder=True))
+        .make_one_shot_iterator())
+    elements = []
+    for _ in range(100):
+      elements.append(iterator.get_next())
+    with self.test_session() as sess:
+      for i in range(49):
+        got = sess.run(elements)
+        got.sort(key=lambda x: x[0])
+        expected = []
+        for j in range(100):
+          expected.append(range(i*10000+j*100, i*10000+(j+1)*100))
+        self.assertAllEqual(got, expected)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(elements)
+
   def testMapAndBatchSparse(self):
 
     def _sparse(i):
-- 
GitLab


From 6c582c5b087de1329febcecc4556d812acd5e511 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 30 May 2018 09:14:12 -0700
Subject: [PATCH 054/610] Adding tf.name_scope blocks to make the TensorBoard
 graph visualization usable.

PiperOrigin-RevId: 198569786
---
 .../python/ops/factorization_ops.py           | 99 ++++++++++---------
 1 file changed, 52 insertions(+), 47 deletions(-)

diff --git a/tensorflow/contrib/factorization/python/ops/factorization_ops.py b/tensorflow/contrib/factorization/python/ops/factorization_ops.py
index 5cef4068ed..09745e2de5 100644
--- a/tensorflow/contrib/factorization/python/ops/factorization_ops.py
+++ b/tensorflow/contrib/factorization/python/ops/factorization_ops.py
@@ -265,11 +265,14 @@ class WALSModel(object):
         "col_factors")
     self._row_gramian = self._create_gramian(self._n_components, "row_gramian")
     self._col_gramian = self._create_gramian(self._n_components, "col_gramian")
-    self._row_update_prep_gramian = self._prepare_gramian(
-        self._col_factors, self._col_gramian)
-    self._col_update_prep_gramian = self._prepare_gramian(
-        self._row_factors, self._row_gramian)
-    self._create_transient_vars()
+    with ops.name_scope("row_prepare_gramian"):
+      self._row_update_prep_gramian = self._prepare_gramian(
+          self._col_factors, self._col_gramian)
+    with ops.name_scope("col_prepare_gramian"):
+      self._col_update_prep_gramian = self._prepare_gramian(
+          self._row_factors, self._row_gramian)
+    with ops.name_scope("transient_vars"):
+      self._create_transient_vars()
 
   @property
   def row_factors(self):
@@ -310,36 +313,37 @@ class WALSModel(object):
   @classmethod
   def _create_factors(cls, rows, cols, num_shards, init, name):
     """Helper function to create row and column factors."""
-    if callable(init):
-      init = init()
-    if isinstance(init, list):
-      assert len(init) == num_shards
-    elif isinstance(init, str) and init == "random":
-      pass
-    elif num_shards == 1:
-      init = [init]
-    sharded_matrix = []
-    sizes = cls._shard_sizes(rows, num_shards)
-    assert len(sizes) == num_shards
-
-    def make_initializer(i, size):
-
-      def initializer():
-        if init == "random":
-          return random_ops.random_normal([size, cols])
-        else:
-          return init[i]
+    with ops.name_scope(name):
+      if callable(init):
+        init = init()
+      if isinstance(init, list):
+        assert len(init) == num_shards
+      elif isinstance(init, str) and init == "random":
+        pass
+      elif num_shards == 1:
+        init = [init]
+      sharded_matrix = []
+      sizes = cls._shard_sizes(rows, num_shards)
+      assert len(sizes) == num_shards
+
+      def make_initializer(i, size):
+
+        def initializer():
+          if init == "random":
+            return random_ops.random_normal([size, cols])
+          else:
+            return init[i]
 
-      return initializer
+        return initializer
 
-    for i, size in enumerate(sizes):
-      var_name = "%s_shard_%d" % (name, i)
-      var_init = make_initializer(i, size)
-      sharded_matrix.append(
-          variable_scope.variable(
-              var_init, dtype=dtypes.float32, name=var_name))
+      for i, size in enumerate(sizes):
+        var_name = "%s_shard_%d" % (name, i)
+        var_init = make_initializer(i, size)
+        sharded_matrix.append(
+            variable_scope.variable(
+                var_init, dtype=dtypes.float32, name=var_name))
 
-    return sharded_matrix
+      return sharded_matrix
 
   @classmethod
   def _create_weights(cls, wt_init, num_wts, num_shards, name):
@@ -380,25 +384,26 @@ class WALSModel(object):
     sizes = cls._shard_sizes(num_wts, num_shards)
     assert len(sizes) == num_shards
 
-    def make_wt_initializer(i, size):
+    with ops.name_scope(name):
+      def make_wt_initializer(i, size):
 
-      def initializer():
-        if init_mode == "scalar":
-          return wt_init * array_ops.ones([size])
-        else:
-          return wt_init[i]
+        def initializer():
+          if init_mode == "scalar":
+            return wt_init * array_ops.ones([size])
+          else:
+            return wt_init[i]
 
-      return initializer
+        return initializer
 
-    sharded_weight = []
-    for i, size in enumerate(sizes):
-      var_name = "%s_shard_%d" % (name, i)
-      var_init = make_wt_initializer(i, size)
-      sharded_weight.append(
-          variable_scope.variable(
-              var_init, dtype=dtypes.float32, name=var_name))
+      sharded_weight = []
+      for i, size in enumerate(sizes):
+        var_name = "%s_shard_%d" % (name, i)
+        var_init = make_wt_initializer(i, size)
+        sharded_weight.append(
+            variable_scope.variable(
+                var_init, dtype=dtypes.float32, name=var_name))
 
-    return sharded_weight
+      return sharded_weight
 
   @staticmethod
   def _create_gramian(n_components, name):
-- 
GitLab


From 5eb510994043d1342170f657860196be0b7ed15c Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 30 May 2018 09:39:57 -0700
Subject: [PATCH 055/610] KL divergence for two Dirichlet distributions.

PiperOrigin-RevId: 198573236
---
 .../distributions/dirichlet_test.py           | 35 +++++++++
 .../python/ops/distributions/dirichlet.py     | 78 +++++++++++++++++++
 2 files changed, 113 insertions(+)

diff --git a/tensorflow/python/kernel_tests/distributions/dirichlet_test.py b/tensorflow/python/kernel_tests/distributions/dirichlet_test.py
index 3bcfae0deb..bcec6ef610 100644
--- a/tensorflow/python/kernel_tests/distributions/dirichlet_test.py
+++ b/tensorflow/python/kernel_tests/distributions/dirichlet_test.py
@@ -26,6 +26,7 @@ from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.distributions import dirichlet as dirichlet_lib
+from tensorflow.python.ops.distributions import kullback_leibler
 from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging
 
@@ -39,6 +40,7 @@ def try_import(name):  # pylint: disable=invalid-name
   return module
 
 
+special = try_import("scipy.special")
 stats = try_import("scipy.stats")
 
 
@@ -262,6 +264,39 @@ class DirichletTest(test.TestCase):
                   a=1., b=2.).cdf)[0],
           0.01)
 
+  def testDirichletDirichletKL(self):
+    conc1 = np.array([[1., 2., 3., 1.5, 2.5, 3.5],
+                      [1.5, 2.5, 3.5, 4.5, 5.5, 6.5]])
+    conc2 = np.array([[0.5, 1., 1.5, 2., 2.5, 3.]])
+
+    d1 = dirichlet_lib.Dirichlet(conc1)
+    d2 = dirichlet_lib.Dirichlet(conc2)
+    x = d1.sample(int(1e4), seed=0)
+    kl_sample = math_ops.reduce_mean(d1.log_prob(x) - d2.log_prob(x), 0)
+    kl_actual = kullback_leibler.kl_divergence(d1, d2)
+
+    kl_sample_val = self.evaluate(kl_sample)
+    kl_actual_val = self.evaluate(kl_actual)
+
+    self.assertEqual(conc1.shape[:-1], kl_actual.get_shape())
+
+    if not special:
+      return
+
+    kl_expected = (
+        special.gammaln(np.sum(conc1, -1))
+        - special.gammaln(np.sum(conc2, -1))
+        - np.sum(special.gammaln(conc1) - special.gammaln(conc2), -1)
+        + np.sum((conc1 - conc2) * (special.digamma(conc1) - special.digamma(
+            np.sum(conc1, -1, keepdims=True))), -1))
+
+    self.assertAllClose(kl_expected, kl_actual_val, atol=0., rtol=1e-6)
+    self.assertAllClose(kl_sample_val, kl_actual_val, atol=0., rtol=1e-1)
+
+    # Make sure KL(d1||d1) is 0
+    kl_same = self.evaluate(kullback_leibler.kl_divergence(d1, d1))
+    self.assertAllClose(kl_same, np.zeros_like(kl_expected))
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/ops/distributions/dirichlet.py b/tensorflow/python/ops/distributions/dirichlet.py
index 1ab58c1450..72567e62f7 100644
--- a/tensorflow/python/ops/distributions/dirichlet.py
+++ b/tensorflow/python/ops/distributions/dirichlet.py
@@ -28,6 +28,7 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import special_math_ops
 from tensorflow.python.ops.distributions import distribution
+from tensorflow.python.ops.distributions import kullback_leibler
 from tensorflow.python.ops.distributions import util as distribution_util
 from tensorflow.python.util.tf_export import tf_export
 
@@ -297,3 +298,80 @@ class Dirichlet(distribution.Distribution):
             math_ops.reduce_sum(x, -1),
             message="sample last-dimension must sum to `1`"),
     ], x)
+
+
+@kullback_leibler.RegisterKL(Dirichlet, Dirichlet)
+def _kl_dirichlet_dirichlet(d1, d2, name=None):
+  """Batchwise KL divergence KL(d1 || d2) with d1 and d2 Dirichlet.
+
+  Args:
+    d1: instance of a Dirichlet distribution object.
+    d2: instance of a Dirichlet distribution object.
+    name: (optional) Name to use for created operations.
+      default is "kl_dirichlet_dirichlet".
+
+  Returns:
+    Batchwise KL(d1 || d2)
+  """
+  with ops.name_scope(name, "kl_dirichlet_dirichlet", values=[
+      d1.concentration, d2.concentration]):
+    # The KL between Dirichlet distributions can be derived as follows. We have
+    #
+    #   Dir(x; a) = 1 / B(a) * prod_i[x[i]^(a[i] - 1)]
+    #
+    # where B(a) is the multivariate Beta function:
+    #
+    #   B(a) = Gamma(a[1]) * ... * Gamma(a[n]) / Gamma(a[1] + ... + a[n])
+    #
+    # The KL is
+    #
+    #   KL(Dir(x; a), Dir(x; b)) = E_Dir(x; a){log(Dir(x; a) / Dir(x; b))}
+    #
+    # so we'll need to know the log density of the Dirichlet. This is
+    #
+    #   log(Dir(x; a)) = sum_i[(a[i] - 1) log(x[i])] - log B(a)
+    #
+    # The only term that matters for the expectations is the log(x[i]). To
+    # compute the expectation of this term over the Dirichlet density, we can
+    # use the following facts about the Dirichlet in exponential family form:
+    #   1. log(x[i]) is a sufficient statistic
+    #   2. expected sufficient statistics (of any exp family distribution) are
+    #      equal to derivatives of the log normalizer with respect to
+    #      corresponding natural parameters: E{T[i](x)} = dA/d(eta[i])
+    #
+    # To proceed, we can rewrite the Dirichlet density in exponential family
+    # form as follows:
+    #
+    #   Dir(x; a) = exp{eta(a) . T(x) - A(a)}
+    #
+    # where '.' is the dot product of vectors eta and T, and A is a scalar:
+    #
+    #   eta[i](a) = a[i] - 1
+    #     T[i](x) = log(x[i])
+    #        A(a) = log B(a)
+    #
+    # Now, we can use fact (2) above to write
+    #
+    #   E_Dir(x; a)[log(x[i])]
+    #       = dA(a) / da[i]
+    #       = d/da[i] log B(a)
+    #       = d/da[i] (sum_j lgamma(a[j]) - lgamma(sum_j a[j]))
+    #       = digamma(a[i]) - digamma(sum_j a[j])
+    #
+    # Putting it all together, we have
+    #
+    # KL[Dir(x; a) || Dir(x; b)]
+    #     = E_Dir(x; a){log(Dir(x; a) / Dir(x; b))}
+    #     = E_Dir(x; a){sum_i[(a[i] - b[i]) log(x[i])]} - (lbeta(a) - lbeta(b))
+    #     = sum_i[(a[i] - b[i]) * E_Dir(x; a){log(x[i])}] - lbeta(a) + lbeta(b)
+    #     = sum_i[(a[i] - b[i]) * (digamma(a[i]) - digamma(sum_j a[j]))]
+    #          - lbeta(a) + lbeta(b)
+
+    digamma_sum_d1 = math_ops.digamma(
+        math_ops.reduce_sum(d1.concentration, axis=-1, keepdims=True))
+    digamma_diff = math_ops.digamma(d1.concentration) - digamma_sum_d1
+    concentration_diff = d1.concentration - d2.concentration
+
+    return (math_ops.reduce_sum(concentration_diff * digamma_diff, axis=-1) -
+            special_math_ops.lbeta(d1.concentration) +
+            special_math_ops.lbeta(d2.concentration))
-- 
GitLab


From 2bb9fe8d202b2400219d45a8a2185a02dd070fb5 Mon Sep 17 00:00:00 2001
From: Gunhan Gulsoy <gunan@google.com>
Date: Wed, 30 May 2018 10:35:57 -0700
Subject: [PATCH 056/610] Disable flaky fused_rnn_cell_test

PiperOrigin-RevId: 198582181
---
 tensorflow/contrib/rnn/BUILD | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tensorflow/contrib/rnn/BUILD b/tensorflow/contrib/rnn/BUILD
index 43c0f75955..4eb5c920b3 100644
--- a/tensorflow/contrib/rnn/BUILD
+++ b/tensorflow/contrib/rnn/BUILD
@@ -193,6 +193,10 @@ tf_py_test(
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
     ],
+    tags = [
+        "manual",
+        "notap",
+    ],
 )
 
 cuda_py_tests(
-- 
GitLab


From 81755953863f36f13d1c70a108469b0c3f5fa697 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 30 May 2018 10:40:39 -0700
Subject: [PATCH 057/610] Internal change

PiperOrigin-RevId: 198582954
---
 .../xla/service/cpu/parallel_task_assignment.cc |  6 +++---
 .../compiler/xla/service/cpu/shape_partition.cc |  2 +-
 .../xla/service/hlo_evaluator_typed_visitor.h   |  2 +-
 .../lib/quantiles/weighted_quantiles_stream.h   |  4 ++--
 .../lib/quantiles/weighted_quantiles_summary.h  |  4 ++--
 .../core/common_runtime/gpu/gpu_device.cc       |  2 +-
 tensorflow/core/framework/common_shape_fns.cc   |  4 ++--
 tensorflow/core/kernels/cholesky_grad.cc        |  2 +-
 tensorflow/core/kernels/deep_conv2d.cc          | 17 +++++++++--------
 tensorflow/core/kernels/draw_bounding_box_op.cc |  4 ++--
 tensorflow/core/kernels/lrn_op_test.cc          |  2 +-
 tensorflow/core/kernels/matrix_band_part_op.cc  |  2 +-
 tensorflow/core/kernels/pooling_ops_common.h    |  2 +-
 tensorflow/core/kernels/quantization_utils.h    |  4 ++--
 tensorflow/core/kernels/resize_area_op.cc       |  2 +-
 tensorflow/core/kernels/resize_bicubic_op.cc    |  2 +-
 .../core/kernels/resize_bicubic_op_test.cc      |  2 +-
 .../core/kernels/sparse_fill_empty_rows_op.cc   |  2 +-
 tensorflow/core/platform/cloud/gcs_throttle.cc  |  2 +-
 tensorflow/core/util/work_sharder.cc            |  2 +-
 20 files changed, 35 insertions(+), 34 deletions(-)

diff --git a/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.cc b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.cc
index 63d0f7b95c..4fa5984b04 100644
--- a/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.cc
+++ b/tensorflow/compiler/xla/service/cpu/parallel_task_assignment.cc
@@ -38,7 +38,7 @@ class SimpleCostModel : public ParallelCostModel {
     const int64 min_cost_per_thread = 256LL << 10;  // 256KB L2 Cache size.
     // Return target parallel task count in [1, max_parallelism_].
     return std::min(max_parallelism_,
-                    std::max(1LL, instruction_cost / min_cost_per_thread));
+                    std::max(int64{1}, instruction_cost / min_cost_per_thread));
   }
 
  private:
@@ -63,7 +63,7 @@ class DefaultCostModel : public ParallelCostModel {
     int64 max_parallelism;
     // Calculate flops-to-bytes-ratio for 'instruction'.
     const int64 bytes_accessed =
-        std::max(1LL, cost_analysis_->bytes_accessed(*instruction));
+        std::max(int64{1}, cost_analysis_->bytes_accessed(*instruction));
     const float flops_to_bytes_ratio =
         cost_analysis_->flop_count(*instruction) /
         static_cast<float>(bytes_accessed);
@@ -93,7 +93,7 @@ class DefaultCostModel : public ParallelCostModel {
     }
     // Return target parallel task count in [1, max_parallelism_].
     return std::min(max_parallelism,
-                    std::max(1LL, instruction_cost / min_cost_per_thread));
+                    std::max(int64{1}, instruction_cost / min_cost_per_thread));
   }
 
  private:
diff --git a/tensorflow/compiler/xla/service/cpu/shape_partition.cc b/tensorflow/compiler/xla/service/cpu/shape_partition.cc
index 42fe955f19..d12c539614 100644
--- a/tensorflow/compiler/xla/service/cpu/shape_partition.cc
+++ b/tensorflow/compiler/xla/service/cpu/shape_partition.cc
@@ -115,7 +115,7 @@ ShapePartitionIterator::ShapePartitionIterator(
   for (int i = 0; i < dimension_partition_sizes_.size(); ++i) {
     const int64 dim_size = shape_.dimensions(dimensions_[i]);
     dimension_partition_sizes_[i] =
-        std::max(1LL, dim_size / dimension_partition_counts_[i]);
+        std::max(int64{1}, dim_size / dimension_partition_counts_[i]);
   }
 
   // Calculate the partition strides for each dimension.
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h
index 82ee77e1ae..b1b58642ec 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h
+++ b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h
@@ -1965,7 +1965,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
     // to oficially document different behavior.
     for (int64 i = 0; i < start.size(); ++i) {
       start[i] = std::min<int64>(
-          std::max(0LL, start[i]),
+          std::max(int64{0}, start[i]),
           operand_literal.shape().dimensions(i) - result_shape.dimensions(i));
     }
 
diff --git a/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_stream.h b/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_stream.h
index 8ad97fedc9..c120dd8a6c 100644
--- a/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_stream.h
+++ b/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_stream.h
@@ -295,7 +295,7 @@ WeightedQuantilesStream<ValueType, WeightType, CompareFn>::GetQuantileSpecs(
   if (eps <= std::numeric_limits<double>::epsilon()) {
     // Exact quantile computation at the expense of RAM.
     max_level = 1;
-    block_size = std::max(max_elements, 2LL);
+    block_size = std::max(max_elements, int64{2});
   } else {
     // The bottom-most level will become full at most
     // (max_elements / block_size) times, the level above will become full
@@ -315,7 +315,7 @@ WeightedQuantilesStream<ValueType, WeightType, CompareFn>::GetQuantileSpecs(
       block_size = static_cast<size_t>(ceil(max_level / eps)) + 1;
     }
   }
-  return std::make_tuple(max_level, std::max(block_size, 2LL));
+  return std::make_tuple(max_level, std::max(block_size, int64{2}));
 }
 
 }  // namespace quantiles
diff --git a/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_summary.h b/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_summary.h
index 7576856dc3..a7e7bfc13c 100644
--- a/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_summary.h
+++ b/tensorflow/contrib/boosted_trees/lib/quantiles/weighted_quantiles_summary.h
@@ -195,7 +195,7 @@ class WeightedQuantilesSummary {
   // designed to be cache-friendly.
   void Compress(int64 size_hint, double min_eps = 0) {
     // No-op if we're already within the size requirement.
-    size_hint = std::max(size_hint, 2LL);
+    size_hint = std::max(size_hint, int64{2});
     if (entries_.size() <= size_hint) {
       return;
     }
@@ -267,7 +267,7 @@ class WeightedQuantilesSummary {
     if (entries_.empty()) {
       return output;
     }
-    num_quantiles = std::max(num_quantiles, 2LL);
+    num_quantiles = std::max(num_quantiles, int64{2});
     output.reserve(num_quantiles + 1);
 
     // Make successive rank queries to get boundaries.
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.cc b/tensorflow/core/common_runtime/gpu/gpu_device.cc
index cf5d11ec8b..bee5627636 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.cc
@@ -770,7 +770,7 @@ int64 MinSystemMemory(int64 available_memory) {
   } else {
     // max(300 MiB, 0.05 * available_memory)
     min_system_memory =
-        std::max(314572800LL, static_cast<int64>(available_memory * 0.05));
+        std::max(int64{314572800}, static_cast<int64>(available_memory * 0.05));
   }
 #if defined(__GNUC__) && defined(__OPTIMIZE__)
 // Do nothing
diff --git a/tensorflow/core/framework/common_shape_fns.cc b/tensorflow/core/framework/common_shape_fns.cc
index d1b495d2ff..6da0da14f0 100644
--- a/tensorflow/core/framework/common_shape_fns.cc
+++ b/tensorflow/core/framework/common_shape_fns.cc
@@ -40,8 +40,8 @@ Status GetWindowedOutputSizeVerboseV2(int64 input_size, int64 filter_size,
     case Padding::SAME:
       *output_size = (input_size + stride - 1) / stride;
       const int64 padding_needed =
-          std::max(0LL, (*output_size - 1) * stride + effective_filter_size -
-                            input_size);
+          std::max(int64{0}, (*output_size - 1) * stride +
+                                 effective_filter_size - input_size);
       // For odd values of total padding, add more padding at the 'right'
       // side of the given dimension.
       *padding_before = padding_needed / 2;
diff --git a/tensorflow/core/kernels/cholesky_grad.cc b/tensorflow/core/kernels/cholesky_grad.cc
index 9d33845c2f..eac66e580d 100644
--- a/tensorflow/core/kernels/cholesky_grad.cc
+++ b/tensorflow/core/kernels/cholesky_grad.cc
@@ -84,7 +84,7 @@ class CholeskyGrad : public LinearAlgebraOp<Scalar> {
       Variables names representing the derivative matrix have a trailing '_bar'.
       */
 
-      const int64 block_begin = std::max(0ll, block_end - kMaxBlockSize);
+      const int64 block_begin = std::max(int64{0}, block_end - kMaxBlockSize);
       const int64 block_size = block_end - block_begin;
       const int64 trailing_size = kMatrixSize - block_end;
 
diff --git a/tensorflow/core/kernels/deep_conv2d.cc b/tensorflow/core/kernels/deep_conv2d.cc
index 014684de64..85a9702ae7 100644
--- a/tensorflow/core/kernels/deep_conv2d.cc
+++ b/tensorflow/core/kernels/deep_conv2d.cc
@@ -294,11 +294,11 @@ struct TransformFilterRange {
 
     // Compute number of filter shards.
     const int64 residual_row =
-        std::max(0LL, args.filter_rows - base_filter_rows);
+        std::max(int64{0}, args.filter_rows - base_filter_rows);
     const int64 shard_rows = 1 + (residual_row + 2 - 1) / 2;
 
     const int64 residual_col =
-        std::max(0LL, args.filter_cols - base_filter_cols);
+        std::max(int64{0}, args.filter_cols - base_filter_cols);
     const int64 shard_cols = 1 + (residual_col + 2 - 1) / 2;
 
     // Compute strides to be used for input and output IO.
@@ -415,8 +415,9 @@ struct TransformFilters {
         filter_total_size + filter_transform_buffer_size + filter_out_buf_size;
 
     // Remove fixed cost and divide by per-filter cost.
-    const int64 num_filters_cache = std::max(
-        1LL, (cache_size - filter_transform_matrix_size) / per_filter_cost);
+    const int64 num_filters_cache =
+        std::max(int64{1},
+                 (cache_size - filter_transform_matrix_size) / per_filter_cost);
     const int64 num_filters_transform = std::min(out_depth, num_filters_cache);
 
     // Allocate buffer for filter transform matrix:
@@ -952,11 +953,11 @@ struct DeepConv2D<CPUDevice, T> {
     const int64 base_filter_rows = transform->filter_shape().rows;
 
     const int64 filter_residual_row =
-        std::max(0LL, args.filter_rows - base_filter_rows);
+        std::max(int64{0}, args.filter_rows - base_filter_rows);
     const int64 filter_shards_row = 1 + (filter_residual_row + 2 - 1) / 2;
 
     const int64 filter_residual_col =
-        std::max(0LL, args.filter_cols - base_filter_rows);
+        std::max(int64{0}, args.filter_cols - base_filter_rows);
     const int64 filter_shards_col = 1 + (filter_residual_col + 2 - 1) / 2;
 
     // Allocate buffer for transformed filters.
@@ -1045,8 +1046,8 @@ struct DeepConv2D<CPUDevice, T> {
           buffer1_per_tile_size + buffer2_per_tile_size +
           packed_tile_per_tile_size + gemm_out_per_tile_size;
 
-      const int64 num_tiles_cache =
-          std::max(4LL, (cache_size - total_fixed_cost) / total_per_tile_cost);
+      const int64 num_tiles_cache = std::max(
+          int64{4}, (cache_size - total_fixed_cost) / total_per_tile_cost);
       const int64 num_tiles = std::min(num_tiles_cache, col_tiles);
 
       // Allocate temporary buffer 'buffer1', which is first used for copying
diff --git a/tensorflow/core/kernels/draw_bounding_box_op.cc b/tensorflow/core/kernels/draw_bounding_box_op.cc
index b5d5b880bb..618c47e684 100644
--- a/tensorflow/core/kernels/draw_bounding_box_op.cc
+++ b/tensorflow/core/kernels/draw_bounding_box_op.cc
@@ -93,14 +93,14 @@ class DrawBoundingBoxesOp : public OpKernel {
         int64 color_index = bb % color_table_length;
         const int64 min_box_row =
             static_cast<float>(tboxes(b, bb, 0)) * (height - 1);
-        const int64 min_box_row_clamp = std::max<int64>(min_box_row, 0);
+        const int64 min_box_row_clamp = std::max<int64>(min_box_row, int64{0});
         const int64 max_box_row =
             static_cast<float>(tboxes(b, bb, 2)) * (height - 1);
         const int64 max_box_row_clamp =
             std::min<int64>(max_box_row, height - 1);
         const int64 min_box_col =
             static_cast<float>(tboxes(b, bb, 1)) * (width - 1);
-        const int64 min_box_col_clamp = std::max<int64>(min_box_col, 0);
+        const int64 min_box_col_clamp = std::max<int64>(min_box_col, int64{0});
         const int64 max_box_col =
             static_cast<float>(tboxes(b, bb, 3)) * (width - 1);
         const int64 max_box_col_clamp = std::min<int64>(max_box_col, width - 1);
diff --git a/tensorflow/core/kernels/lrn_op_test.cc b/tensorflow/core/kernels/lrn_op_test.cc
index 9c8a1dfa9a..5d8c5c21ca 100644
--- a/tensorflow/core/kernels/lrn_op_test.cc
+++ b/tensorflow/core/kernels/lrn_op_test.cc
@@ -71,7 +71,7 @@ class LRNFloatTest : public OpsTestBase {
       Eigen::Tensor<float, 1, Eigen::RowMajor> out_col(depth);
       for (int64 d = 0; d < depth; ++d) {
         float denom = 0.0f;
-        for (int64 r = std::max(0ll, d - depth_radius);
+        for (int64 r = std::max(int64{0}, d - depth_radius);
              r < std::min(depth, d + depth_radius + 1); ++r) {
           denom += in(i, r) * in(i, r);
         }
diff --git a/tensorflow/core/kernels/matrix_band_part_op.cc b/tensorflow/core/kernels/matrix_band_part_op.cc
index 1439141f64..61c5277464 100644
--- a/tensorflow/core/kernels/matrix_band_part_op.cc
+++ b/tensorflow/core/kernels/matrix_band_part_op.cc
@@ -159,7 +159,7 @@ struct MatrixBandPartFunctor<CPUDevice, Scalar> {
           const int64 band_start =
               num_lower_diags < 0
                   ? 0
-                  : std::min(n, std::max(0ll, row - num_lower_diags));
+                  : std::min(n, std::max(int64{0}, row - num_lower_diags));
           const int64 band_end =
               num_upper_diags < 0
                   ? n
diff --git a/tensorflow/core/kernels/pooling_ops_common.h b/tensorflow/core/kernels/pooling_ops_common.h
index fc7cb437b8..e9265551e3 100644
--- a/tensorflow/core/kernels/pooling_ops_common.h
+++ b/tensorflow/core/kernels/pooling_ops_common.h
@@ -596,7 +596,7 @@ void SpatialAvgPool(OpKernelContext* context, Tensor* output,
   // so the factor 0.01 (i.e. 1/100) with a max of 10000, was chosen to limit
   // the work unit cost to an operating range in which it emperically performed
   // best.
-  const int64 work_unit_cost = std::max(10000LL, work_unit_size / 100LL);
+  const int64 work_unit_cost = std::max(int64{10000}, work_unit_size / 100LL);
   const DeviceBase::CpuWorkerThreads& worker_threads =
       *(context->device()->tensorflow_cpu_worker_threads());
   Shard(worker_threads.num_threads, worker_threads.workers,
diff --git a/tensorflow/core/kernels/quantization_utils.h b/tensorflow/core/kernels/quantization_utils.h
index 9fafe6bb65..e67a94e5f8 100644
--- a/tensorflow/core/kernels/quantization_utils.h
+++ b/tensorflow/core/kernels/quantization_utils.h
@@ -273,8 +273,8 @@ inline void RequantizeManyInNewRangeReference(const qint32* input, int64 count,
     const int64 offset_intermediate = fp_value - output_offset_fp;
     const int64 round_intermediate = offset_intermediate + rounding_delta;
     int64 quantized_int64 = round_intermediate >> fp_shift;
-    quantized_int64 = std::max(quantized_int64, 0LL);
-    quantized_int64 = std::min(quantized_int64, 255LL);
+    quantized_int64 = std::max(quantized_int64, int64{0});
+    quantized_int64 = std::min(quantized_int64, int64{255});
     output[index] = static_cast<quint8>(static_cast<int32>(quantized_int64));
   }
 }
diff --git a/tensorflow/core/kernels/resize_area_op.cc b/tensorflow/core/kernels/resize_area_op.cc
index 98b8a0df28..c996ae60b7 100644
--- a/tensorflow/core/kernels/resize_area_op.cc
+++ b/tensorflow/core/kernels/resize_area_op.cc
@@ -271,7 +271,7 @@ class ResizeAreaOp : public OpKernel {
 
  private:
   static EIGEN_ALWAYS_INLINE int64 Bound(int64 val, int64 limit) {
-    return std::min(limit - 1ll, std::max(0ll, val));
+    return std::min(limit - 1ll, std::max(int64{0}, val));
   }
 
   bool align_corners_;
diff --git a/tensorflow/core/kernels/resize_bicubic_op.cc b/tensorflow/core/kernels/resize_bicubic_op.cc
index 65014b6c44..8380ed6d8f 100644
--- a/tensorflow/core/kernels/resize_bicubic_op.cc
+++ b/tensorflow/core/kernels/resize_bicubic_op.cc
@@ -57,7 +57,7 @@ const float* GetCoeffsTable() {
 }
 
 inline int64 Bound(int64 val, int64 limit) {
-  return std::min(limit - 1ll, std::max(0ll, val));
+  return std::min(limit - 1ll, std::max(int64{0}, val));
 }
 
 struct WeightsAndIndices {
diff --git a/tensorflow/core/kernels/resize_bicubic_op_test.cc b/tensorflow/core/kernels/resize_bicubic_op_test.cc
index c23570d885..eff25f5ad4 100644
--- a/tensorflow/core/kernels/resize_bicubic_op_test.cc
+++ b/tensorflow/core/kernels/resize_bicubic_op_test.cc
@@ -81,7 +81,7 @@ class ResizeBicubicOpTest : public OpsTestBase {
 
   // Used in the baseline implementation
   inline int64 Bound(int64 val, int64 limit) {
-    return std::min(limit - 1ll, std::max(0ll, val));
+    return std::min(limit - 1ll, std::max(int64{0}, val));
   }
 
   // Used in the baseline implementation
diff --git a/tensorflow/core/kernels/sparse_fill_empty_rows_op.cc b/tensorflow/core/kernels/sparse_fill_empty_rows_op.cc
index d17b72bc26..c9365be511 100644
--- a/tensorflow/core/kernels/sparse_fill_empty_rows_op.cc
+++ b/tensorflow/core/kernels/sparse_fill_empty_rows_op.cc
@@ -125,7 +125,7 @@ class SparseFillEmptyRowsOp : public OpKernel {
       // Scratch here describes the number of elements in this dense row
       empty_row_indicator(row) = (scratch(row) == 0);
       // In filled version, each row has at least one element.
-      scratch(row) = std::max(scratch(row), 1LL);
+      scratch(row) = std::max(scratch(row), int64{1});
       // Update scratch to represent the number of elements up to and
       // including dense_row + 1:
       //  scratch(0) == #{elements of row 0}
diff --git a/tensorflow/core/platform/cloud/gcs_throttle.cc b/tensorflow/core/platform/cloud/gcs_throttle.cc
index 27dd06a625..940d98fd09 100644
--- a/tensorflow/core/platform/cloud/gcs_throttle.cc
+++ b/tensorflow/core/platform/cloud/gcs_throttle.cc
@@ -51,7 +51,7 @@ void GcsThrottle::UpdateState() {
   // TODO(b/72643279): Switch to a monotonic clock.
   int64 now = env_time_->NowSeconds();
   uint64 delta_secs =
-      std::max(0LL, now - static_cast<int64>(last_updated_secs_));
+      std::max(int64{0}, now - static_cast<int64>(last_updated_secs_));
   available_tokens_ += delta_secs * config_.token_rate;
   available_tokens_ = std::min(available_tokens_, config_.bucket_size);
   last_updated_secs_ = now;
diff --git a/tensorflow/core/util/work_sharder.cc b/tensorflow/core/util/work_sharder.cc
index 7922fc9224..337af07b50 100644
--- a/tensorflow/core/util/work_sharder.cc
+++ b/tensorflow/core/util/work_sharder.cc
@@ -35,7 +35,7 @@ void Shard(int max_parallelism, thread::ThreadPool* workers, int64 total,
     workers->ParallelFor(total, cost_per_unit, work);
     return;
   }
-  cost_per_unit = std::max(1LL, cost_per_unit);
+  cost_per_unit = std::max(int64{1}, cost_per_unit);
   // We shard [0, total) into "num_shards" shards.
   //   1 <= num_shards <= num worker threads
   //
-- 
GitLab


From 8ff5cba952b47f5a70c6890a52b4cf88a41ad058 Mon Sep 17 00:00:00 2001
From: Rob Sloan <varomodt@google.com>
Date: Wed, 30 May 2018 10:56:02 -0700
Subject: [PATCH 058/610] Add an option to propagate Status in
 GraphOptimizerStagePipelines.

PiperOrigin-RevId: 198585886
---
 .../optimizers/graph_optimizer_stage.h        | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/tensorflow/core/grappler/optimizers/graph_optimizer_stage.h b/tensorflow/core/grappler/optimizers/graph_optimizer_stage.h
index b0ec967473..2fbdd76a77 100644
--- a/tensorflow/core/grappler/optimizers/graph_optimizer_stage.h
+++ b/tensorflow/core/grappler/optimizers/graph_optimizer_stage.h
@@ -240,6 +240,25 @@ class GraphOptimizerStagePipeline {
     return false;
   }
 
+  // Pass a node through all registered optimizer stages, until break predicate
+  // is true or a stage fails.
+  //
+  // Returns any stage failure status, or else Status::OK().
+  Status PassThroughAllStagesWithStatus(NodeDef* node, Result* result) {
+    for (auto& stage : stages_) {
+      if (!stage->IsSupported(node)) {
+        continue;
+      }
+      const Status stage_status = stage->TrySimplify(node, result);
+      if (!stage_status.ok()) {
+        return stage_status;
+      } else if (break_predicate_(*result)) {
+        break;
+      }
+    }
+    return Status::OK();
+  }
+
   std::size_t NumStages() { return stages_.size(); }
 
   std::vector<string> StageNames() {
-- 
GitLab


From 12031a70209b06283de7fcdd5a4a3e0887193a57 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 30 May 2018 11:12:26 -0700
Subject: [PATCH 059/610] Let the swig wrapped builder return the
 HloModuleProto.

PiperOrigin-RevId: 198588920
---
 tensorflow/compiler/xla/python/BUILD                 |  2 ++
 .../compiler/xla/python/local_computation_builder.cc |  9 +++++++++
 .../compiler/xla/python/local_computation_builder.h  |  5 +++++
 .../compiler/xla/python/local_computation_builder.i  |  1 +
 tensorflow/compiler/xla/python/xla_client.py         | 12 ++++++++++++
 tensorflow/compiler/xla/python/xla_client_test.py    | 10 ++++++++++
 tensorflow/compiler/xla/service/BUILD                | 10 ++++++++++
 7 files changed, 49 insertions(+)

diff --git a/tensorflow/compiler/xla/python/BUILD b/tensorflow/compiler/xla/python/BUILD
index 932cce943f..83834c1ff6 100644
--- a/tensorflow/compiler/xla/python/BUILD
+++ b/tensorflow/compiler/xla/python/BUILD
@@ -12,6 +12,7 @@ py_library(
     deps = [
         ":pywrap_xla",
         "//tensorflow/compiler/xla:xla_data_proto_py",
+        "//tensorflow/compiler/xla/service:hlo_proto_py",
     ],
 )
 
@@ -53,6 +54,7 @@ cc_library(
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/xla_client:xla_builder",
         "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+        "//tensorflow/compiler/xla/service:hlo_proto",
         "//tensorflow/compiler/xla/service:shaped_buffer",
         "//tensorflow/core:framework_lite",
         "//tensorflow/core:lib",
diff --git a/tensorflow/compiler/xla/python/local_computation_builder.cc b/tensorflow/compiler/xla/python/local_computation_builder.cc
index cb4dc1782b..f808990cad 100644
--- a/tensorflow/compiler/xla/python/local_computation_builder.cc
+++ b/tensorflow/compiler/xla/python/local_computation_builder.cc
@@ -276,6 +276,15 @@ const XlaComputation& LocalComputation::computation() const {
   return computation_;
 }
 
+string LocalComputation::GetSerializedProto() const {
+  string result;
+  if (!computation_.proto().SerializeToString(&result)) {
+    LOG(ERROR) << "Failed to serialize the HloModuleProto.";
+    return "";
+  }
+  return result;
+}
+
 StatusOr<Shape> LocalComputation::GetReturnValueShape() const {
   TF_ASSIGN_OR_RETURN(ProgramShape program_shape,
                       computation_.GetProgramShape());
diff --git a/tensorflow/compiler/xla/python/local_computation_builder.h b/tensorflow/compiler/xla/python/local_computation_builder.h
index a06b85b4ea..9ac13b6523 100644
--- a/tensorflow/compiler/xla/python/local_computation_builder.h
+++ b/tensorflow/compiler/xla/python/local_computation_builder.h
@@ -112,6 +112,11 @@ class LocalComputation {
 
   const XlaComputation& computation() const;
 
+  // Returns the HloModuleProto contained in the XlaComputation in the
+  // serialized binary format. Logs an internal error and returns an empty
+  // string on failure.
+  string GetSerializedProto() const;
+
   // Returns the return-value shape for this computation.
   StatusOr<Shape> GetReturnValueShape() const;
 
diff --git a/tensorflow/compiler/xla/python/local_computation_builder.i b/tensorflow/compiler/xla/python/local_computation_builder.i
index 04c56bbba9..51412ca474 100644
--- a/tensorflow/compiler/xla/python/local_computation_builder.i
+++ b/tensorflow/compiler/xla/python/local_computation_builder.i
@@ -906,6 +906,7 @@ tensorflow::ImportNumpy();
 %unignore xla::swig::LocalComputation;
 %unignore xla::swig::LocalComputation::Compile;
 %unignore xla::swig::LocalComputation::GetReturnValueShape;
+%unignore xla::swig::LocalComputation::GetSerializedProto;
 %unignore xla::swig::LocalOp;
 %unignore xla::swig::LocalComputationBuilder;
 %unignore xla::swig::LocalComputationBuilder::LocalComputationBuilder;
diff --git a/tensorflow/compiler/xla/python/xla_client.py b/tensorflow/compiler/xla/python/xla_client.py
index 1d5b75d1be..50b548afa5 100644
--- a/tensorflow/compiler/xla/python/xla_client.py
+++ b/tensorflow/compiler/xla/python/xla_client.py
@@ -28,6 +28,7 @@ import numpy as np
 
 from tensorflow.compiler.xla import xla_data_pb2
 from tensorflow.compiler.xla.python import pywrap_xla as c_api
+from tensorflow.compiler.xla.service import hlo_pb2
 
 
 # Most functions are snake_case for consistency with other modules, whereas
@@ -410,6 +411,17 @@ class LocalComputation(object):
       assert isinstance(c_local_computation, c_api.LocalComputation)
       self._delete = c_api.DeleteLocalComputation
 
+  def GetProto(self):
+    """Get the HloModuleProto proto object in this local computation.
+
+    Returns:
+       An HloModuleProto proto object that has the whole-graph information.
+    """
+
+    serialized = self.c_local_computation.GetSerializedProto()
+    proto = hlo_pb2.HloModuleProto.FromString(serialized)
+    return proto
+
   def Compile(self, argument_shapes=(), compile_options=None, layout_fn=None):
     """Compiles an un-compiled local computation.
 
diff --git a/tensorflow/compiler/xla/python/xla_client_test.py b/tensorflow/compiler/xla/python/xla_client_test.py
index c073c02040..e3d393bccc 100644
--- a/tensorflow/compiler/xla/python/xla_client_test.py
+++ b/tensorflow/compiler/xla/python/xla_client_test.py
@@ -164,6 +164,16 @@ class ComputationsWithConstantsTest(LocalComputationTest):
         c.Constant(NumpyArrayF32([[1, -1, 1], [-1, 1, -1]])))
     self._ExecuteAndCompareClose(c, expected=[[2, 1, 4], [3, 6, 5]])
 
+  def testGetProto(self):
+    c = self._NewComputation()
+    c.Add(
+        c.Constant(NumpyArrayF32([[1, 2, 3], [4, 5, 6]])),
+        c.Constant(NumpyArrayF32([[1, -1, 1], [-1, 1, -1]])))
+    built = c.Build()
+    proto = built.GetProto()  # HloModuleProto
+    self.assertTrue(len(proto.computations) == 1)
+    self.assertTrue(len(proto.computations[0].instructions) == 3)
+
   def testSum2DF64(self):
     c = self._NewComputation()
     c.Add(
diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index 7e4a75a6e3..4d653a0196 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -16,6 +16,10 @@ load("//tensorflow/compiler/xla/tests:build_defs.bzl", "xla_test")
 load("//tensorflow/compiler/xla:xla.bzl", "xla_proto_library")
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
 load("//tensorflow:tensorflow.bzl", "tf_cc_binary")
+load(
+    "//tensorflow/core:platform/default/build_config.bzl",
+    "tf_proto_library_py",
+)
 
 xla_proto_library(
     name = "session_proto",
@@ -31,6 +35,12 @@ xla_proto_library(
     deps = ["//tensorflow/compiler/xla:xla_data_proto"],
 )
 
+tf_proto_library_py(
+    name = "hlo_proto",  # bzl adds a _py suffix only to the OSS target.
+    srcs = ["hlo.proto"],
+    visibility = ["//visibility:public"],
+)
+
 xla_proto_library(
     name = "hlo_profile_printer_data",
     srcs = ["hlo_profile_printer_data.proto"],
-- 
GitLab


From c6639c3591dedb9441c9cebb28ae544d22d0e44c Mon Sep 17 00:00:00 2001
From: Jiri Simsa <jsimsa@google.com>
Date: Wed, 30 May 2018 11:30:23 -0700
Subject: [PATCH 060/610] [tf.data] change batch dataset op test size to large
 to prevent timeout

PiperOrigin-RevId: 198592202
---
 tensorflow/contrib/data/python/kernel_tests/BUILD | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/data/python/kernel_tests/BUILD b/tensorflow/contrib/data/python/kernel_tests/BUILD
index c483a43769..285c77dea9 100644
--- a/tensorflow/contrib/data/python/kernel_tests/BUILD
+++ b/tensorflow/contrib/data/python/kernel_tests/BUILD
@@ -8,7 +8,7 @@ load("//tensorflow:tensorflow.bzl", "cuda_py_test", "py_test", "tf_py_test")
 
 py_test(
     name = "batch_dataset_op_test",
-    size = "medium",
+    size = "large",
     srcs = ["batch_dataset_op_test.py"],
     srcs_version = "PY2AND3",
     tags = [
-- 
GitLab


From 1bfdff68c26a0881a951e6455847f0bafe94cc53 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 30 May 2018 11:48:43 -0700
Subject: [PATCH 061/610] Skip errors in function optimizer if optimized graph
 was not modified before error happened.

Currently an error can happen if a function can't be instantiated as a GrapplerFunctionItem.

PiperOrigin-RevId: 198595096
---
 .../grappler/optimizers/function_optimizer.cc | 44 +++++++++--
 .../optimizers/function_optimizer_test.cc     | 76 +++++++++++++++++++
 2 files changed, 114 insertions(+), 6 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/function_optimizer.cc b/tensorflow/core/grappler/optimizers/function_optimizer.cc
index fa228c68a1..b0d689c2dd 100644
--- a/tensorflow/core/grappler/optimizers/function_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/function_optimizer.cc
@@ -662,7 +662,7 @@ Status InlineFunction(const NodeDef& func_node, const FunctionDef& func,
 
 Status InlineSymbolicGradient(const NodeDef& node,
                               FunctionOptimizerContext* ctx,
-                              GraphDef* inlined_graph) {
+                              GraphDef* optimized_graph) {
   VLOG(2) << "Inline symbolic gradient: " << SummarizeNodeDef(node);
 
   GraphDef graph_def;
@@ -750,7 +750,7 @@ Status InlineSymbolicGradient(const NodeDef& node,
       }
     }
     inlined_node.set_device(node.device());
-    inlined_graph->add_node()->Swap(&inlined_node);
+    optimized_graph->add_node()->Swap(&inlined_node);
   }
 
   return Status::OK();
@@ -778,32 +778,62 @@ Status FunctionOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
   for (const NodeDef& node : item.graph.node()) {
     const string func_name = node.op();
 
+    // Each node optimization can modify the optimized graph only by adding new
+    // nodes, so we can check the node count to see if the graph was modified.
+    const int num_nodes_before = optimized_graph->node_size();
+    const auto is_graph_modified = [&]() {
+      int num_nodes = optimized_graph->node_size();
+      CHECK_GE(num_nodes, num_nodes_before) << "Nodes should not be removed";
+      return num_nodes > num_nodes_before;
+    };
+
+    // Add a copy of an input graph node to the optimized graph.
+    const auto add_node_copy = [&]() { *optimized_graph->add_node() = node; };
+
+// Skip errors if optimized graph was not modified before error happened.
+#define TF_SKIP_ERROR_IF_GRAPH_UNMODIFIED(...)                     \
+  do {                                                             \
+    const Status _status = (__VA_ARGS__);                          \
+    if (TF_PREDICT_FALSE(!_status.ok() && is_graph_modified()))    \
+      return _status;                                              \
+    if (TF_PREDICT_FALSE(!_status.ok() && !is_graph_modified())) { \
+      VLOG(3) << "Skip error: " << _status.error_message();        \
+      add_node_copy();                                             \
+    }                                                              \
+  } while (0)
+
+    // 1. Inline symbolic gradients into the optimized graph.
     if (func_name == "SymbolicGradient" && inline_gradients) {
       // Inline symbolic gradients only if the corresponding function is inlined
       const auto* f_attr = gtl::FindOrNull(node.attr(), "f");
       string f_name = f_attr != nullptr ? f_attr->func().name() : "";
       if (ctx.IsInlinedFunction(f_name)) {
-        TF_RETURN_IF_ERROR(InlineSymbolicGradient(node, &ctx, optimized_graph));
+        TF_SKIP_ERROR_IF_GRAPH_UNMODIFIED(
+            InlineSymbolicGradient(node, &ctx, optimized_graph));
         continue;
       }
     }
 
+    // 2. Check if a node op is a function call.
     const FunctionDef* func = ctx.function_library().Find(func_name);
     if (func != nullptr) {
+      // 2a. Inline it if it's allowed to do so.
       if (inline_func && ctx.IsInlinedFunction(func_name)) {
         // Inline function body into the optimized graph}
-        TF_RETURN_IF_ERROR(InlineFunction(node, *func, ctx, optimized_graph));
+        TF_SKIP_ERROR_IF_GRAPH_UNMODIFIED(
+            InlineFunction(node, *func, ctx, optimized_graph));
         continue;
       }
 
       // Do not specialize if function has custom gradient.
       const string grad_func = ctx.function_library().FindGradient(func_name);
 
+      // 2b. Specialize it to its instantiation context if it can't be inlined.
       if (specialize_func && grad_func.empty() &&
           (IsParametrized(*func) || HasTrulyConstInputs(node, ctx))) {
         // TODO(ezhulenev): Specialize function call if input has a known shape.
         // Specialize function body for its instantiation attributes and inputs.
-        TF_RETURN_IF_ERROR(
+        TF_SKIP_ERROR_IF_GRAPH_UNMODIFIED(
             SpecializeFunction(node, *func, &ctx, optimized_graph));
         continue;
       }
@@ -811,7 +841,9 @@ Status FunctionOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
 
     // If we reached this point, node was not handled by any of the stages
     // (inline, specialize), simply add a copy to the graph.
-    *optimized_graph->add_node() = node;
+    add_node_copy();
+
+#undef TF_SKIP_ERROR_IF_GRAPH_UNMODIFIED
   }
 
   *optimized_graph->mutable_versions() = item.graph.versions();
diff --git a/tensorflow/core/grappler/optimizers/function_optimizer_test.cc b/tensorflow/core/grappler/optimizers/function_optimizer_test.cc
index 0aaf57e947..d043f6129d 100644
--- a/tensorflow/core/grappler/optimizers/function_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/function_optimizer_test.cc
@@ -111,6 +111,82 @@ TEST_F(FunctionOptimizerTest, InlineFunction_SimpleFunction) {
   test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
 }
 
+TEST_F(FunctionOptimizerTest, InlineFunction_SkipErrorsIfGraphNotModified) {
+  using test::function::NDef;
+
+  FunctionOptimizer optimizer(RewriterConfig::DEFAULT);
+
+  // Standard XTimesTwo() function.
+  FunctionDef x_times_two = test::function::XTimesTwo();
+
+  // Function with sequence of tensors as an input (currently not supported).
+  FunctionDef my_identity_n = FunctionDefHelper::Create(
+      // Name
+      "MyIdentityN",
+      // Args
+      {"x: N*T"},
+      // Return values
+      {"out: N*T"},
+      // Attrs
+      {"N:int", "T:{float, double, int32, int64}"},
+      // Nodes (just forward inputs through IdentityN)
+      {
+          {{"Id"}, "IdentityN", {"x"}, {{"T", "$T"}, {"N", "$N"}}},
+      },
+      // Output mapping
+      {{"out", "Id:output:0"}});
+
+  GrapplerItem item;
+  item.graph = test::function::GDef(
+      {NDef("x", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("y1", "XTimesTwo", {"x"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("y2", "MyIdentityN", {"x"}, {{"T", DT_FLOAT}, {"N", 1}}, kDevice),
+       NDef("z1", "Identity", {"y1:0"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("z2", "Identity", {"y2:0"}, {{"T", DT_FLOAT}}, kDevice)},
+      // FunctionLib
+      {x_times_two, my_identity_n});
+
+  GraphDef output;
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
+
+  // Verify that only MyIdentityN is in the function library after optimization.
+  ASSERT_EQ(1, output.library().function().size());
+  EXPECT_EQ("MyIdentityN", output.library().function(0).signature().name());
+
+  // And that XTimesTwo was successfully inlined.
+  int found = 0;
+  for (const NodeDef& node : output.node()) {
+    if (node.name() == "y1/inlined_inputs") {
+      found++;
+      EXPECT_EQ("IdentityN", node.op());
+      EXPECT_EQ(kDevice, node.device());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("x", node.input(0));
+    } else if (node.name() == "y1") {
+      found++;
+      EXPECT_EQ("IdentityN", node.op());
+      EXPECT_EQ(kDevice, node.device());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("y1/y", node.input(0));
+    } else if (node.name() == "y2") {
+      found++;
+      EXPECT_EQ("MyIdentityN", node.op());
+      EXPECT_EQ(kDevice, node.device());
+      EXPECT_EQ(1, node.input_size());
+      EXPECT_EQ("x", node.input(0));
+    }
+  }
+  EXPECT_EQ(3, found);
+
+  Tensor pi = test::AsScalar<float>(3.14f);
+  item.fetch = {"z1"};
+  item.feed.emplace_back("x", pi);
+  auto tensors_expected = EvaluateFetchNodes(item);
+  GrapplerItem optimized(item, std::move(output));
+  auto tensors = EvaluateFetchNodes(optimized);
+  test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
+}
+
 TEST_F(FunctionOptimizerTest, InlineFunction_FixedTypeFunction) {
   using test::function::NDef;
 
-- 
GitLab


From 898e646d0291d753e5092ff5e9c6ff70f5064c19 Mon Sep 17 00:00:00 2001
From: Sami Kama <skama@nvidia.com>
Date: Wed, 30 May 2018 13:43:55 -0700
Subject: [PATCH 062/610] Import only ops not the implementations to prevent
 issues if user don't have tensorrt installed

---
 tensorflow/python/tools/import_pb_to_tensorboard.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)
 mode change 100755 => 100644 tensorflow/python/tools/import_pb_to_tensorboard.py

diff --git a/tensorflow/python/tools/import_pb_to_tensorboard.py b/tensorflow/python/tools/import_pb_to_tensorboard.py
old mode 100755
new mode 100644
index d1f9cd87b3..96f47c85da
--- a/tensorflow/python/tools/import_pb_to_tensorboard.py
+++ b/tensorflow/python/tools/import_pb_to_tensorboard.py
@@ -30,12 +30,12 @@ from tensorflow.python.platform import gfile
 from tensorflow.python.summary import summary
 
 # Try importing TensorRT ops if available
-# pylint: disable=unused-import,trailing-whitespace
+# pylint: disable=unused-import,trailing-whitespace,g-import-not-at-top,wildcard-import
 try:
-  import tensorflow.contrib.tensorrt as trt 
+  from tensorflow.contrib.tensorrt.ops.gen_trt_engine_op import *
 except ImportError:
   pass
-# pylint: enable=unused-import,trailing-whitespace
+# pylint: enable=unused-import,trailing-whitespace,g-import-not-at-top,wildcard-import
 
 def import_to_tensorboard(model_dir, log_dir):
   """View an imported protobuf model (`.pb` file) as a graph in Tensorboard.
-- 
GitLab


From 144c2b4a5fadb6cfed371dc9d72119826dbaf418 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 30 May 2018 14:33:54 -0700
Subject: [PATCH 063/610] Add include file which provides the proper
 std::string mapping.

PiperOrigin-RevId: 198620715
---
 tensorflow/compiler/xla/service/hlo_domain_metadata.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/compiler/xla/service/hlo_domain_metadata.h b/tensorflow/compiler/xla/service/hlo_domain_metadata.h
index 9853bd39cd..aa0308100a 100644
--- a/tensorflow/compiler/xla/service/hlo_domain_metadata.h
+++ b/tensorflow/compiler/xla/service/hlo_domain_metadata.h
@@ -21,6 +21,7 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/gtl/flatset.h"
-- 
GitLab


From 1e0d7ecb4b88a74bc45056f8eef5b1560eaab41a Mon Sep 17 00:00:00 2001
From: Sami Kama <skama@nvidia.com>
Date: Wed, 30 May 2018 14:41:31 -0700
Subject: [PATCH 064/610] Remove changes to tensorboard script

---
 tensorflow/python/tools/import_pb_to_tensorboard.py | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/tensorflow/python/tools/import_pb_to_tensorboard.py b/tensorflow/python/tools/import_pb_to_tensorboard.py
index 96f47c85da..00de044505 100644
--- a/tensorflow/python/tools/import_pb_to_tensorboard.py
+++ b/tensorflow/python/tools/import_pb_to_tensorboard.py
@@ -29,13 +29,6 @@ from tensorflow.python.platform import app
 from tensorflow.python.platform import gfile
 from tensorflow.python.summary import summary
 
-# Try importing TensorRT ops if available
-# pylint: disable=unused-import,trailing-whitespace,g-import-not-at-top,wildcard-import
-try:
-  from tensorflow.contrib.tensorrt.ops.gen_trt_engine_op import *
-except ImportError:
-  pass
-# pylint: enable=unused-import,trailing-whitespace,g-import-not-at-top,wildcard-import
 
 def import_to_tensorboard(model_dir, log_dir):
   """View an imported protobuf model (`.pb` file) as a graph in Tensorboard.
-- 
GitLab


From 5810723cc8f25fcf651be56c5b0271f70011fc2d Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 30 May 2018 14:44:57 -0700
Subject: [PATCH 065/610] Add
 `tf.contrib.distributions.bijectors.MatrixInverseTriL`: Bijector that inverts
 a lower-triangular matrix.

PiperOrigin-RevId: 198622553
---
 tensorflow/contrib/distributions/BUILD        |  19 ++
 .../bijectors/matrix_inverse_tril_test.py     | 190 ++++++++++++++++++
 .../python/ops/bijectors/__init__.py          |   2 +
 .../ops/bijectors/matrix_inverse_tril.py      | 145 +++++++++++++
 4 files changed, 356 insertions(+)
 create mode 100644 tensorflow/contrib/distributions/python/kernel_tests/bijectors/matrix_inverse_tril_test.py
 create mode 100644 tensorflow/contrib/distributions/python/ops/bijectors/matrix_inverse_tril.py

diff --git a/tensorflow/contrib/distributions/BUILD b/tensorflow/contrib/distributions/BUILD
index 6192f04c8b..23d9dbcd91 100644
--- a/tensorflow/contrib/distributions/BUILD
+++ b/tensorflow/contrib/distributions/BUILD
@@ -1032,6 +1032,25 @@ cuda_py_test(
     ],
 )
 
+cuda_py_test(
+    name = "matrix_inverse_tril_test",
+    size = "medium",
+    srcs = ["python/kernel_tests/bijectors/matrix_inverse_tril_test.py"],
+    additional_deps = [
+        ":bijectors_py",
+        ":distributions_py",
+        "//third_party/py/numpy",
+        "@six_archive//:six",
+        "//tensorflow/contrib/linalg:linalg_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
 cuda_py_test(
     name = "real_nvp_test",
     size = "small",
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/matrix_inverse_tril_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/matrix_inverse_tril_test.py
new file mode 100644
index 0000000000..1839703557
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/matrix_inverse_tril_test.py
@@ -0,0 +1,190 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for MatrixInverseTriL bijector."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.distributions.python.ops import bijectors
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import test
+
+
+class MatrixInverseTriLBijectorTest(test.TestCase):
+  """Tests the correctness of the Y = inv(tril) transformation."""
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testComputesCorrectValues(self):
+    inv = bijectors.MatrixInverseTriL(validate_args=True)
+    self.assertEqual("matrix_inverse_tril", inv.name)
+    x_ = np.array([[0.7, 0., 0.],
+                   [0.1, -1., 0.],
+                   [0.3, 0.25, 0.5]], dtype=np.float32)
+    x_inv_ = np.linalg.inv(x_)
+    expected_fldj_ = -6. * np.sum(np.log(np.abs(np.diag(x_))))
+
+    y = inv.forward(x_)
+    x_back = inv.inverse(x_inv_)
+    fldj = inv.forward_log_det_jacobian(x_, event_ndims=2)
+    ildj = inv.inverse_log_det_jacobian(x_inv_, event_ndims=2)
+
+    y_, x_back_, fldj_, ildj_ = self.evaluate([y, x_back, fldj, ildj])
+
+    self.assertAllClose(x_inv_, y_, atol=0., rtol=1e-5)
+    self.assertAllClose(x_, x_back_, atol=0., rtol=1e-5)
+    self.assertNear(expected_fldj_, fldj_, err=1e-3)
+    self.assertNear(-expected_fldj_, ildj_, err=1e-3)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testOneByOneMatrix(self):
+    inv = bijectors.MatrixInverseTriL(validate_args=True)
+    x_ = np.array([[5.]], dtype=np.float32)
+    x_inv_ = np.array([[0.2]], dtype=np.float32)
+    expected_fldj_ = np.log(0.04)
+
+    y = inv.forward(x_)
+    x_back = inv.inverse(x_inv_)
+    fldj = inv.forward_log_det_jacobian(x_, event_ndims=2)
+    ildj = inv.inverse_log_det_jacobian(x_inv_, event_ndims=2)
+
+    y_, x_back_, fldj_, ildj_ = self.evaluate([y, x_back, fldj, ildj])
+
+    self.assertAllClose(x_inv_, y_, atol=0., rtol=1e-5)
+    self.assertAllClose(x_, x_back_, atol=0., rtol=1e-5)
+    self.assertNear(expected_fldj_, fldj_, err=1e-3)
+    self.assertNear(-expected_fldj_, ildj_, err=1e-3)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testZeroByZeroMatrix(self):
+    inv = bijectors.MatrixInverseTriL(validate_args=True)
+    x_ = np.eye(0, dtype=np.float32)
+    x_inv_ = np.eye(0, dtype=np.float32)
+    expected_fldj_ = 0.
+
+    y = inv.forward(x_)
+    x_back = inv.inverse(x_inv_)
+    fldj = inv.forward_log_det_jacobian(x_, event_ndims=2)
+    ildj = inv.inverse_log_det_jacobian(x_inv_, event_ndims=2)
+
+    y_, x_back_, fldj_, ildj_ = self.evaluate([y, x_back, fldj, ildj])
+
+    self.assertAllClose(x_inv_, y_, atol=0., rtol=1e-5)
+    self.assertAllClose(x_, x_back_, atol=0., rtol=1e-5)
+    self.assertNear(expected_fldj_, fldj_, err=1e-3)
+    self.assertNear(-expected_fldj_, ildj_, err=1e-3)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testBatch(self):
+    # Test batch computation with input shape (2, 1, 2, 2), i.e. batch shape
+    # (2, 1).
+    inv = bijectors.MatrixInverseTriL(validate_args=True)
+    x_ = np.array([[[[1., 0.],
+                     [2., 3.]]],
+                   [[[4., 0.],
+                     [5., -6.]]]], dtype=np.float32)
+    x_inv_ = np.linalg.inv(x_)
+    expected_fldj_ = -4. * np.sum(
+        np.log(np.abs(np.diagonal(x_, axis1=-2, axis2=-1))), axis=-1)
+
+    y = inv.forward(x_)
+    x_back = inv.inverse(x_inv_)
+    fldj = inv.forward_log_det_jacobian(x_, event_ndims=2)
+    ildj = inv.inverse_log_det_jacobian(x_inv_, event_ndims=2)
+
+    y_, x_back_, fldj_, ildj_ = self.evaluate([y, x_back, fldj, ildj])
+
+    self.assertAllClose(x_inv_, y_, atol=0., rtol=1e-5)
+    self.assertAllClose(x_, x_back_, atol=0., rtol=1e-5)
+    self.assertAllClose(expected_fldj_, fldj_, atol=0., rtol=1e-3)
+    self.assertAllClose(-expected_fldj_, ildj_, atol=0., rtol=1e-3)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testErrorOnInputRankTooLow(self):
+    inv = bijectors.MatrixInverseTriL(validate_args=True)
+    x_ = np.array([0.1], dtype=np.float32)
+    rank_error_msg = "must have rank at least 2"
+    with self.test_session():
+      with self.assertRaisesWithPredicateMatch(ValueError, rank_error_msg):
+        inv.forward(x_).eval()
+      with self.assertRaisesWithPredicateMatch(ValueError, rank_error_msg):
+        inv.inverse(x_).eval()
+      with self.assertRaisesWithPredicateMatch(ValueError, rank_error_msg):
+        inv.forward_log_det_jacobian(x_, event_ndims=2).eval()
+      with self.assertRaisesWithPredicateMatch(ValueError, rank_error_msg):
+        inv.inverse_log_det_jacobian(x_, event_ndims=2).eval()
+
+  # TODO(b/80481923): Figure out why these assertions fail, and fix them.
+  ## def testErrorOnInputNonSquare(self):
+  ##   inv = bijectors.MatrixInverseTriL(validate_args=True)
+  ##   x_ = np.array([[1., 2., 3.],
+  ##                  [4., 5., 6.]], dtype=np.float32)
+  ##   square_error_msg = "must be a square matrix"
+  ##   with self.test_session():
+  ##     with self.assertRaisesWithPredicateMatch(errors.InvalidArgumentError,
+  ##                                              square_error_msg):
+  ##       inv.forward(x_).eval()
+  ##     with self.assertRaisesWithPredicateMatch(errors.InvalidArgumentError,
+  ##                                              square_error_msg):
+  ##       inv.inverse(x_).eval()
+  ##     with self.assertRaisesWithPredicateMatch(errors.InvalidArgumentError,
+  ##                                              square_error_msg):
+  ##       inv.forward_log_det_jacobian(x_, event_ndims=2).eval()
+  ##     with self.assertRaisesWithPredicateMatch(errors.InvalidArgumentError,
+  ##                                              square_error_msg):
+  ##       inv.inverse_log_det_jacobian(x_, event_ndims=2).eval()
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testErrorOnInputNotLowerTriangular(self):
+    inv = bijectors.MatrixInverseTriL(validate_args=True)
+    x_ = np.array([[1., 2.],
+                   [3., 4.]], dtype=np.float32)
+    triangular_error_msg = "must be lower triangular"
+    with self.test_session():
+      with self.assertRaisesWithPredicateMatch(errors.InvalidArgumentError,
+                                               triangular_error_msg):
+        inv.forward(x_).eval()
+      with self.assertRaisesWithPredicateMatch(errors.InvalidArgumentError,
+                                               triangular_error_msg):
+        inv.inverse(x_).eval()
+      with self.assertRaisesWithPredicateMatch(errors.InvalidArgumentError,
+                                               triangular_error_msg):
+        inv.forward_log_det_jacobian(x_, event_ndims=2).eval()
+      with self.assertRaisesWithPredicateMatch(errors.InvalidArgumentError,
+                                               triangular_error_msg):
+        inv.inverse_log_det_jacobian(x_, event_ndims=2).eval()
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testErrorOnInputSingular(self):
+    inv = bijectors.MatrixInverseTriL(validate_args=True)
+    x_ = np.array([[1., 0.],
+                   [0., 0.]], dtype=np.float32)
+    nonsingular_error_msg = "must have all diagonal entries nonzero"
+    with self.test_session():
+      with self.assertRaisesOpError(nonsingular_error_msg):
+        inv.forward(x_).eval()
+      with self.assertRaisesOpError(nonsingular_error_msg):
+        inv.inverse(x_).eval()
+      with self.assertRaisesOpError(nonsingular_error_msg):
+        inv.forward_log_det_jacobian(x_, event_ndims=2).eval()
+      with self.assertRaisesOpError(nonsingular_error_msg):
+        inv.inverse_log_det_jacobian(x_, event_ndims=2).eval()
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/__init__.py b/tensorflow/contrib/distributions/python/ops/bijectors/__init__.py
index 51478dbeff..4965381ef3 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/__init__.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/__init__.py
@@ -30,6 +30,7 @@
 @@Invert
 @@Kumaraswamy
 @@MaskedAutoregressiveFlow
+@@MatrixInverseTriL
 @@Ordered
 @@Permute
 @@PowerTransform
@@ -68,6 +69,7 @@ from tensorflow.contrib.distributions.python.ops.bijectors.inline import *
 from tensorflow.contrib.distributions.python.ops.bijectors.invert import *
 from tensorflow.contrib.distributions.python.ops.bijectors.kumaraswamy import *
 from tensorflow.contrib.distributions.python.ops.bijectors.masked_autoregressive import *
+from tensorflow.contrib.distributions.python.ops.bijectors.matrix_inverse_tril import *
 from tensorflow.contrib.distributions.python.ops.bijectors.ordered import *
 from tensorflow.contrib.distributions.python.ops.bijectors.permute import *
 from tensorflow.contrib.distributions.python.ops.bijectors.power_transform import *
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/matrix_inverse_tril.py b/tensorflow/contrib/distributions/python/ops/bijectors/matrix_inverse_tril.py
new file mode 100644
index 0000000000..71903f7052
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/matrix_inverse_tril.py
@@ -0,0 +1,145 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""MatrixInverseTriL bijector."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import linalg_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import bijector
+
+
+__all__ = [
+    "MatrixInverseTriL",
+]
+
+
+class MatrixInverseTriL(bijector.Bijector):
+  """Computes `g(L) = inv(L)`, where `L` is a lower-triangular matrix.
+
+  `L` must be nonsingular; equivalently, all diagonal entries of `L` must be
+  nonzero.
+
+  The input must have `rank >= 2`.  The input is treated as a batch of matrices
+  with batch shape `input.shape[:-2]`, where each matrix has dimensions
+  `input.shape[-2]` by `input.shape[-1]` (hence `input.shape[-2]` must equal
+  `input.shape[-1]`).
+
+  #### Examples
+
+  ```python
+  tfd.bijectors.MatrixInverseTriL().forward(x=[[1., 0], [2, 1]])
+  # Result: [[1., 0], [-2, 1]], i.e., inv(x)
+
+  tfd.bijectors.MatrixInverseTriL().inverse(y=[[1., 0], [-2, 1]])
+  # Result: [[1., 0], [2, 1]], i.e., inv(y).
+  ```
+
+  """
+
+  def __init__(self, validate_args=False, name="matrix_inverse_tril"):
+    """Instantiates the `MatrixInverseTriL` bijector.
+
+    Args:
+      validate_args: Python `bool` indicating whether arguments should be
+        checked for correctness.
+      name: Python `str` name given to ops managed by this object.
+    """
+    self._graph_parents = []
+    self._name = name
+    super(MatrixInverseTriL, self).__init__(
+        forward_min_event_ndims=2,
+        validate_args=validate_args,
+        name=name)
+
+  def _forward(self, x):
+    with ops.control_dependencies(self._assertions(x)):
+      shape = array_ops.shape(x)
+      return linalg_ops.matrix_triangular_solve(
+          x, linalg_ops.eye(shape[-1], batch_shape=shape[:-2]), lower=True)
+
+  def _inverse(self, y):
+    return self._forward(y)
+
+  def _forward_log_det_jacobian(self, x):
+    # Calculation of the Jacobian:
+    #
+    # Let X = (x_{ij}), 0 <= i,j < n, be a matrix of indeterminates.  Let Z =
+    # X^{-1} where Z = (z_{ij}).  Then
+    #
+    #     dZ/dx_{ij} = (d/dt | t=0) Y(t)^{-1},
+    #
+    # where Y(t) = X + t*E_{ij} and E_{ij} is the matrix with a 1 in the (i,j)
+    # entry and zeros elsewhere.  By the product rule,
+    #
+    #     0 = d/dt [Identity matrix]
+    #       = d/dt [Y Y^{-1}]
+    #       = Y d/dt[Y^{-1}] + dY/dt Y^{-1}
+    #
+    # so
+    #
+    #     d/dt[Y^{-1}] = -Y^{-1} dY/dt Y^{-1}
+    #                  = -Y^{-1} E_{ij} Y^{-1}.
+    #
+    # Evaluating at t=0,
+    #
+    #     dZ/dx_{ij} = -Z E_{ij} Z.
+    #
+    # Taking the (r,s) entry of each side,
+    #
+    #     dz_{rs}/dx_{ij} = -z_{ri}z_{js}.
+    #
+    # Now, let J be the Jacobian dZ/dX, arranged as the n^2-by-n^2 matrix whose
+    # (r*n + s, i*n + j) entry is dz_{rs}/dx_{ij}.  Considering J as an n-by-n
+    # block matrix with n-by-n blocks, the above expression for dz_{rs}/dx_{ij}
+    # shows that the block at position (r,i) is -z_{ri}Z^T.  Hence
+    #
+    #          J = -KroneckerProduct(Z, Z^T),
+    #     det(J) = (-1)^(n^2) (det Z)^(2n)
+    #            = (-1)^n (det X)^(-2n).
+    with ops.control_dependencies(self._assertions(x)):
+      return (-2. * math_ops.cast(array_ops.shape(x)[-1], x.dtype.base_dtype) *
+              math_ops.reduce_sum(
+                  math_ops.log(math_ops.abs(array_ops.matrix_diag_part(x))),
+                  axis=-1))
+
+  def _assertions(self, x):
+    if not self.validate_args:
+      return []
+    shape = array_ops.shape(x)
+    is_matrix = check_ops.assert_rank_at_least(
+        x, 2, message="Input must have rank at least 2.")
+    is_square = check_ops.assert_equal(
+        shape[-2], shape[-1], message="Input must be a square matrix.")
+    above_diagonal = array_ops.matrix_band_part(
+        array_ops.matrix_set_diag(
+            x, array_ops.zeros(shape[:-1], dtype=dtypes.float32)),
+        0, -1)
+    is_lower_triangular = check_ops.assert_equal(
+        above_diagonal, array_ops.zeros_like(above_diagonal),
+        message="Input must be lower triangular.")
+    # A lower triangular matrix is nonsingular iff all its diagonal entries are
+    # nonzero.
+    diag_part = array_ops.matrix_diag_part(x)
+    is_nonsingular = check_ops.assert_none_equal(
+        diag_part, array_ops.zeros_like(diag_part),
+        message="Input must have all diagonal entries nonzero.")
+    return [is_matrix, is_square, is_lower_triangular, is_nonsingular]
-- 
GitLab


From 5c751fe8d766d4875cc99d58a536a29652685e26 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <bsteiner@google.com>
Date: Wed, 30 May 2018 14:45:56 -0700
Subject: [PATCH 066/610] Add control dependencies to the correct graph when
 simplifying packing ops.

PiperOrigin-RevId: 198622727
---
 tensorflow/contrib/metrics/python/ops/metric_ops_test.py | 8 ++++++++
 tensorflow/core/grappler/optimizers/constant_folding.cc  | 2 +-
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops_test.py b/tensorflow/contrib/metrics/python/ops/metric_ops_test.py
index 76420db8bd..e6f75fcbd7 100644
--- a/tensorflow/contrib/metrics/python/ops/metric_ops_test.py
+++ b/tensorflow/contrib/metrics/python/ops/metric_ops_test.py
@@ -7101,6 +7101,14 @@ class CohenKappaTest(test.TestCase):
     with self.assertRaises(ValueError):
       metrics.cohen_kappa(labels, invalid_predictions, 3)
 
+  def testConditionalPackingOptimization(self):
+    placeholder = array_ops.placeholder(dtypes_lib.float32, [None])
+    values, update_op = metric_ops.streaming_concat(placeholder)
+    with self.test_session() as sess:
+      sess.run(variables.local_variables_initializer())
+      for feed in range(10):
+        sess.run(update_op, feed_dict={placeholder: [feed]})
+        print(sess.run(values))
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/core/grappler/optimizers/constant_folding.cc b/tensorflow/core/grappler/optimizers/constant_folding.cc
index 1ea916a250..7f0c2a2116 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding.cc
@@ -2171,7 +2171,7 @@ bool ConstantFolding::SimplifyPack(GraphDef* optimized_graph, NodeDef* node) {
     }
     // Add a control dependency to make sure axis_node is in the right frame.
     const string ctrl_dep = ConstantFolding::AddControlDependency(
-        node->input(0), graph_, node_map_.get());
+        node->input(0), optimized_graph, node_map_.get());
     axis_node->add_input(ctrl_dep);
     axis_node->set_device(node->device());
     node->set_op("ExpandDims");
-- 
GitLab


From 176754d6cce54a971c98096f55251870708eea3e Mon Sep 17 00:00:00 2001
From: "Joshua V. Dillon" <jvdillon@google.com>
Date: Wed, 30 May 2018 14:52:57 -0700
Subject: [PATCH 067/610] Add `fill_triangular_inverse`, which flattens a
 triangular matrix in a way such that: # Lower triangular matrix x =
 tf.matrix_band_part(x, -1, 0) x ==
 fill_triangular(fill_triangular_inverse(x)) Code by srvasude@ which I'm
 submitting on his behalf.

PiperOrigin-RevId: 198623887
---
 tensorflow/contrib/distributions/__init__.py  |  2 +
 .../kernel_tests/distributions/util_test.py   | 24 ++++++
 tensorflow/python/ops/distributions/util.py   | 74 ++++++++++++++++++-
 3 files changed, 97 insertions(+), 3 deletions(-)

diff --git a/tensorflow/contrib/distributions/__init__.py b/tensorflow/contrib/distributions/__init__.py
index ddf59891e6..802538ba97 100644
--- a/tensorflow/contrib/distributions/__init__.py
+++ b/tensorflow/contrib/distributions/__init__.py
@@ -32,6 +32,7 @@ from tensorflow.contrib.distributions.python.ops.conditional_distribution import
 from tensorflow.contrib.distributions.python.ops.conditional_transformed_distribution import *
 from tensorflow.contrib.distributions.python.ops.deterministic import *
 from tensorflow.contrib.distributions.python.ops.distribution_util import fill_triangular
+from tensorflow.contrib.distributions.python.ops.distribution_util import fill_triangular_inverse
 from tensorflow.contrib.distributions.python.ops.distribution_util import matrix_diag_transform
 from tensorflow.contrib.distributions.python.ops.distribution_util import reduce_weighted_logsumexp
 from tensorflow.contrib.distributions.python.ops.distribution_util import softplus_inverse
@@ -156,6 +157,7 @@ _allowed_symbols = [
     'kl_divergence',
     'RegisterKL',
     'fill_triangular',
+    'fill_triangular_inverse',
     'matrix_diag_transform',
     'reduce_weighted_logsumexp',
     'softplus_inverse',
diff --git a/tensorflow/python/kernel_tests/distributions/util_test.py b/tensorflow/python/kernel_tests/distributions/util_test.py
index 63d19c15cf..2f256d3e8b 100644
--- a/tensorflow/python/kernel_tests/distributions/util_test.py
+++ b/tensorflow/python/kernel_tests/distributions/util_test.py
@@ -814,6 +814,30 @@ class FillTriangularTest(test.TestCase):
     self._run_test(self._rng.randn(2, 3, int(7*8/2)), upper=True)
 
 
+class FillTriangularInverseTest(FillTriangularTest):
+
+  def _run_test(self, x_, use_deferred_shape=False, **kwargs):
+    x_ = np.asarray(x_)
+    with self.test_session() as sess:
+      static_shape = None if use_deferred_shape else x_.shape
+      x_pl = array_ops.placeholder_with_default(x_, shape=static_shape)
+      zeros_like_x_pl = (x_pl * array_ops.stop_gradient(x_pl - 1.)
+                         - array_ops.stop_gradient(x_pl * (x_pl - 1.)))
+      x = x_pl + zeros_like_x_pl
+      actual = du.fill_triangular(x, **kwargs)
+      inverse_actual = du.fill_triangular_inverse(actual, **kwargs)
+
+      inverse_actual_ = sess.run(
+          inverse_actual,
+          feed_dict={x_pl: x_})
+
+    if use_deferred_shape:
+      self.assertEqual(None, inverse_actual.shape)
+    else:
+      self.assertAllEqual(x_.shape, inverse_actual.shape)
+    self.assertAllEqual(x_, inverse_actual_)
+
+
 class ReduceWeightedLogSumExp(test.TestCase):
 
   def _reduce_weighted_logsumexp(self, logx, w, axis, keep_dims=False):
diff --git a/tensorflow/python/ops/distributions/util.py b/tensorflow/python/ops/distributions/util.py
index 1b2c8762a4..401676bf84 100644
--- a/tensorflow/python/ops/distributions/util.py
+++ b/tensorflow/python/ops/distributions/util.py
@@ -824,8 +824,8 @@ def fill_triangular(x, upper=False, name=None):
   Triangular matrix elements are filled in a clockwise spiral. See example,
   below.
 
-  If `x.get_shape()` is `[b1, b2, ..., bK, d]` then the output shape is `[b1,
-  b2, ..., bK, n, n]` where `n` is such that `d = n(n+1)/2`, i.e.,
+  If `x.get_shape()` is `[b1, b2, ..., bB, d]` then the output shape is
+  `[b1, b2, ..., bB, n, n]` where `n` is such that `d = n(n+1)/2`, i.e.,
   `n = int(np.sqrt(0.25 + 2. * m) - 0.5)`.
 
   Example:
@@ -914,7 +914,7 @@ def fill_triangular(x, upper=False, name=None):
     #   = 2 (n**2 / 2 + n / 2) - n**2
     #   = n**2 + n - n**2
     #   = n
-    ndims = array_ops.rank(x) if x.shape.ndims is None else x.shape.ndims
+    ndims = prefer_static_rank(x)
     if upper:
       x_list = [x, array_ops.reverse(x[..., n:], axis=[ndims - 1])]
     else:
@@ -932,6 +932,74 @@ def fill_triangular(x, upper=False, name=None):
     return x
 
 
+def fill_triangular_inverse(x, upper=False, name=None):
+  """Creates a vector from a (batch of) triangular matrix.
+
+  The vector is created from the lower-triangular or upper-triangular portion
+  depending on the value of the parameter `upper`.
+
+  If `x.shape` is `[b1, b2, ..., bB, n, n]` then the output shape is
+  `[b1, b2, ..., bB, d]` where `d = n (n + 1) / 2`.
+
+  Example:
+
+  ```python
+  fill_triangular_inverse(
+    [[4, 0, 0],
+     [6, 5, 0],
+     [3, 2, 1]])
+  # ==> [1, 2, 3, 4, 5, 6]
+
+  fill_triangular_inverse(
+    [[1, 2, 3],
+     [0, 5, 6],
+     [0, 0, 4]], upper=True)
+  # ==> [1, 2, 3, 4, 5, 6]
+  ```
+
+  Args:
+    x: `Tensor` holding a (batch of) lower (or upper) triangular matrix.
+    upper: Python `bool` representing whether the input matrix `x` is upper
+      triangular (`True`) or lower triangular (`False`, default).
+    name: Python `str`. The name to give this op.
+
+  Returns:
+    flat_tril: (Batch of) vector-shaped `Tensor` representing vectorized lower
+      (or upper) triangular elements from `x`.
+  """
+
+  with ops.name_scope(name, "fill_triangular_inverse", values=[x]):
+    x = ops.convert_to_tensor(x, name="x")
+    if x.shape.with_rank_at_least(2)[-1].value is not None:
+      n = np.int32(x.shape[-1].value)
+      m = np.int32((n * (n + 1)) // 2)
+      static_final_shape = x.shape[:-2].concatenate([m])
+    else:
+      n = array_ops.shape(x)[-1]
+      m = (n * (n + 1)) // 2
+      static_final_shape = x.shape.with_rank_at_least(2)[:-2].concatenate(
+          [None])
+    ndims = prefer_static_rank(x)
+    if upper:
+      initial_elements = x[..., 0, :]
+      triangular_portion = x[..., 1:, :]
+    else:
+      initial_elements = array_ops.reverse(x[..., -1, :], axis=[ndims - 2])
+      triangular_portion = x[..., :-1, :]
+    # The remaining rows are reversed along both matrix axes and summed with
+    # themselves; this presumably relies on `x` being zero off its triangle.
+    rotated_triangular_portion = array_ops.reverse(
+        array_ops.reverse(triangular_portion, axis=[ndims - 1]),
+        axis=[ndims - 2])
+    consolidated_matrix = triangular_portion + rotated_triangular_portion
+    end_sequence = array_ops.reshape(
+        consolidated_matrix,
+        array_ops.concat([array_ops.shape(x)[:-2], [n * (n - 1)]], axis=0))
+    y = array_ops.concat([initial_elements, end_sequence[..., :m - n]], axis=-1)
+    y.set_shape(static_final_shape)
+    return y
+
+
 def tridiag(below=None, diag=None, above=None, name=None):
   """Creates a matrix with values set above, below, and on the diagonal.
 
-- 
GitLab


From ecd9bce7fb411db7304c98a2a324ebe6fbe630e9 Mon Sep 17 00:00:00 2001
From: Sami Kama <skama@nvidia.com>
Date: Wed, 30 May 2018 15:08:34 -0700
Subject: [PATCH 068/610] Review changes

---
 tensorflow/contrib/tensorrt/convert/convert_graph.cc | 8 ++++++--
 tensorflow/contrib/tensorrt/convert/convert_nodes.cc | 6 +++---
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.cc b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
index 5f79f6d108..da4dd5a14c 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_graph.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
@@ -186,7 +186,10 @@ struct ConvertGraphParams {
 static tensorflow::Status FillSubGraphEdgeSets(ConvertGraphParams* p) {
   GetSubGraphIncomingEdges(p->graph, p->subgraph_node_ids,
                            &p->subgraph_incoming_edges);
+
   std::set<std::pair<int, int>> unique_tensors;
+  // Add only unique input source nodes. If output of an outside node is shared
+  // between multiple nodes inside the engine, only one edge should be created
   for (const tensorflow::Edge* edge : p->subgraph_incoming_edges) {
     unique_tensors.insert({edge->src()->id(), edge->src_output()});
   }
@@ -195,6 +198,9 @@ static tensorflow::Status FillSubGraphEdgeSets(ConvertGraphParams* p) {
   GetSubGraphOutgoingEdges(p->graph, p->subgraph_node_ids,
                            &p->subgraph_outgoing_edges);
   unique_tensors.clear();
+  // Similar to above, if multiple outside nodes are sharing the output of an
+  // internal node only one output port should be created and shared between
+  // outputs
   for (const tensorflow::Edge* edge : p->subgraph_outgoing_edges) {
     unique_tensors.insert({edge->src()->id(), edge->src_output()});
   }
@@ -222,7 +228,6 @@ tensorflow::Status GetCalibNode(ConvertGraphParams* params) {
   for (auto in_edge :
        params->subgraph_incoming_edges) {  // loop over incoming edges and
                                            // attach them to calib node
-    // tensorflow::Node* src_node = in_edge->src();
     auto src_output = in_edge->src_output();
     auto dst_node = in_edge->dst();
     auto dst_input = in_edge->dst_input();
@@ -280,7 +285,6 @@ tensorflow::Status ConvertSubGraphToTensorRT(ConvertGraphParams* params) {
     subgraph_edge_to_output_map.insert({params->subgraph_outputs.at(i), i});
   }
   TF_RETURN_IF_ERROR(status);
-  unique_tensors.clear();
   for (const tensorflow::Edge* edge : params->subgraph_outgoing_edges) {
     std::pair<int, int> old_src = {edge->src()->id(), edge->src_output()};
     int new_src_output = subgraph_edge_to_output_map.at(old_src);
diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
index 4026ad75fa..21e60923f8 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
@@ -2176,7 +2176,7 @@ tensorflow::Status ConvertCalibrationNodeToEngineNode(
     VLOG(2) << node_names;
   }
 
-  VLOG(0) << "Output Nodes:";
+  VLOG(1) << "Output Nodes:";
   std::vector<tensorflow::DataType> out_types;
   std::vector<const tensorflow::Edge*> out_edges;
 
@@ -2298,11 +2298,11 @@ tensorflow::Status ConvertCalibrationNodeToEngineNode(
         graph.UpdateEdge(trt_engine_node, out_port, i->dst(), i->dst_input()));
   }
   for (const auto ed : trt_engine_node->in_edges()) {
-    VLOG(0) << "In Edge  " << ed->src()->name() << ":" << ed->src_output()
+    VLOG(1) << "In Edge  " << ed->src()->name() << ":" << ed->src_output()
             << " -> " << ed->dst()->name() << ":" << ed->dst_input();
   }
   for (const auto ed : trt_engine_node->out_edges()) {
-    VLOG(0) << "Out Edge " << ed->src()->name() << ":" << ed->src_output()
+    VLOG(1) << "Out Edge " << ed->src()->name() << ":" << ed->src_output()
             << " -> " << ed->dst()->name() << ":" << ed->dst_input();
   }
   VLOG(1) << "Segment nodes:";
-- 
GitLab


From e469934f1274c7c498e5061995fec425a21c9be8 Mon Sep 17 00:00:00 2001
From: Brennan Saeta <saeta@google.com>
Date: Wed, 30 May 2018 15:25:46 -0700
Subject: [PATCH 069/610] Add GCS configure ops.

PiperOrigin-RevId: 198624285
---
 tensorflow/contrib/cloud/BUILD                |  15 +-
 tensorflow/contrib/cloud/__init__.py          |   8 +-
 tensorflow/contrib/cloud/kernels/BUILD        |  14 ++
 .../contrib/cloud/kernels/gcs_config_ops.cc   | 203 ++++++++++++++++++
 .../contrib/cloud/ops/gcs_config_ops.cc       |  70 ++++++
 .../cloud/python/ops/gcs_config_ops.py        | 176 +++++++++++++++
 tensorflow/contrib/cmake/tf_core_ops.cmake    |   1 +
 tensorflow/contrib/cmake/tf_python.cmake      |   2 +
 tensorflow/core/platform/cloud/BUILD          |   1 +
 .../core/platform/cloud/gcs_file_system.cc    | 113 +++++-----
 .../core/platform/cloud/gcs_file_system.h     |  48 ++++-
 .../platform/cloud/gcs_file_system_test.cc    |   4 +-
 12 files changed, 594 insertions(+), 61 deletions(-)
 create mode 100644 tensorflow/contrib/cloud/kernels/gcs_config_ops.cc
 create mode 100644 tensorflow/contrib/cloud/ops/gcs_config_ops.cc
 create mode 100644 tensorflow/contrib/cloud/python/ops/gcs_config_ops.py

diff --git a/tensorflow/contrib/cloud/BUILD b/tensorflow/contrib/cloud/BUILD
index f3a75e8688..42ba368531 100644
--- a/tensorflow/contrib/cloud/BUILD
+++ b/tensorflow/contrib/cloud/BUILD
@@ -15,7 +15,10 @@ load(
 )
 
 tf_gen_op_libs(
-    op_lib_names = ["bigquery_reader_ops"],
+    op_lib_names = [
+        "bigquery_reader_ops",
+        "gcs_config_ops",
+    ],
     deps = [
         "//tensorflow/core:lib",
     ],
@@ -28,15 +31,25 @@ tf_gen_op_wrapper_py(
     deps = [":bigquery_reader_ops_op_lib"],
 )
 
+tf_gen_op_wrapper_py(
+    name = "gen_gcs_config_ops",
+    out = "python/ops/gen_gcs_config_ops.py",
+    require_shape_functions = True,
+    visibility = ["//tensorflow:internal"],
+    deps = [":gcs_config_ops_op_lib"],
+)
+
 py_library(
     name = "cloud_py",
     srcs = [
         "__init__.py",
         "python/ops/bigquery_reader_ops.py",
+        "python/ops/gcs_config_ops.py",
     ],
     srcs_version = "PY2AND3",
     deps = [
         ":gen_bigquery_reader_ops",
+        ":gen_gcs_config_ops",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:io_ops",
         "//tensorflow/python:util",
diff --git a/tensorflow/contrib/cloud/__init__.py b/tensorflow/contrib/cloud/__init__.py
index 8870264b95..a6e13ea3ae 100644
--- a/tensorflow/contrib/cloud/__init__.py
+++ b/tensorflow/contrib/cloud/__init__.py
@@ -20,9 +20,15 @@ from __future__ import print_function
 
 # pylint: disable=line-too-long,wildcard-import
 from tensorflow.contrib.cloud.python.ops.bigquery_reader_ops import *
+from tensorflow.contrib.cloud.python.ops.gcs_config_ops import *
 # pylint: enable=line-too-long,wildcard-import
 
 from tensorflow.python.util.all_util import remove_undocumented
 
-_allowed_symbols = ['BigQueryReader']
+_allowed_symbols = [
+    'BigQueryReader',
+    'BlockCacheParams',
+    'ConfigureGcsHook',
+    'configure_colab_session',
+    'configure_gcs']
 remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/cloud/kernels/BUILD b/tensorflow/contrib/cloud/kernels/BUILD
index ff46f0daa8..40160706f7 100644
--- a/tensorflow/contrib/cloud/kernels/BUILD
+++ b/tensorflow/contrib/cloud/kernels/BUILD
@@ -73,3 +73,17 @@ tf_proto_library(
     srcs = ["bigquery_table_partition.proto"],
     cc_api_version = 2,
 )
+
+tf_kernel_library(
+    name = "gcs_config_ops",
+    srcs = ["gcs_config_ops.cc"],
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core/platform/cloud:curl_http_request",
+        "//tensorflow/core/platform/cloud:gcs_file_system",
+        "//tensorflow/core/platform/cloud:oauth_client",
+        "@jsoncpp_git//:jsoncpp",
+    ],
+)
diff --git a/tensorflow/contrib/cloud/kernels/gcs_config_ops.cc b/tensorflow/contrib/cloud/kernels/gcs_config_ops.cc
new file mode 100644
index 0000000000..ef4998212e
--- /dev/null
+++ b/tensorflow/contrib/cloud/kernels/gcs_config_ops.cc
@@ -0,0 +1,203 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <sstream>
+
+#include "include/json/json.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/platform/cloud/curl_http_request.h"
+#include "tensorflow/core/platform/cloud/gcs_file_system.h"
+#include "tensorflow/core/platform/cloud/oauth_client.h"
+
+namespace tensorflow {
+namespace {
+
+// The default initial delay between retries with exponential backoff.
+constexpr int kInitialRetryDelayUsec = 500000;  // 0.5 sec
+
+// The minimum time delta between now and the token expiration time
+// for the token to be re-used.
+constexpr int kExpirationTimeMarginSec = 60;
+
+// The URL to retrieve the auth bearer token via OAuth with a refresh token.
+constexpr char kOAuthV3Url[] = "https://www.googleapis.com/oauth2/v3/token";
+
+// The URL to retrieve the auth bearer token via OAuth with a private key.
+constexpr char kOAuthV4Url[] = "https://www.googleapis.com/oauth2/v4/token";
+
+// The authentication token scope to request.
+constexpr char kOAuthScope[] = "https://www.googleapis.com/auth/cloud-platform";
+
+Status RetrieveGcsFs(OpKernelContext* ctx, RetryingGcsFileSystem** fs) {
+  DCHECK(fs != nullptr);
+  *fs = nullptr;
+
+  FileSystem* filesystem = nullptr;
+  TF_RETURN_IF_ERROR(
+      ctx->env()->GetFileSystemForFile("gs://fake/file.text", &filesystem));
+  if (filesystem == nullptr) {
+    return errors::FailedPrecondition("The GCS file system is not registered.");
+  }
+
+  *fs = dynamic_cast<RetryingGcsFileSystem*>(filesystem);
+  if (*fs == nullptr) {
+    return errors::Internal(
+        "The filesystem registered under the 'gs://' scheme was not a "
+        "tensorflow::RetryingGcsFileSystem*.");
+  }
+  return Status::OK();
+}
+
+template <typename T>
+Status ParseScalarArgument(OpKernelContext* ctx, StringPiece argument_name,
+                           T* output) {
+  const Tensor* argument_t;
+  TF_RETURN_IF_ERROR(ctx->input(argument_name, &argument_t));
+  if (!TensorShapeUtils::IsScalar(argument_t->shape())) {
+    return errors::InvalidArgument(argument_name, " must be a scalar");
+  }
+  *output = argument_t->scalar<T>()();
+  return Status::OK();
+}
+
+// GcsCredentialsOpKernel overrides the credentials used by the gcs_filesystem.
+class GcsCredentialsOpKernel : public OpKernel {
+ public:
+  explicit GcsCredentialsOpKernel(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+  void Compute(OpKernelContext* ctx) override {
+    // Get a handle to the GCS file system.
+    RetryingGcsFileSystem* gcs = nullptr;
+    OP_REQUIRES_OK(ctx, RetrieveGcsFs(ctx, &gcs));
+
+    string json_string;
+    OP_REQUIRES_OK(ctx, ParseScalarArgument<string>(ctx, "json", &json_string));
+
+    Json::Value json;
+    Json::Reader reader;
+    std::stringstream json_stream(json_string);
+    OP_REQUIRES(ctx, reader.parse(json_stream, json),
+                errors::InvalidArgument("Could not parse json: ", json_string));
+
+    OP_REQUIRES(
+        ctx, json.isMember("refresh_token") || json.isMember("private_key"),
+        errors::InvalidArgument("JSON format incompatible; did not find fields "
+                                "`refresh_token` or `private_key`."));
+
+    auto provider = absl::make_unique<ConstantAuthProvider>(json, ctx->env());
+
+    // Test getting a token
+    string dummy_token;
+    OP_REQUIRES_OK(ctx, provider->GetToken(&dummy_token));
+    OP_REQUIRES(ctx, !dummy_token.empty(),
+                errors::InvalidArgument(
+                    "Could not retrieve a token with the given credentials."));
+
+    // Set the provider.
+    gcs->underlying()->SetAuthProvider(std::move(provider));
+  }
+
+ private:
+  class ConstantAuthProvider : public AuthProvider {
+   public:
+    ConstantAuthProvider(const Json::Value& json,
+                         std::unique_ptr<OAuthClient> oauth_client, Env* env,
+                         int64 initial_retry_delay_usec)
+        : json_(json),
+          oauth_client_(std::move(oauth_client)),
+          env_(env),
+          initial_retry_delay_usec_(initial_retry_delay_usec) {}
+
+    ConstantAuthProvider(const Json::Value& json, Env* env)
+        : ConstantAuthProvider(json, absl::make_unique<OAuthClient>(), env,
+                               kInitialRetryDelayUsec) {}
+
+    ~ConstantAuthProvider() override {}
+
+    Status GetToken(string* token) override {
+      mutex_lock l(mu_);
+      const uint64 now_sec = env_->NowSeconds();
+
+      if (!current_token_.empty() &&
+          now_sec + kExpirationTimeMarginSec < expiration_timestamp_sec_) {
+        *token = current_token_;
+        return Status::OK();
+      }
+      if (json_.isMember("refresh_token")) {
+        TF_RETURN_IF_ERROR(oauth_client_->GetTokenFromRefreshTokenJson(
+            json_, kOAuthV3Url, &current_token_, &expiration_timestamp_sec_));
+      } else if (json_.isMember("private_key")) {
+        TF_RETURN_IF_ERROR(oauth_client_->GetTokenFromServiceAccountJson(
+            json_, kOAuthV4Url, kOAuthScope, &current_token_,
+            &expiration_timestamp_sec_));
+      } else {
+        return errors::FailedPrecondition(
+            "Unexpected content of the JSON credentials file.");
+      }
+
+      *token = current_token_;
+      return Status::OK();
+    }
+
+   private:
+    Json::Value json_;
+    std::unique_ptr<OAuthClient> oauth_client_;
+    Env* env_;
+
+    mutex mu_;
+    string current_token_ GUARDED_BY(mu_);
+    uint64 expiration_timestamp_sec_ GUARDED_BY(mu_) = 0;
+
+    // The initial delay for exponential backoffs when retrying failed calls.
+    const int64 initial_retry_delay_usec_;
+    TF_DISALLOW_COPY_AND_ASSIGN(ConstantAuthProvider);
+  };
+};
+
+REGISTER_KERNEL_BUILDER(Name("GcsConfigureCredentials").Device(DEVICE_CPU),
+                        GcsCredentialsOpKernel);
+
+class GcsBlockCacheOpKernel : public OpKernel {
+ public:
+  explicit GcsBlockCacheOpKernel(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+  void Compute(OpKernelContext* ctx) override {
+    // Get a handle to the GCS file system.
+    RetryingGcsFileSystem* gcs = nullptr;
+    OP_REQUIRES_OK(ctx, RetrieveGcsFs(ctx, &gcs));
+
+    size_t max_cache_size, block_size, max_staleness;
+    OP_REQUIRES_OK(ctx, ParseScalarArgument<size_t>(ctx, "max_cache_size",
+                                                    &max_cache_size));
+    OP_REQUIRES_OK(ctx,
+                   ParseScalarArgument<size_t>(ctx, "block_size", &block_size));
+    OP_REQUIRES_OK(
+        ctx, ParseScalarArgument<size_t>(ctx, "max_staleness", &max_staleness));
+
+    if (gcs->underlying()->block_size() == block_size &&
+        gcs->underlying()->max_bytes() == max_cache_size &&
+        gcs->underlying()->max_staleness() == max_staleness) {
+      LOG(INFO) << "Skipping resetting the GCS block cache.";
+      return;
+    }
+    gcs->underlying()->ResetFileBlockCache(block_size, max_cache_size,
+                                           max_staleness);
+  }
+};
+
+REGISTER_KERNEL_BUILDER(Name("GcsConfigureBlockCache").Device(DEVICE_CPU),
+                        GcsBlockCacheOpKernel);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/cloud/ops/gcs_config_ops.cc b/tensorflow/contrib/cloud/ops/gcs_config_ops.cc
new file mode 100644
index 0000000000..9cf85f5f18
--- /dev/null
+++ b/tensorflow/contrib/cloud/ops/gcs_config_ops.cc
@@ -0,0 +1,70 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+namespace tensorflow {
+
+REGISTER_OP("GcsConfigureCredentials")
+    .Input("json: string")
+    .SetShapeFn(shape_inference::NoOutputs)
+    .Doc(R"doc(
+Configures the credentials used by the GCS client of the local TF runtime.
+
+The json input can be of the format:
+
+1. Refresh Token:
+{
+  "client_id": "<redacted>",
+  "client_secret": "<redacted>",
+  "refresh_token": "<redacted>",
+  "type": "authorized_user",
+}
+
+2. Service Account:
+{
+  "type": "service_account",
+  "project_id": "<redacted>",
+  "private_key_id": "<redacted>",
+  "private_key": "------BEGIN PRIVATE KEY-----\n<REDACTED>\n-----END PRIVATE KEY------\n",
+  "client_email": "<REDACTED>@<REDACTED>.iam.gserviceaccount.com",
+  "client_id": "<REDACTED>",
+  # Some additional fields elided
+}
+
+Note the credentials established through this method are shared across all
+sessions run on this runtime.
+
+Note be sure to feed the inputs to this op to ensure the credentials are not
+stored in a constant op within the graph that might accidentally be checkpointed
+or in other ways be persisted or exfiltrated.
+)doc");
+
+REGISTER_OP("GcsConfigureBlockCache")
+    .Input("max_cache_size: uint64")
+    .Input("block_size: uint64")
+    .Input("max_staleness: uint64")
+    .SetShapeFn(shape_inference::NoOutputs)
+    .Doc(R"doc(
+Re-configures the GCS block cache with the new configuration values.
+
+If the values are the same as already configured values, this op is a no-op. If
+they are different, the current contents of the block cache is dropped, and a
+new block cache is created fresh.
+)doc");
+
+}  // namespace tensorflow
diff --git a/tensorflow/contrib/cloud/python/ops/gcs_config_ops.py b/tensorflow/contrib/cloud/python/ops/gcs_config_ops.py
new file mode 100644
index 0000000000..9ab124ae72
--- /dev/null
+++ b/tensorflow/contrib/cloud/python/ops/gcs_config_ops.py
@@ -0,0 +1,176 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""GCS file system configuration for TensorFlow."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import json
+
+from tensorflow.contrib.cloud.python.ops import gen_gcs_config_ops
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.training import training
+
+
+# @tf_export('contrib.cloud.BlockCacheParams')
+class BlockCacheParams(object):
+  """BlockCacheParams is a struct used for configuring the GCS Block Cache."""
+
+  def __init__(self, block_size=None, max_bytes=None, max_staleness=None):
+    self._block_size = block_size or 128 * 1024 * 1024
+    self._max_bytes = max_bytes or 2 * self._block_size
+    self._max_staleness = max_staleness or 0
+
+  @property
+  def block_size(self):
+    return self._block_size
+
+  @property
+  def max_bytes(self):
+    return self._max_bytes
+
+  @property
+  def max_staleness(self):
+    return self._max_staleness
+
+
+# @tf_export('contrib.cloud.ConfigureGcsHook')
+class ConfigureGcsHook(training.SessionRunHook):
+  """ConfigureGcsHook configures GCS when used with Estimator/TPUEstimator.
+
+  Example:
+
+  ```
+  sess = tf.Session()
+  refresh_token = raw_input("Refresh token: ")
+  client_secret = raw_input("Client secret: ")
+  client_id = "<REDACTED>"
+  creds = {
+      "client_id": client_id,
+      "refresh_token": refresh_token,
+      "client_secret": client_secret,
+      "type": "authorized_user",
+  }
+  tf.contrib.cloud.configure_gcs(sess, credentials=creds)
+  ```
+  """
+
+  def _verify_dictionary(self, creds_dict):
+    """Returns whether `creds_dict` has a `refresh_token` or `private_key`."""
+    return 'refresh_token' in creds_dict or 'private_key' in creds_dict
+
+  def __init__(self, credentials=None, block_cache=None):
+    """Constructs a ConfigureGcsHook.
+
+    Args:
+      credentials: A json-formatted string, or a `dict` that is dumped to JSON.
+      block_cache: A `BlockCacheParams`
+
+    Raises:
+      ValueError: If credentials is improperly formatted or block_cache is not a
+        BlockCacheParams.
+    """
+    if credentials is not None:
+      if isinstance(credentials, str):
+        try:
+          data = json.loads(credentials)
+        except ValueError as e:
+          raise ValueError('credentials was not a well formed JSON string.', e)
+        if not self._verify_dictionary(data):
+          raise ValueError('credentials has neither a "refresh_token" nor a '
+                           '"private_key" field.')
+      elif isinstance(credentials, dict):
+        if not self._verify_dictionary(credentials):
+          raise ValueError('credentials has neither a "refresh_token" nor a '
+                           '"private_key" field.')
+        credentials = json.dumps(credentials)
+      else:
+        raise ValueError('credentials is of an unknown type')
+
+    self._credentials = credentials
+
+    if block_cache and not isinstance(block_cache, BlockCacheParams):
+      raise ValueError('block_cache must be an instance of BlockCacheParams.')
+    self._block_cache = block_cache
+    # Built in begin(); None means there is nothing to run for that option.
+    self._credentials_op = self._block_cache_op = None
+
+  def begin(self):
+    """Builds the configuration ops for whichever options were supplied."""
+    if self._credentials:
+      self._credentials_placeholder = array_ops.placeholder(dtypes.string)
+      self._credentials_op = gen_gcs_config_ops.gcs_configure_credentials(
+          self._credentials_placeholder)
+    if self._block_cache:
+      self._block_cache_op = gen_gcs_config_ops.gcs_configure_block_cache(
+          max_cache_size=self._block_cache.max_bytes,
+          block_size=self._block_cache.block_size,
+          max_staleness=self._block_cache.max_staleness)
+
+  def after_create_session(self, session, coord):
+    """Runs the ops built in begin(), feeding credentials via placeholder."""
+    del coord
+    if self._credentials_op is not None:
+      session.run(self._credentials_op,
+                  feed_dict={self._credentials_placeholder: self._credentials})
+    if self._block_cache_op is not None:
+      session.run(self._block_cache_op)
+
+
+def configure_gcs(session, credentials=None, block_cache=None, device=None):
+  """Configures the GCS file system for a given session.
+
+  Args:
+    session: A `tf.Session` session that should be used to configure the GCS
+      file system.
+    credentials: [Optional.] A JSON string, or a `dict` that is dumped to JSON.
+    block_cache: [Optional.] A BlockCacheParams to configure the block cache.
+    device: [Optional.] The device to place the configure ops.
+  """
+
+  def configure(credentials, block_cache):
+    """Helper function to actually configure GCS."""
+    if credentials:
+      if isinstance(credentials, dict):
+        credentials = json.dumps(credentials)
+      placeholder = array_ops.placeholder(dtypes.string)
+      op = gen_gcs_config_ops.gcs_configure_credentials(placeholder)
+      session.run(op, feed_dict={placeholder: credentials})
+    if block_cache:
+      op = gen_gcs_config_ops.gcs_configure_block_cache(
+          max_cache_size=block_cache.max_bytes,
+          block_size=block_cache.block_size,
+          max_staleness=block_cache.max_staleness)
+      session.run(op)
+
+  if device:
+    with ops.device(device):
+      return configure(credentials, block_cache)
+  return configure(credentials, block_cache)
+
+
+def configure_colab_session(session):
+  """ConfigureColabSession configures the GCS file system in Colab.
+
+  Args:
+    session: A `tf.Session` session.
+  """
+  # Read from the application default credentials (adc).
+  with open('/content/datalab/adc.json') as f:
+    data = json.load(f)
+  configure_gcs(session, credentials=data)
diff --git a/tensorflow/contrib/cmake/tf_core_ops.cmake b/tensorflow/contrib/cmake/tf_core_ops.cmake
index e558691de4..bc753333db 100644
--- a/tensorflow/contrib/cmake/tf_core_ops.cmake
+++ b/tensorflow/contrib/cmake/tf_core_ops.cmake
@@ -113,6 +113,7 @@ GENERATE_CONTRIB_OP_LIBRARY(tensor_forest_stats "${tensorflow_source_dir}/tensor
 GENERATE_CONTRIB_OP_LIBRARY(text_skip_gram "${tensorflow_source_dir}/tensorflow/contrib/text/ops/skip_gram_ops.cc")
 GENERATE_CONTRIB_OP_LIBRARY(tpu "${tpu_ops_srcs}")
 GENERATE_CONTRIB_OP_LIBRARY(bigquery_reader "${tensorflow_source_dir}/tensorflow/contrib/cloud/ops/bigquery_reader_ops.cc")
+GENERATE_CONTRIB_OP_LIBRARY(gcs_config "${tensorflow_source_dir}/tensorflow/contrib/cloud/ops/gcs_config_ops.cc")
 GENERATE_CONTRIB_OP_LIBRARY(reduce_slice_ops "${tensorflow_source_dir}/tensorflow/contrib/reduce_slice_ops/ops/reduce_slice_ops.cc")
 
 ########################################################
diff --git a/tensorflow/contrib/cmake/tf_python.cmake b/tensorflow/contrib/cmake/tf_python.cmake
index 8d24a7ae38..61651f3007 100755
--- a/tensorflow/contrib/cmake/tf_python.cmake
+++ b/tensorflow/contrib/cmake/tf_python.cmake
@@ -420,6 +420,8 @@ GENERATE_PYTHON_OP_LIB("contrib_text_skip_gram_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/text/python/ops/gen_skip_gram_ops.py)
 GENERATE_PYTHON_OP_LIB("contrib_bigquery_reader_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/cloud/python/ops/gen_bigquery_reader_ops.py)
+GENERATE_PYTHON_OP_LIB("contrib_gcs_config_ops"
+  DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/cloud/python/ops/gen_gcs_config_ops.py)
 GENERATE_PYTHON_OP_LIB("stateless_random_ops"
   DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/contrib/stateless/gen_stateless_random_ops.py)
 GENERATE_PYTHON_OP_LIB("debug_ops"
diff --git a/tensorflow/core/platform/cloud/BUILD b/tensorflow/core/platform/cloud/BUILD
index 0fc1e4ae45..67651349ea 100644
--- a/tensorflow/core/platform/cloud/BUILD
+++ b/tensorflow/core/platform/cloud/BUILD
@@ -174,6 +174,7 @@ cc_library(
         "oauth_client.h",
     ],
     copts = tf_copts(),
+    visibility = ["//tensorflow:__subpackages__"],
     deps = [
         ":curl_http_request",
         ":http_request",
diff --git a/tensorflow/core/platform/cloud/gcs_file_system.cc b/tensorflow/core/platform/cloud/gcs_file_system.cc
index dc12c78a4b..632bb32063 100644
--- a/tensorflow/core/platform/cloud/gcs_file_system.cc
+++ b/tensorflow/core/platform/cloud/gcs_file_system.cc
@@ -290,51 +290,24 @@ Status GetBoolValue(const Json::Value& parent, const char* name, bool* result) {
 /// A GCS-based implementation of a random access file with an LRU block cache.
 class GcsRandomAccessFile : public RandomAccessFile {
  public:
-  using SignatureGenFun =
-      std::function<Status(const string& filename, int64* file_signature)>;
+  using ReadFn =
+      std::function<Status(const string& filename, uint64 offset, size_t n,
+                           StringPiece* result, char* scratch)>;
 
-  GcsRandomAccessFile(const string& filename, FileBlockCache* file_block_cache,
-                      const SignatureGenFun& signature_gen_fun)
-      : filename_(filename),
-        file_block_cache_(file_block_cache),
-        signature_gen_fun_(signature_gen_fun) {}
+  GcsRandomAccessFile(const string& filename, ReadFn read_fn)
+      : filename_(filename), read_fn_(std::move(read_fn)) {}
 
   /// The implementation of reads with an LRU block cache. Thread safe.
   Status Read(uint64 offset, size_t n, StringPiece* result,
               char* scratch) const override {
-    if (file_block_cache_->IsCacheEnabled()) {
-      int64 signature;
-      TF_RETURN_IF_ERROR(signature_gen_fun_(filename_, &signature));
-      if (!file_block_cache_->ValidateAndUpdateFileSignature(filename_,
-                                                             signature)) {
-        VLOG(1) << "File " << filename_
-                << " signature has been changed. Refreshing the cache.";
-      }
-    }
-
-    *result = StringPiece();
-    size_t bytes_transferred;
-    TF_RETURN_IF_ERROR(file_block_cache_->Read(filename_, offset, n, scratch,
-                                               &bytes_transferred));
-    *result = StringPiece(scratch, bytes_transferred);
-
-    if (bytes_transferred < n) {
-      // This is not an error per se. The RandomAccessFile interface expects
-      // that Read returns OutOfRange if fewer bytes were read than requested.
-      return errors::OutOfRange("EOF reached, ", result->size(),
-                                " bytes were read out of ", n,
-                                " bytes requested.");
-    }
-    return Status::OK();
+    return read_fn_(filename_, offset, n, result, scratch);
   }
 
  private:
   /// The filename of this file.
   const string filename_;
-  /// The LRU block cache for this file.
-  mutable FileBlockCache* file_block_cache_;  // not owned
-
-  const SignatureGenFun signature_gen_fun_;
+  /// The implementation of the read operation (provided by the GCSFileSystem).
+  const ReadFn read_fn_;
 };
 
 /// \brief GCS-based implementation of a writeable file.
@@ -797,21 +770,50 @@ Status GcsFileSystem::NewRandomAccessFile(
     const string& fname, std::unique_ptr<RandomAccessFile>* result) {
   string bucket, object;
   TF_RETURN_IF_ERROR(ParseGcsPath(fname, false, &bucket, &object));
-  result->reset(new GcsRandomAccessFile(
-      fname, file_block_cache_.get(),
-      [this, bucket, object](const string& fname, int64* signature) {
-        GcsFileStat stat;
-        TF_RETURN_IF_ERROR(stat_cache_->LookupOrCompute(
-            fname, &stat,
-            [this, bucket, object](const string& fname, GcsFileStat* stat) {
-              return UncachedStatForObject(fname, bucket, object, stat);
-            }));
-        *signature = stat.generation_number;
-        return Status::OK();
-      }));
+  result->reset(new GcsRandomAccessFile(fname, [this, bucket, object](
+                                                   const string& fname,
+                                                   uint64 offset, size_t n,
+                                                   StringPiece* result,
+                                                   char* scratch) {
+    tf_shared_lock l(block_cache_lock_);
+    if (file_block_cache_->IsCacheEnabled()) {
+      GcsFileStat stat;
+      TF_RETURN_IF_ERROR(stat_cache_->LookupOrCompute(
+          fname, &stat,
+          [this, bucket, object](const string& fname, GcsFileStat* stat) {
+            return UncachedStatForObject(fname, bucket, object, stat);
+          }));
+      if (!file_block_cache_->ValidateAndUpdateFileSignature(
+              fname, stat.generation_number)) {
+        VLOG(1)
+            << "File signature has been changed. Refreshing the cache. Path: "
+            << fname;
+      }
+    }
+    *result = StringPiece();
+    size_t bytes_transferred;
+    TF_RETURN_IF_ERROR(
+        file_block_cache_->Read(fname, offset, n, scratch, &bytes_transferred));
+    *result = StringPiece(scratch, bytes_transferred);
+    if (bytes_transferred < n) {
+      return errors::OutOfRange("EOF reached, ", result->size(),
+                                " bytes were read out of ", n,
+                                " bytes requested.");
+    }
+    return Status::OK();
+  }));
   return Status::OK();
 }
 
+void GcsFileSystem::ResetFileBlockCache(
+    size_t block_size_bytes, size_t max_bytes, uint64 max_staleness_secs) {
+  mutex_lock l(block_cache_lock_);
+  file_block_cache_ =
+      MakeFileBlockCache(block_size_bytes, max_bytes, max_staleness_secs);
+  // stats_ is only installed by SetStats(); it may still be unset here.
+  if (stats_) stats_->Configure(this, &throttle_, file_block_cache_.get());
+}
+
 // A helper function to build a FileBlockCache for GcsFileSystem.
 std::unique_ptr<FileBlockCache> GcsFileSystem::MakeFileBlockCache(
     size_t block_size, size_t max_bytes, uint64 max_staleness) {
@@ -880,6 +882,7 @@ Status GcsFileSystem::LoadBufferFromGCS(const string& filename, size_t offset,
 }
 
 void GcsFileSystem::ClearFileCaches(const string& fname) {
+  tf_shared_lock l(block_cache_lock_);
   file_block_cache_->RemoveFile(fname);
   stat_cache_->Delete(fname);
   // TODO(rxsang): Remove the patterns that matche the file in
@@ -1509,6 +1512,7 @@ Status GcsFileSystem::DeleteRecursively(const string& dirname,
 // reclaiming memory once filesystem operations are done (e.g. model is loaded),
 // or for resetting the filesystem to a consistent state.
 void GcsFileSystem::FlushCaches() {
+  tf_shared_lock l(block_cache_lock_);
   file_block_cache_->Flush();
   stat_cache_->Clear();
   matching_paths_cache_->Clear();
@@ -1517,8 +1521,15 @@ void GcsFileSystem::FlushCaches() {
 void GcsFileSystem::SetStats(GcsStatsInterface* stats) {
   CHECK(stats_ == nullptr) << "SetStats() has already been called.";
   CHECK(stats != nullptr);
+  mutex_lock l(block_cache_lock_);
   stats_ = stats;
-  stats_->Init(this, &throttle_, file_block_cache_.get());
+  stats_->Configure(this, &throttle_, file_block_cache_.get());
+}
+
+void GcsFileSystem::SetAuthProvider(
+    std::unique_ptr<AuthProvider> auth_provider) {
+  mutex_lock l(mu_);
+  auth_provider_ = std::move(auth_provider);
 }
 
 // Creates an HttpRequest and sets several parameters that are common to all
@@ -1531,7 +1542,11 @@ Status GcsFileSystem::CreateHttpRequest(std::unique_ptr<HttpRequest>* request) {
   }
 
   string auth_token;
-  TF_RETURN_IF_ERROR(AuthProvider::GetToken(auth_provider_.get(), &auth_token));
+  {
+    tf_shared_lock l(mu_);
+    TF_RETURN_IF_ERROR(
+        AuthProvider::GetToken(auth_provider_.get(), &auth_token));
+  }
 
   new_request->AddAuthBearerHeader(auth_token);
 
diff --git a/tensorflow/core/platform/cloud/gcs_file_system.h b/tensorflow/core/platform/cloud/gcs_file_system.h
index d543db1577..74768c98b5 100644
--- a/tensorflow/core/platform/cloud/gcs_file_system.h
+++ b/tensorflow/core/platform/cloud/gcs_file_system.h
@@ -43,9 +43,12 @@ class GcsFileSystem;
 /// time.
 class GcsStatsInterface {
  public:
-  /// Init is called by the GcsFileSystem immediately after being registered.
-  virtual void Init(GcsFileSystem* fs, GcsThrottle* throttle,
-                    const FileBlockCache* block_cache) = 0;
+  /// Configure is called by the GcsFileSystem to provide instrumentation hooks.
+  ///
+  /// Note: Configure can be called multiple times (e.g. if the block cache is
+  /// re-initialized).
+  virtual void Configure(GcsFileSystem* fs, GcsThrottle* throttle,
+                         const FileBlockCache* block_cache) = 0;
 
   /// RecordBlockLoadRequest is called to record a block load request is about
   /// to be made.
@@ -132,9 +135,18 @@ class GcsFileSystem : public FileSystem {
 
   /// These accessors are mainly for testing purposes, to verify that the
   /// environment variables that control these parameters are handled correctly.
-  size_t block_size() const { return file_block_cache_->block_size(); }
-  size_t max_bytes() const { return file_block_cache_->max_bytes(); }
-  uint64 max_staleness() const { return file_block_cache_->max_staleness(); }
+  size_t block_size() {
+    tf_shared_lock l(block_cache_lock_);
+    return file_block_cache_->block_size();
+  }
+  size_t max_bytes() {
+    tf_shared_lock l(block_cache_lock_);
+    return file_block_cache_->max_bytes();
+  }
+  uint64 max_staleness() {
+    tf_shared_lock l(block_cache_lock_);
+    return file_block_cache_->max_staleness();
+  }
   TimeoutConfig timeouts() const { return timeouts_; }
   string additional_header_name() const {
     return additional_header_ ? additional_header_->first : "";
@@ -190,6 +202,21 @@ class GcsFileSystem : public FileSystem {
 
   Status CreateHttpRequest(std::unique_ptr<HttpRequest>* request);
 
+  /// \brief Sets a new AuthProvider on the GCS FileSystem.
+  ///
+  /// The new auth provider will be used for all subsequent requests.
+  void SetAuthProvider(std::unique_ptr<AuthProvider> auth_provider);
+
+  /// \brief Resets the block cache and re-instantiates it with the new values.
+  ///
+  /// This method can be used to clear the existing block cache and/or to
+  /// re-configure the block cache for different values.
+  ///
+  /// Note: the previous block cache is destroyed as soon as it is replaced;
+  /// open files use whichever cache is current when they are next read.
+  void ResetFileBlockCache(size_t block_size_bytes, size_t max_bytes,
+                           uint64 max_staleness_secs);
+
  private:
   // GCS file statistics.
   struct GcsFileStat {
@@ -246,9 +273,14 @@ class GcsFileSystem : public FileSystem {
   // Clear all the caches related to the file with name `filename`.
   void ClearFileCaches(const string& fname);
 
-  std::unique_ptr<AuthProvider> auth_provider_;
+  mutex mu_;
+  std::unique_ptr<AuthProvider> auth_provider_ GUARDED_BY(mu_);
   std::unique_ptr<HttpRequest::Factory> http_request_factory_;
-  std::unique_ptr<FileBlockCache> file_block_cache_;
+  // block_cache_lock_ protects the file_block_cache_ pointer (Note that
+  // FileBlockCache instances are themselves threadsafe).
+  mutex block_cache_lock_;
+  std::unique_ptr<FileBlockCache> file_block_cache_
+      GUARDED_BY(block_cache_lock_);
   std::unique_ptr<GcsDnsCache> dns_cache_;
   GcsThrottle throttle_;
 
diff --git a/tensorflow/core/platform/cloud/gcs_file_system_test.cc b/tensorflow/core/platform/cloud/gcs_file_system_test.cc
index 3f73b238ad..6a28d9162f 100644
--- a/tensorflow/core/platform/cloud/gcs_file_system_test.cc
+++ b/tensorflow/core/platform/cloud/gcs_file_system_test.cc
@@ -2946,8 +2946,8 @@ TEST(GcsFileSystemTest, CreateHttpRequest) {
 
 class TestGcsStats : public GcsStatsInterface {
  public:
-  void Init(GcsFileSystem* fs, GcsThrottle* throttle,
-            const FileBlockCache* block_cache) override {
+  void Configure(GcsFileSystem* fs, GcsThrottle* throttle,
+                 const FileBlockCache* block_cache) override {
     CHECK(fs_ == nullptr);
     CHECK(throttle_ == nullptr);
     CHECK(block_cache_ == nullptr);
-- 
GitLab


From d15f77048558a7af16648146faca1c5d13d8d6e1 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 30 May 2018 14:55:54 -0700
Subject: [PATCH 070/610] Move RemoveInvolution optimization to optimizer
 stage.

PiperOrigin-RevId: 198624394
---
 .../optimizers/arithmetic_optimizer.cc        |  75 ++++++----
 .../optimizers/arithmetic_optimizer.h         |  14 +-
 .../optimizers/arithmetic_optimizer_test.cc   | 130 ++++++++++--------
 3 files changed, 128 insertions(+), 91 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
index 060e4200af..9c18c45f18 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
@@ -1162,10 +1162,8 @@ class MinimizeBroadcasts : public ArithmeticNodesGroupOptimizerStage {
 class RemoveIdentityTranspose : public ArithmeticOptimizerStage {
  public:
   explicit RemoveIdentityTranspose(const GraphOptimizerContext& ctx,
-                                   const ArithmeticOptimizerContext& ctx_ext,
-                                   RewriterConfig::Toggle opt_level)
-      : ArithmeticOptimizerStage("RemoveIdentityTranspose", ctx, ctx_ext),
-        opt_level_(opt_level) {}
+                                   const ArithmeticOptimizerContext& ctx_ext)
+      : ArithmeticOptimizerStage("RemoveIdentityTranspose", ctx, ctx_ext) {}
   ~RemoveIdentityTranspose() override = default;
 
   bool IsSupported(const NodeDef* node) const override {
@@ -1260,8 +1258,47 @@ class RemoveIdentityTranspose : public ArithmeticOptimizerStage {
     }
     return true;
   }
+};
+
+// An involution is an element-wise function f(x) that is its own inverse,
+// i.e. f(f(x)) = x. If we can find a chain of ops
+//   f->op1->op2->...opn->f
+// where op1 through opn preserve the values of their inputs, we can remove
+// the two instances of the involution from the graph, since they cancel
+// each other.
+class RemoveInvolution : public ArithmeticOptimizerStage {
+ public:
+  explicit RemoveInvolution(const GraphOptimizerContext& ctx,
+                            const ArithmeticOptimizerContext& ctx_ext)
+      : ArithmeticOptimizerStage("RemoveInvolution", ctx, ctx_ext) {}
+  ~RemoveInvolution() override = default;
+
+  bool IsSupported(const NodeDef* node) const override {
+    return IsInvolution(*node);
+  }
+
+  Status TrySimplify(NodeDef* node, string* simplified_node_name) override {
+    NodeDef* tail = GetTailOfValuePreservingChain(*node, *ctx().node_map,
+                                                  *ctx().nodes_to_preserve);
+
+    NodeDef* involution;
+    TF_RETURN_IF_ERROR(GetInputNode(tail->input(0), &involution));
+
+    if (involution->op() == node->op()) {
+      // Skip both *node and *involution since they cancel each other.
+      if (tail == node) {
+        // The two nodes to eliminate are adjacent.
+        *simplified_node_name = involution->input(0);
+      } else {
+        tail->set_input(0, involution->input(0));
+        ctx().node_map->UpdateInput(tail->name(), involution->name(),
+                                    involution->input(0));
+        *simplified_node_name = node->input(0);
+      }
+    }
 
-  RewriterConfig::Toggle opt_level_;
+    return Status::OK();
+  }
 };
 
 // Remove redundant Bitcasts.
@@ -2071,30 +2108,6 @@ void ArithmeticOptimizer::ForwardControlDependencies(
 // ArithmeticOptimizerStage
 string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
     const NodeDef* node, SetVector<NodeDef*>* nodes_to_simplify) {
-  // Remove involutions applied twice.
-  if (IsInvolution(*node)) {
-    // An involution is an element-wise function f(x) that is its own inverse,
-    // i.e. f(f(x)) = x. If we can find a chain of ops
-    //   f->op1->op2->...opn->f
-    // where op1 through opn preserve the values of their inputs, we can remove
-    // the two instances of the involution from the graph, since they cancel
-    // each other.
-    NodeDef* tail =
-        GetTailOfValuePreservingChain(*node, *node_map_, nodes_to_preserve_);
-    NodeDef* involution = node_map_->GetNode(tail->input(0));
-    if (involution->op() == node->op()) {
-      // Skip both *node and *involution since they cancel each other.
-      if (tail == node) {
-        // The two nodes to eliminate are adjacent.
-        return involution->input(0);
-      } else {
-        tail->set_input(0, involution->input(0));
-        node_map_->UpdateInput(tail->name(), involution->name(),
-                               involution->input(0));
-        return node->input(0);
-      }
-    }
-  }
 
   if (node->op() == "Reshape") {
     //   Reshape
@@ -2463,7 +2476,9 @@ Status ArithmeticOptimizer::SimplifyArithmeticOps(bool can_use_shapes) {
   if (options_.minimize_broadcasts && can_use_shapes)
     pipeline.AddStage<MinimizeBroadcasts>(ctx, ctx_ext);
   if (options_.remove_identity_transpose && can_use_shapes)
-    pipeline.AddStage<RemoveIdentityTranspose>(ctx, ctx_ext, opt_level_);
+    pipeline.AddStage<RemoveIdentityTranspose>(ctx, ctx_ext);
+  if (options_.remove_involution)
+    pipeline.AddStage<RemoveInvolution>(ctx, ctx_ext);
   if (options_.remove_redundant_bitcast)
     pipeline.AddStage<RemoveRedundantBitcastStage>(ctx, ctx_ext);
   if (options_.remove_redundant_cast)
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
index 8e1b3eda3b..962399119d 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
@@ -56,19 +56,21 @@ class ArithmeticOptimizer : public GraphOptimizer {
   struct ArithmeticOptimizerOptions {
     // TODO(ezhulenev): flag do disable TrySimplifyAndReplaceUses in tests.
     // Remove when all optimizers will be migrated to separate stages.
-    bool dedup_computations = true;
     bool enable_try_simplify_and_replace = true;
+
     bool combine_add_to_addn = true;
+    bool convert_sqrt_div_to_rsqrt_mul = false;
+    bool dedup_computations = true;
     bool hoist_common_factor_out_of_aggregation = true;
+    bool hoist_cwise_unary_chains = false;
     bool minimize_broadcasts = true;
+    bool remove_idempotent = true;
     bool remove_identity_transpose = true;
+    bool remove_involution = true;
+    bool remove_logical_not = true;
+    bool remove_negation = true;
     bool remove_redundant_bitcast = true;
     bool remove_redundant_cast = true;
-    bool remove_negation = true;
-    bool hoist_cwise_unary_chains = false;
-    bool convert_sqrt_div_to_rsqrt_mul = false;
-    bool remove_idempotent = true;
-    bool remove_logical_not = true;
 
     // Choose which arithmetic optimizer stages will be enabled for a given
     // optimization level by default.
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
index 64fdc8a83b..a908416e45 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
@@ -115,12 +115,17 @@ class ArithmeticOptimizerTest : public GrapplerTest {
     options.dedup_computations = false;
     options.enable_try_simplify_and_replace = false;
     options.combine_add_to_addn = false;
+    options.convert_sqrt_div_to_rsqrt_mul = false;
     options.hoist_common_factor_out_of_aggregation = false;
+    options.hoist_cwise_unary_chains = false;
     options.minimize_broadcasts = false;
     options.remove_identity_transpose = false;
+    options.remove_involution = false;
+    options.remove_idempotent = false;
     options.remove_redundant_bitcast = false;
     options.remove_redundant_cast = false;
     options.remove_negation = false;
+    options.remove_logical_not = false;
     optimizer->options_ = options;
   }
 
@@ -148,6 +153,11 @@ class ArithmeticOptimizerTest : public GrapplerTest {
     optimizer->options_.remove_identity_transpose = true;
   }
 
+  void EnableOnlyRemoveInvolution(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.remove_involution = true;
+  }
+
   void EnableOnlyRemoveRedundantBitcast(ArithmeticOptimizer* optimizer) {
     DisableAllStages(optimizer);
     optimizer->options_.remove_redundant_bitcast = true;
@@ -338,100 +348,110 @@ TEST_F(ArithmeticOptimizerTest, MulToSquare) {
   test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
 }
 
-TEST_F(ArithmeticOptimizerTest, SimplifyInvolutionsReal) {
+TEST_F(ArithmeticOptimizerTest, RemoveInvolution_AdjacentNodes) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
-  Output c = ops::Const(s.WithOpName("c"), {1.0f, 2.0f}, {1, 2});
-  Output neg1 = ops::Neg(s.WithOpName("neg1"), c);
-  Output neg2 = ops::Neg(s.WithOpName("neg2"), neg1);
-  Output recip1 = ops::Reciprocal(s.WithOpName("recip1"), neg2);
-  Output recip2 = ops::Reciprocal(s.WithOpName("recip2"), recip1);
-  Output id = ops::Identity(s.WithOpName("id"), recip2);
+
+  auto c = ops::Const(s.WithOpName("c"), {1.0f, 2.0f}, {1, 2});
+  auto neg1 = ops::Neg(s.WithOpName("neg1"), c);
+  auto neg2 = ops::Neg(s.WithOpName("neg2"), neg1);
+  auto recip1 = ops::Reciprocal(s.WithOpName("recip1"), neg2);
+  auto recip2 = ops::Reciprocal(s.WithOpName("recip2"), recip1);
+  auto id = ops::Identity(s.WithOpName("id"), recip2);
+
+  std::vector<string> fetch = {"id"};
+
   GrapplerItem item;
+  item.fetch = fetch;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
-  std::vector<string> fetch = {"id"};
   auto tensors_expected = EvaluateNodes(item.graph, fetch);
   EXPECT_EQ(1, tensors_expected.size());
 
-  ArithmeticOptimizer optimizer;
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
+  ArithmeticOptimizer optimizer;
+  EnableOnlyRemoveInvolution(&optimizer);
+  OptimizeAndPrune(&optimizer, &item, &output);
 
-  EXPECT_EQ(6, output.node_size());
+  // Negation and Reciprocal nodes cancelled each other.
+  EXPECT_EQ(2, output.node_size());
+  EXPECT_EQ("id", output.node(1).name());
   EXPECT_EQ("c", output.node(1).input(0));
-  EXPECT_EQ("c", output.node(3).input(0));
-  EXPECT_EQ("c", output.node(5).input(0));
 
   auto tensors = EvaluateNodes(output, fetch);
   EXPECT_EQ(1, tensors.size());
   test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
 }
 
-TEST_F(ArithmeticOptimizerTest, SimplifyInvolutionsWithChain) {
+TEST_F(ArithmeticOptimizerTest, RemoveInvolution_AroundValuePreservingChain) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
-  Output c = ops::Const(s.WithOpName("c"), {1.0f, 2.0f}, {1, 2});
-  Output recip1 = ops::Reciprocal(s.WithOpName("recip1"), c);
-  Output id1 = ops::Identity(s.WithOpName("id1"), recip1);
-  Output squeeze = ops::Squeeze(s.WithOpName("squeeze"), id1);
-  Output recip2 = ops::Reciprocal(s.WithOpName("recip2"), squeeze);
-  Output id2 = ops::Identity(s.WithOpName("id2"), recip2);
+
+  auto c = ops::Const(s.WithOpName("c"), {1.0f, 2.0f}, {1, 2});
+  auto recip1 = ops::Reciprocal(s.WithOpName("recip1"), c);
+  auto id1 = ops::Identity(s.WithOpName("id1"), recip1);
+  auto squeeze = ops::Squeeze(s.WithOpName("squeeze"), id1);
+  auto recip2 = ops::Reciprocal(s.WithOpName("recip2"), squeeze);
+  auto id2 = ops::Identity(s.WithOpName("id2"), recip2);
+
+  std::vector<string> fetch = {"id2"};
+
   GrapplerItem item;
+  item.fetch = fetch;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
-  std::vector<string> fetch = {"id2"};
   auto tensors_expected = EvaluateNodes(item.graph, fetch);
   EXPECT_EQ(1, tensors_expected.size());
 
-  ArithmeticOptimizer optimizer;
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
-  // Run the optimizer twice to make sure the rewrite is idempotent.
-  item.graph.Swap(&output);
-  status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
+  ArithmeticOptimizer optimizer;
+  EnableOnlyRemoveInvolution(&optimizer);
+  OptimizeTwiceAndPrune(&optimizer, &item, &output);
 
-  EXPECT_EQ(6, output.node_size());
-  EXPECT_EQ("squeeze", output.node(5).input(0));
-  EXPECT_EQ("c", output.node(2).input(0));
+  // Check that Reciprocal nodes were removed from the graph.
+  EXPECT_EQ(3, output.node_size());
+
+  // And const directly flows into squeeze.
+  int found = 0;
+  for (const NodeDef& node : output.node()) {
+    if (node.name() == "squeeze") {
+      EXPECT_EQ("c", node.input(0));
+      found++;
+    } else if (node.name() == "id2") {
+      EXPECT_EQ("squeeze", node.input(0));
+      found++;
+    }
+  }
+  EXPECT_EQ(2, found);
 
   auto tensors = EvaluateNodes(output, fetch);
   EXPECT_EQ(1, tensors.size());
   test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
 }
 
-TEST_F(ArithmeticOptimizerTest, SimplifyInvolutionsWithControlChain) {
+TEST_F(ArithmeticOptimizerTest, RemoveInvolution_SkipControlDependencies) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
-  Output c = ops::Const(s.WithOpName("c"), {1.0f, 2.0f}, {1, 2});
-  Output recip1 = ops::Reciprocal(s.WithOpName("recip1"), c);
-  Output id1 = ops::Identity(s.WithOpName("id1"), recip1);
-  Output squeeze = ops::Squeeze(s.WithOpName("squeeze"), id1);
-  Output recip2 = ops::Reciprocal(
+
+  auto c = ops::Const(s.WithOpName("c"), {1.0f, 2.0f}, {1, 2});
+  auto recip1 = ops::Reciprocal(s.WithOpName("recip1"), c);
+  auto id1 = ops::Identity(s.WithOpName("id1"), recip1);
+  auto squeeze = ops::Squeeze(s.WithOpName("squeeze"), id1);
+  auto recip2 = ops::Reciprocal(
       s.WithOpName("recip2").WithControlDependencies(squeeze), c);
-  Output id2 = ops::Identity(s.WithOpName("id2"), recip2);
+  auto id2 = ops::Identity(s.WithOpName("id2"), recip2);
+
+  std::vector<string> fetch = {"id2"};
+
   GrapplerItem item;
+  item.fetch = fetch;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
-  std::vector<string> fetch = {"id2"};
   auto tensors_expected = EvaluateNodes(item.graph, fetch);
   EXPECT_EQ(1, tensors_expected.size());
 
-  ArithmeticOptimizer optimizer;
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
+  ArithmeticOptimizer optimizer;
+  EnableOnlyRemoveInvolution(&optimizer);
+  OptimizeTwice(&optimizer, &item, &output);  // do not prune in this test
 
   // The optimizer should be a noop.
-  EXPECT_EQ(item.graph.node_size(), output.node_size());
-  for (int i = 0; i < item.graph.node_size(); ++i) {
-    const NodeDef& original = item.graph.node(i);
-    const NodeDef& optimized = output.node(i);
-    EXPECT_EQ(original.name(), optimized.name());
-    EXPECT_EQ(original.op(), optimized.op());
-    EXPECT_EQ(original.input_size(), optimized.input_size());
-    for (int j = 0; j < original.input_size(); ++j) {
-      EXPECT_EQ(original.input(j), optimized.input(j));
-    }
-  }
+  VerifyGraphsMatch(item.graph, output, __LINE__);
 
   auto tensors = EvaluateNodes(output, fetch);
   EXPECT_EQ(1, tensors.size());
@@ -2777,7 +2797,7 @@ TEST_F(ArithmeticOptimizerTest, RemoveLogicalNot) {
   ArithmeticOptimizer optimizer;
   EnableOnlyRemoveLogicalNot(&optimizer);
   OptimizeTwice(&optimizer, &item, &output);
-  LOG(INFO) << output.DebugString();
+
   int found = 0;
   for (const NodeDef& node : output.node()) {
     if (node.name() == "id_not_eq") {
-- 
GitLab


From 1962f0c5dd9096f6e198458e248abb78c50e402e Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 30 May 2018 15:03:25 -0700
Subject: [PATCH 071/610] Add kwargs support for tpu.outside_compilation

PiperOrigin-RevId: 198625799
---
 tensorflow/contrib/tpu/python/tpu/tpu.py           | 8 +++++---
 tensorflow/contrib/tpu/python/tpu/tpu_estimator.py | 2 +-
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/tensorflow/contrib/tpu/python/tpu/tpu.py b/tensorflow/contrib/tpu/python/tpu/tpu.py
index 7d165fdd6e..612cd0114b 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu.py
@@ -320,13 +320,15 @@ class TPUReplicateContext(control_flow_ops.XLAControlFlowContext):
     return None
 
 
-def outside_compilation(computation, args=None):
+def outside_compilation(computation, *args, **kwargs):
   """Builds part of a computation outside any current TPU replicate scope.
 
   Args:
     computation: A Python function that builds the computation to
       place on the host.
-    args: Inputs to pass to computation.
+    *args: the positional arguments for the computation.
+    **kwargs: the keyword arguments for the computation.
+
   Returns:
     The Tensors returned by computation.
   """
@@ -342,7 +344,7 @@ def outside_compilation(computation, args=None):
       context._EnterOutsideCompilationScope()  # pylint: disable=protected-access
     context = context.outer_context
 
-  retval = computation(*args)
+  retval = computation(*args, **kwargs)
 
   # If we are in a TPUReplicateContext, signal that we are no longer
   # outside_compilation
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
index aea9949290..aeb7ba536f 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
@@ -1806,7 +1806,7 @@ class TPUEstimator(estimator_lib.Estimator):
       export_outputs['classes'] =
         export_output_lib.ClassificationOutput(classes=classes)
 
-    tpu.outside_compilation(host_call, [logits])
+    tpu.outside_compilation(host_call, logits)
 
     ...
   ```
-- 
GitLab


From a317dfaf282bb5a000fecde8dbb9db3812370bd2 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 30 May 2018 15:24:17 -0700
Subject: [PATCH 072/610] Avoid recursion in ExpandDomain() to prevent call
 stack overflow.

PiperOrigin-RevId: 198629366
---
 .../compiler/xla/service/hlo_domain_map.cc    | 56 +++++++++++--------
 1 file changed, 32 insertions(+), 24 deletions(-)

diff --git a/tensorflow/compiler/xla/service/hlo_domain_map.cc b/tensorflow/compiler/xla/service/hlo_domain_map.cc
index acb54c260c..ebd5adb5d5 100644
--- a/tensorflow/compiler/xla/service/hlo_domain_map.cc
+++ b/tensorflow/compiler/xla/service/hlo_domain_map.cc
@@ -93,31 +93,39 @@ Status HloDomainMap::InsertDomain(
 
 Status HloDomainMap::ExpandDomain(HloInstruction* instruction,
                                   DomainMetadata::Domain* domain) const {
-  if (domain->reach_set.insert(instruction).second) {
-    // We should not be finding instructions with assigned domain here.
-    // If we assigned a domain to the instruction, it means that all the
-    // instructions reached by it, should have a domain as well.
-    int64 domain_id = FindOrDefault(instruction_to_domain_, instruction, -1);
-    TF_RET_CHECK(domain_id < 0) << "Instruction " << instruction->ToString()
-                                << " already has domain " << domain_id;
-    for (HloInstruction* operand : instruction->operands()) {
-      if (IsDomainInstruction(operand)) {
-        // The reach set instruction is a user of the domain instruction
-        // (the instruction sees the kDomain as operand).
-        // IOW the dataflow enters the domain through the kDomain instruction.
-        domain->enter_domains.insert(operand);
-      } else {
-        TF_RETURN_IF_ERROR(ExpandDomain(operand, domain));
+  std::vector<HloInstruction*> in_queue;
+  in_queue.push_back(instruction);
+  while (!in_queue.empty()) {
+    HloInstruction* current_instruction = in_queue.back();
+    in_queue.pop_back();
+    if (domain->reach_set.insert(current_instruction).second) {
+      // We should not be finding instructions with assigned domain here.
+      // If we assigned a domain to the instruction, it means that all the
+      // instructions reached by it should have a domain as well.
+      int64 domain_id =
+          FindOrDefault(instruction_to_domain_, current_instruction, -1);
+      TF_RET_CHECK(domain_id < 0)
+          << "Instruction " << current_instruction->ToString()
+          << " already has domain " << domain_id;
+      for (HloInstruction* operand : current_instruction->operands()) {
+        if (IsDomainInstruction(operand)) {
+          // The reach set instruction is a user of the domain instruction
+          // (the instruction sees the kDomain as operand).
+          // IOW the dataflow enters the domain through the kDomain instruction.
+          domain->enter_domains.insert(operand);
+        } else {
+          in_queue.push_back(operand);
+        }
       }
-    }
-    for (HloInstruction* user : instruction->users()) {
-      if (IsDomainInstruction(user)) {
-        // The reach set instruction is an operand of the domain instruction
-        // (the instruction sees the kDomain as user).
-        // IOW the dataflow exits the domain through the kDomain instruction.
-        domain->exit_domains.insert(user);
-      } else {
-        TF_RETURN_IF_ERROR(ExpandDomain(user, domain));
+      for (HloInstruction* user : current_instruction->users()) {
+        if (IsDomainInstruction(user)) {
+          // The reach set instruction is an operand of the domain instruction
+          // (the instruction sees the kDomain as user).
+          // IOW the dataflow exits the domain through the kDomain instruction.
+          domain->exit_domains.insert(user);
+        } else {
+          in_queue.push_back(user);
+        }
       }
     }
   }
-- 
GitLab


From 26253b108d453c48fe106d394c3d861468d3bfe5 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 30 May 2018 15:39:03 -0700
Subject: [PATCH 073/610] Add HloProto support to replay_computation

PiperOrigin-RevId: 198631733
---
 .../compiler/xla/tools/replay_computation.cc       | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/tensorflow/compiler/xla/tools/replay_computation.cc b/tensorflow/compiler/xla/tools/replay_computation.cc
index 2349fa919e..fc7e8002c7 100644
--- a/tensorflow/compiler/xla/tools/replay_computation.cc
+++ b/tensorflow/compiler/xla/tools/replay_computation.cc
@@ -83,7 +83,7 @@ struct Options {
 StatusOr<std::unique_ptr<Literal>> ReplayComputation(const HloSnapshot& module,
                                                      Client* client,
                                                      const Options& opts) {
-  TF_ASSIGN_OR_RETURN(auto computation, client->LoadSnapshot(module));
+  XlaComputation computation(module.hlo().hlo_module());
 
   std::vector<std::unique_ptr<GlobalData>> arguments;
   if (opts.use_fake_data) {
@@ -192,9 +192,15 @@ int RealMain(tensorflow::gtl::ArraySlice<char*> args, const Options& opts) {
     HloSnapshot snapshot;
     auto status = tensorflow::ReadBinaryProto(env, arg, &snapshot);
     if (!status.ok()) {
-      fprintf(stderr, "%s: is not HloSnapshot: %s.\n", arg,
-              status.ToString().c_str());
-      continue;
+      fprintf(stderr, "%s: is not HloSnapshot. Trying HloProto.\n", arg);
+      status = tensorflow::ReadBinaryProto(env, arg, snapshot.mutable_hlo());
+      if (!status.ok()) {
+        fprintf(stderr, "%s: is not HloSnapshot or HloProto: %s.\n", arg,
+                status.ToString().c_str());
+        continue;
+      }
+      CHECK(opts.use_fake_data)
+          << "HloProto input must be handled with --use_fake_data";
     }
     StatusOr<std::unique_ptr<Literal>> result_status =
         ReplayComputation(snapshot, client, opts);
-- 
GitLab


From 7b5d04c60437a415fc4edb5a97d939a1a3babe14 Mon Sep 17 00:00:00 2001
From: Alexandre Passos <apassos@google.com>
Date: Wed, 30 May 2018 15:50:43 -0700
Subject: [PATCH 074/610] Makes most variable writes depend on the cached
 value.

This disallows some undefined behavior with unordered reads and writes.

PiperOrigin-RevId: 198633444
---
 .../resource_variable_ops_test.py             |  7 ++++++
 .../python/ops/resource_variable_ops.py       | 23 ++++++++++++++++---
 2 files changed, 27 insertions(+), 3 deletions(-)

diff --git a/tensorflow/python/kernel_tests/resource_variable_ops_test.py b/tensorflow/python/kernel_tests/resource_variable_ops_test.py
index 846231fe81..972fbdb3d6 100644
--- a/tensorflow/python/kernel_tests/resource_variable_ops_test.py
+++ b/tensorflow/python/kernel_tests/resource_variable_ops_test.py
@@ -119,6 +119,13 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
           dtype=dtypes.int32, shape=[1], name="foo")
       self.assertGreater(len(handle.eval()), 0)
 
+  def testCachedValueReadBeforeWrite(self):
+    with self.test_session() as sess:
+      v = resource_variable_ops.ResourceVariable(0.0, caching_device="cpu:0")
+      sess.run(v.initializer)
+      value, _ = sess.run([v, v.assign_add(1.0)])
+      self.assertAllEqual(value, 0.0)
+
   def testAssignVariableDtypeMismatchEager(self):
     with context.eager_mode():
       handle = resource_variable_ops.var_handle_op(
diff --git a/tensorflow/python/ops/resource_variable_ops.py b/tensorflow/python/ops/resource_variable_ops.py
index e5b80200c0..e37e93ea35 100644
--- a/tensorflow/python/ops/resource_variable_ops.py
+++ b/tensorflow/python/ops/resource_variable_ops.py
@@ -576,6 +576,21 @@ class ResourceVariable(variables.Variable):
     self._constraint = None
     self._cached_shape_as_list = None
 
+  @contextlib.contextmanager
+  def _assign_dependencies(self):
+    """Makes assignments depend on the cached value, if any.
+
+    This prevents undefined behavior with reads not ordered wrt writes.
+
+    Yields:
+      None.
+    """
+    if self._cached_value is not None:
+      with ops.control_dependencies([self._cached_value]):
+        yield
+    else:
+      yield
+
   def __nonzero__(self):
     return self.__bool__()
 
@@ -865,7 +880,7 @@ class ResourceVariable(variables.Variable):
     # TODO(apassos): this here and below is not atomic. Consider making it
     # atomic if there's a way to do so without a performance cost for those who
     # don't need it.
-    with _handle_graph(self.handle):
+    with _handle_graph(self.handle), self._assign_dependencies():
       assign_sub_op = gen_resource_variable_ops.assign_sub_variable_op(
           self.handle, ops.convert_to_tensor(delta, dtype=self.dtype),
           name=name)
@@ -889,7 +904,7 @@ class ResourceVariable(variables.Variable):
       it will return the `Operation` that does the assignment, and when in eager
       mode it will return `None`.
     """
-    with _handle_graph(self.handle):
+    with _handle_graph(self.handle), self._assign_dependencies():
       assign_add_op = gen_resource_variable_ops.assign_add_variable_op(
           self.handle, ops.convert_to_tensor(delta, dtype=self.dtype),
           name=name)
@@ -921,6 +936,8 @@ class ResourceVariable(variables.Variable):
       it will return the `Operation` that does the assignment, and when in eager
       mode it will return `None`.
     """
+    # Note: not depending on the cached value here since this can be used to
+    # initialize the variable.
     with _handle_graph(self.handle):
       value_tensor = ops.convert_to_tensor(value, dtype=self.dtype)
       self._shape.assert_is_compatible_with(value_tensor.shape)
@@ -933,7 +950,7 @@ class ResourceVariable(variables.Variable):
   def _strided_slice_assign(self, begin, end, strides, value, name, begin_mask,
                             end_mask, ellipsis_mask, new_axis_mask,
                             shrink_axis_mask):
-    with _handle_graph(self.handle):
+    with _handle_graph(self.handle), self._assign_dependencies():
       return self._lazy_read(
           gen_array_ops.resource_strided_slice_assign(
               ref=self.handle,
-- 
GitLab


From 631d354b5b4959709dd16790ea9b1b9166ec10e2 Mon Sep 17 00:00:00 2001
From: Skye Wanderman-Milne <skyewm@google.com>
Date: Wed, 30 May 2018 16:00:26 -0700
Subject: [PATCH 075/610] Remove environment variable to disable C API.

This is staging for removing the _USE_C_API toggle altogether.

PiperOrigin-RevId: 198634886
---
 tensorflow/python/framework/ops.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py
index 3af0cc44a8..6f3bb5563b 100644
--- a/tensorflow/python/framework/ops.py
+++ b/tensorflow/python/framework/ops.py
@@ -59,11 +59,9 @@ from tensorflow.python.util import tf_contextlib
 from tensorflow.python.util.tf_export import tf_export
 
 
-# Temporary global switch determining if we should enable the work-in-progress
-# calls to the C API. Currently disabled by default but can be manually enabled
-# in code or via the environment variable. This will be removed once all
-# functionality is supported and there's no performance penalty with it enabled.
-_USE_C_API = os.getenv("TF_C_API_GRAPH_CONSTRUCTION", "1") is not "0"
+# Temporary global switches determining if we should enable the work-in-progress
+# calls to the C API. These will be removed once all functionality is supported.
+_USE_C_API = True
 _USE_C_SHAPES = os.getenv("TF_C_API_GRAPH_CONSTRUCTION_SHAPES", "0") is not "0"
 
 
-- 
GitLab


From 9285727b93b6f6d66af0fe10077ad01257e18cf1 Mon Sep 17 00:00:00 2001
From: Gunhan Gulsoy <gunan@google.com>
Date: Wed, 30 May 2018 16:05:33 -0700
Subject: [PATCH 076/610] Fix setuptools version to avoid a bad release.

---
 tensorflow/tools/ci_build/install/install_pip_packages.sh      | 3 +++
 .../tools/ci_build/install/install_python3.5_pip_packages.sh   | 2 +-
 .../tools/ci_build/install/install_python3.6_pip_packages.sh   | 2 +-
 3 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/tensorflow/tools/ci_build/install/install_pip_packages.sh b/tensorflow/tools/ci_build/install/install_pip_packages.sh
index 982161cefe..bd6c50bce9 100755
--- a/tensorflow/tools/ci_build/install/install_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_pip_packages.sh
@@ -21,6 +21,9 @@ set -e
 easy_install -U pip==9.0.3
 easy_install3 -U pip==9.0.3
 
+pip2 install --upgrade setuptools==39.1.0
+pip3 install --upgrade setuptools==39.1.0
+
 # Install pip packages from whl files to avoid the time-consuming process of
 # building from source.
 
diff --git a/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
index 204a82f647..0844c48980 100755
--- a/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
@@ -39,7 +39,7 @@ if [[ -z $pip35_version ]]; then
 fi
 
 set -e
-pip3.5 install --upgrade setuptools
+pip3.5 install --upgrade setuptools==39.1.0
 pip3.5 install --upgrade pip
 
 pip3.5 install --upgrade virtualenv
diff --git a/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh b/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh
index 275abeb669..fb183b0e4f 100755
--- a/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh
@@ -49,7 +49,7 @@ cd Python-3.6.1
 make altinstall
 ln -s /usr/local/bin/pip3.6 /usr/local/bin/pip3
 
-pip3 install --upgrade setuptools
+pip3 install --upgrade setuptools==39.1.0
 pip3 install --upgrade pip
 
 pip3 install --upgrade virtualenv
-- 
GitLab


From 8126c1d4c6df8029823a462a81186a64a1658384 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 30 May 2018 16:01:05 -0700
Subject: [PATCH 077/610] Makes empty() support uint8 on cpu.

PiperOrigin-RevId: 198634986
---
 tensorflow/core/kernels/inplace_ops.cc             |  1 +
 tensorflow/python/kernel_tests/inplace_ops_test.py | 12 +++++++-----
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/tensorflow/core/kernels/inplace_ops.cc b/tensorflow/core/kernels/inplace_ops.cc
index ef6ce0546b..8f51cc3819 100644
--- a/tensorflow/core/kernels/inplace_ops.cc
+++ b/tensorflow/core/kernels/inplace_ops.cc
@@ -476,6 +476,7 @@ REGISTER_EMPTY(string, CPU)
 REGISTER_EMPTY(int32, CPU)
 REGISTER_EMPTY(int64, CPU)
 REGISTER_EMPTY(bool, CPU)
+REGISTER_EMPTY(uint8, CPU)
 
 #if GOOGLE_CUDA
 
diff --git a/tensorflow/python/kernel_tests/inplace_ops_test.py b/tensorflow/python/kernel_tests/inplace_ops_test.py
index 0f95e13187..6e894365af 100644
--- a/tensorflow/python/kernel_tests/inplace_ops_test.py
+++ b/tensorflow/python/kernel_tests/inplace_ops_test.py
@@ -166,7 +166,8 @@ class InplaceOpsTest(test_util.TensorFlowTestCase):
 
   def testEmpty(self):
     for dtype in [
-        dtypes.float32, dtypes.float64, dtypes.int32, dtypes.int64, dtypes.bool
+        dtypes.float32, dtypes.float64, dtypes.int32, dtypes.int64, dtypes.bool,
+        dtypes.uint8
     ]:
       with self.test_session(use_gpu=True):
         test_shapes = [(), (1,), (2, 3), (0, 2), (2, 3, 5), (2, 0, 5)]
@@ -187,11 +188,12 @@ class InplaceOpsTest(test_util.TensorFlowTestCase):
           self.assertEqual(val.dtype, dtype.as_numpy_dtype)
           self.assertAllEqual(val, np.zeros(shape, dtype.as_numpy_dtype))
 
-        val = inplace_ops.empty((1, 2), dtypes.string, init=True).eval()
-        self.assertEqual(val.tolist(), [[b"", b""]])
+    with self.test_session(use_gpu=True):
+      val = inplace_ops.empty((1, 2), dtypes.string, init=True).eval()
+      self.assertEqual(val.tolist(), [[b"", b""]])
 
-        val = inplace_ops.empty((1, 2), dtypes.string, init=False).eval()
-        self.assertEqual(val.tolist(), [[b"", b""]])
+      val = inplace_ops.empty((1, 2), dtypes.string, init=False).eval()
+      self.assertEqual(val.tolist(), [[b"", b""]])
 
 
 if __name__ == "__main__":
-- 
GitLab


From dff3875cdca6a8cf49ee5ce4c0c970eda550157f Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 30 May 2018 16:17:45 -0700
Subject: [PATCH 078/610] Automated g4 rollback of changelist 198444757

PiperOrigin-RevId: 198637528
---
 .../compiler/jit/kernels/xla_launch_op.cc     |  2 +-
 .../compiler/jit/xla_compile_on_demand_op.cc  |  3 +-
 tensorflow/compiler/tf2xla/tf2xla.cc          |  3 +-
 tensorflow/compiler/tf2xla/xla_compiler.cc    | 71 +++++++++++++++--
 tensorflow/compiler/tf2xla/xla_compiler.h     |  7 +-
 .../compiler/tf2xla/xla_compiler_test.cc      | 78 ++++++++++++++++++-
 6 files changed, 147 insertions(+), 17 deletions(-)

diff --git a/tensorflow/compiler/jit/kernels/xla_launch_op.cc b/tensorflow/compiler/jit/kernels/xla_launch_op.cc
index 27287e0f96..902fe27acd 100644
--- a/tensorflow/compiler/jit/kernels/xla_launch_op.cc
+++ b/tensorflow/compiler/jit/kernels/xla_launch_op.cc
@@ -148,7 +148,7 @@ void XlaLocalLaunchBase::Compute(OpKernelContext* ctx) {
 
   XlaCompiler::Options options;
   options.client = client;
-  options.device_type = &cache->device_type();
+  options.device_type = cache->device_type();
   options.flib_def = ctx->function_library()->GetFunctionLibraryDefinition();
   options.graph_def_version = ctx->function_library()->graph_def_version();
   options.allow_cpu_custom_calls = (platform_id_ == se::host::kHostPlatformId);
diff --git a/tensorflow/compiler/jit/xla_compile_on_demand_op.cc b/tensorflow/compiler/jit/xla_compile_on_demand_op.cc
index ab644ff5a6..b1943d3e1a 100644
--- a/tensorflow/compiler/jit/xla_compile_on_demand_op.cc
+++ b/tensorflow/compiler/jit/xla_compile_on_demand_op.cc
@@ -151,8 +151,7 @@ Status XlaCompileOnDemandOp::Compile(
   core::ScopedUnref cache_ref(cache);
 
   XlaCompiler::Options options;
-  DeviceType device_type = metadata.jit_device_type();
-  options.device_type = &device_type;
+  options.device_type = metadata.jit_device_type();
   options.client = metadata.client();
   options.flib_def =
       new FunctionLibraryDefinition(OpRegistry::Global(), FunctionDefLibrary{});
diff --git a/tensorflow/compiler/tf2xla/tf2xla.cc b/tensorflow/compiler/tf2xla/tf2xla.cc
index 3a08aa8cf4..ac768b206e 100644
--- a/tensorflow/compiler/tf2xla/tf2xla.cc
+++ b/tensorflow/compiler/tf2xla/tf2xla.cc
@@ -263,8 +263,7 @@ Status ConvertGraphToXla(std::unique_ptr<Graph> graph, xla::Client* client,
   // Compile the graph into an XLA computation.
   XlaCompiler::Options compiler_options;
   compiler_options.client = client;
-  DeviceType device_type(DEVICE_CPU_XLA_JIT);
-  compiler_options.device_type = &device_type;
+  compiler_options.device_type = DeviceType(DEVICE_CPU_XLA_JIT);
   compiler_options.flib_def = &graph->flib_def();
   compiler_options.graph_def_version = graph->versions().producer();
   compiler_options.allow_cpu_custom_calls = true;
diff --git a/tensorflow/compiler/tf2xla/xla_compiler.cc b/tensorflow/compiler/tf2xla/xla_compiler.cc
index f7098917b1..2fce6166d4 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler.cc
+++ b/tensorflow/compiler/tf2xla/xla_compiler.cc
@@ -83,12 +83,9 @@ XlaCompiler::XlaCompiler(XlaCompiler::Options options)
     : options_(options),
       initialization_status_(Status::OK()),
       next_step_id_(1),
-      device_(
-          new XlaCompilationDevice(SessionOptions(), *options_.device_type)),
+      device_(new XlaCompilationDevice(SessionOptions(), options_.device_type)),
       device_mgr_({device_}) {
-  // We no longer need the device_type.
-  options_.device_type = nullptr;
-
+  CHECK(!options_.device_type.type_string().empty());
   if (options_.populate_resource_manager) {
     initialization_status_ =
         (*options_.populate_resource_manager)(device_->resource_manager());
@@ -659,6 +656,65 @@ Status XlaCompiler::CompileSingleOp(
   return CompileGraph(options, name, std::move(graph), args, result);
 }
 
+namespace {
+
+// Check that the ops of all non-functional nodes have been registered.
+string ValidateFunctionDef(const FunctionDef* fdef,
+                           const FunctionLibraryDefinition& flib_def) {
+  std::vector<string> invalid_ops;
+  for (const NodeDef& node : fdef->node_def()) {
+    const string& op = node.op();
+    if (op == FunctionLibraryDefinition::kGradientOp || flib_def.Find(op)) {
+      continue;
+    }
+    const OpDef* op_def;
+    if (!OpRegistry::Global()->LookUpOpDef(op, &op_def).ok()) {
+      invalid_ops.push_back(op);
+    }
+  }
+  return tensorflow::str_util::Join(invalid_ops, ", ");
+}
+
+// Check that the graph doesn't have any invalid nodes (e.g. incompatible with
+// given device_type, invalid data type, missing attributes...)
+Status ValidateGraph(const Graph* graph,
+                     const FunctionLibraryDefinition& flib_def,
+                     const DeviceType& device_type, const string& name) {
+  std::vector<string> invalid_ops;
+  for (const Node* node : graph->nodes()) {
+    if (node->type_string() == FunctionLibraryDefinition::kGradientOp) {
+      continue;
+    }
+    const FunctionDef* fdef = flib_def.Find(node->def().op());
+    if (fdef) {
+      string error_msg = ValidateFunctionDef(fdef, flib_def);
+      if (!error_msg.empty()) {
+        invalid_ops.push_back(
+            strings::StrCat(node->def().op(), ":{", error_msg, "}"));
+      }
+      continue;
+    }
+    const OpDef* op_def;
+    if (!OpRegistry::Global()->LookUpOpDef(node->def().op(), &op_def).ok()) {
+      invalid_ops.push_back(node->def().op());
+      continue;
+    }
+    TF_RETURN_IF_ERROR(ValidateNodeDef(node->def(), *op_def));
+    if (!FindKernelDef(device_type, node->def(), nullptr, nullptr).ok()) {
+      invalid_ops.push_back(node->def().op());
+    }
+  }
+  if (!invalid_ops.empty()) {
+    return errors::InvalidArgument(strings::StrCat(
+        "Detected unsupported operations when trying to compile graph ", name,
+        " on ", device_type.type_string(), ":",
+        tensorflow::str_util::Join(invalid_ops, ", ")));
+  }
+  return Status::OK();
+}
+
+}  // namespace
+
 Status XlaCompiler::CompileGraph(const XlaCompiler::CompileOptions& options,
                                  string const& name,
                                  std::unique_ptr<Graph> graph,
@@ -681,6 +737,11 @@ Status XlaCompiler::CompileGraph(const XlaCompiler::CompileOptions& options,
       FunctionalizeControlFlow(flib_runtime_->GetFunctionLibraryDefinition(),
                                graph.get(), local_flib_def_.get()));
 
+  // Detect invalid nodes.
+  // FunctionalizeControlFlow may remove some nodes from the graph.
+  TF_RETURN_IF_ERROR(ValidateGraph(graph.get(), *options_.flib_def,
+                                   options_.device_type, name));
+
   xla::XlaBuilder builder(name);
   XlaContext* context = new XlaContext(
       this, &builder, options_.allow_cpu_custom_calls,
diff --git a/tensorflow/compiler/tf2xla/xla_compiler.h b/tensorflow/compiler/tf2xla/xla_compiler.h
index bf496bd8bc..76f4c4c1ea 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler.h
+++ b/tensorflow/compiler/tf2xla/xla_compiler.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/tf2xla/host_compute_metadata.pb.h"
 #include "tensorflow/compiler/tf2xla/xla_compilation_device.h"
+#include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
@@ -244,9 +245,9 @@ class XlaCompiler {
   typedef std::function<TensorShape(const TensorShape&, DataType)>
       ShapeRepresentationFn;
   struct Options {
-    // Name of the compilation device to use. Needs to be live only during
-    // XlaCompiler's constructor.
-    const DeviceType* device_type = nullptr;
+    // Name of the compilation device to use. It must be set by the caller.
+    // The default empty value is invalid.
+    DeviceType device_type = DeviceType("");
 
     xla::Client* client = nullptr;
 
diff --git a/tensorflow/compiler/tf2xla/xla_compiler_test.cc b/tensorflow/compiler/tf2xla/xla_compiler_test.cc
index 55772ca324..5fbf4b952c 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler_test.cc
+++ b/tensorflow/compiler/tf2xla/xla_compiler_test.cc
@@ -45,8 +45,6 @@ namespace tensorflow {
 
 class XlaCompilerTest : public ::testing::Test {
  protected:
-  XlaCompilerTest() : cpu_device_type_(DEVICE_CPU_XLA_JIT) {}
-
   void SetUp() override {
     client_ = xla::ClientLibrary::LocalClientOrDie();
 
@@ -58,7 +56,7 @@ class XlaCompilerTest : public ::testing::Test {
 
   XlaCompiler::Options DefaultOptions() {
     XlaCompiler::Options options;
-    options.device_type = &cpu_device_type_;
+    options.device_type = DeviceType(DEVICE_CPU_XLA_JIT);
     options.client = client_;
     options.flib_def = flib_def_.get();
     return options;
@@ -68,7 +66,6 @@ class XlaCompilerTest : public ::testing::Test {
     return compiler->local_flib_def_.get();
   }
 
-  DeviceType cpu_device_type_;
   xla::Client* client_;
   std::unique_ptr<FunctionLibraryDefinition> flib_def_;
 };
@@ -979,5 +976,78 @@ TEST_F(XlaCompilerTest, ArgRetvalShapeRepresentationFunction) {
   EXPECT_TRUE(xla::LiteralTestUtil::Equal(*expected_literal, *actual_literal));
 }
 
+// Tests a graph which has a function with an invalid op.
+TEST_F(XlaCompilerTest, FunctionWithInvalidOp) {
+  XlaCompiler compiler(DefaultOptions());
+
+  FunctionDefLibrary flib;
+  FunctionDef fn = FillFn();
+  NodeDef* node = fn.add_node_def();
+  node->set_name("Invalid");
+  node->set_op("InvalidOp"); /* unsupported op */
+  node = fn.add_node_def();
+  node->set_name("Switch");
+  node->set_op("Switch"); /* control flow node */
+  *flib.add_function() = fn;
+
+  TF_ASSERT_OK(flib_def_->AddFunctionDef(fn));
+
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+
+  Scope scope = Scope::NewRootScope().ExitOnError();
+  auto value = ops::Const<int32>(scope.WithOpName("value"), 1, {});
+  auto shape = ops::Const<int32>(scope.WithOpName("shape"), {5}, {1});
+  TF_ASSERT_OK(scope.graph()->AddFunctionLibrary(flib));
+
+  NodeDef def;
+  TF_ASSERT_OK(NodeDefBuilder("fill_fn", "FillFn", flib_def_.get())
+                   .Input(value.name(), 0, DT_INT32)
+                   .Input(shape.name(), 1, DT_INT32)
+                   .Finalize(&def));
+  Status status;
+  Node* fill = scope.graph()->AddNode(def, &status);
+  TF_ASSERT_OK(status);
+  TF_ASSERT_OK(scope.DoShapeInference(fill));
+  scope.graph()->AddEdge(value.node(), 0, fill, 0);
+  scope.graph()->AddEdge(shape.node(), 0, fill, 1);
+
+  auto retval = ops::_Retval(scope.WithOpName("retval"), Output(fill), 0);
+
+  TF_ASSERT_OK(scope.ToGraph(graph.get()));
+
+  std::vector<XlaCompiler::Argument> args;
+  XlaCompiler::CompilationResult result;
+  status = compiler.CompileGraph(XlaCompiler::CompileOptions(), "fill",
+                                 std::move(graph), args, &result);
+  ASSERT_FALSE(status.ok());
+  EXPECT_TRUE(
+      str_util::StrContains(status.error_message(), "FillFn:{InvalidOp}"))
+      << status.error_message();
+}
+
+// Tests a graph which has a node with invalid data type.
+TEST_F(XlaCompilerTest, NodeWithInvalidDataType) {
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  NodeDef shape;
+  shape.set_name("Shape");
+  shape.set_op("Shape");
+  (*shape.mutable_attr())["T"].set_type(DT_INT32);
+  (*shape.mutable_attr())["out_type"].set_type(DT_BOOL); /* invalid type */
+  Status status;
+  Node* shape_node = graph->AddNode(shape, &status);
+  TF_ASSERT_OK(status);
+  graph->AddControlEdge(graph->source_node(), shape_node);
+
+  std::vector<XlaCompiler::Argument> args;
+  XlaCompiler::CompilationResult result;
+  XlaCompiler compiler(DefaultOptions());
+  status = compiler.CompileGraph(XlaCompiler::CompileOptions(), "invalid_type",
+                                 std::move(graph), args, &result);
+  ASSERT_FALSE(status.ok());
+  EXPECT_TRUE(str_util::StrContains(status.error_message(),
+                                    "is not in the list of allowed values"))
+      << status.error_message();
+}
+
 }  // namespace
 }  // namespace tensorflow
-- 
GitLab


From c9297e34f0ceef4afd970ee117aea9110bf8ae62 Mon Sep 17 00:00:00 2001
From: Karmel Allison <karmel@google.com>
Date: Wed, 30 May 2018 16:25:00 -0700
Subject: [PATCH 079/610] Add a convenience function,
 build_supervised_input_receiver_fn_from_input_fn, that takes an Estimator
 input_fn and returns an input receiver function.

PiperOrigin-RevId: 198638593
---
 .../contrib/tpu/python/tpu/tpu_estimator.py   |   4 +-
 tensorflow/python/BUILD                       |   1 -
 tensorflow/python/estimator/BUILD             |  20 ++++
 tensorflow/python/estimator/estimator.py      |  55 +++-------
 tensorflow/python/estimator/export/export.py  |  36 +++++++
 .../python/estimator/export/export_test.py    |  35 ++++++
 tensorflow/python/estimator/util.py           |  57 ++++++++++
 tensorflow/python/estimator/util_test.py      | 102 ++++++++++++++++++
 8 files changed, 267 insertions(+), 43 deletions(-)
 create mode 100644 tensorflow/python/estimator/util_test.py

diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
index aeb7ba536f..4465833f88 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
@@ -46,6 +46,7 @@ from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.estimator import estimator as estimator_lib
 from tensorflow.python.estimator import model_fn as model_fn_lib
+from tensorflow.python.estimator import util as estimator_util
 from tensorflow.python.estimator.export import export_output as export_output_lib
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -2748,7 +2749,8 @@ class _Inputs(object):
     """
     iterator = self._dataset.make_initializable_iterator()
     # pylint: disable=protected-access
-    hook = estimator_lib._DatasetInitializerHook(iterator)
+    hook = estimator_util._DatasetInitializerHook(iterator)
+    # pylint: enable=protected-access
     self._iterator = iterator
     return hook
 
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index 679ef93229..0542c2fc91 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -2699,7 +2699,6 @@ py_library(
         ":util",
         ":variables",
         "//tensorflow/python/eager:context",
-        "//tensorflow/python/estimator:util",
         "@six_archive//:six",
     ],
 )
diff --git a/tensorflow/python/estimator/BUILD b/tensorflow/python/estimator/BUILD
index 0754041f9e..9c4d58b177 100644
--- a/tensorflow/python/estimator/BUILD
+++ b/tensorflow/python/estimator/BUILD
@@ -446,7 +446,26 @@ py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
+        "//tensorflow/python:platform",
+        "//tensorflow/python:training",
         "//tensorflow/python:util",
+        "//tensorflow/python/data",
+    ],
+)
+
+py_test(
+    name = "util_test",
+    srcs = ["util_test.py"],
+    srcs_version = "PY2AND3",
+    tags = ["notsan"],  # b/67510291
+    deps = [
+        ":util",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:training",
+        "//tensorflow/python/data",
+        "//third_party/py/numpy",
+        "@six_archive//:six",
     ],
 )
 
@@ -598,6 +617,7 @@ py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
+        ":util",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
diff --git a/tensorflow/python/estimator/estimator.py b/tensorflow/python/estimator/estimator.py
index 331ee7490e..cfbf7e2ce5 100644
--- a/tensorflow/python/estimator/estimator.py
+++ b/tensorflow/python/estimator/estimator.py
@@ -32,10 +32,10 @@ from tensorflow.core.framework import summary_pb2
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.python.client import session as tf_session
-from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.eager import context
 from tensorflow.python.estimator import model_fn as model_fn_lib
 from tensorflow.python.estimator import run_config
+from tensorflow.python.estimator import util as estimator_util
 from tensorflow.python.estimator.export import export as export_helpers
 from tensorflow.python.estimator.export import export_output
 from tensorflow.python.framework import errors
@@ -964,17 +964,9 @@ class Estimator(object):
   def _get_features_from_input_fn(self, input_fn, mode):
     """Extracts the `features` from return values of `input_fn`."""
     result = self._call_input_fn(input_fn, mode)
-    input_hooks = []
-    if isinstance(result, dataset_ops.Dataset):
-      iterator = result.make_initializable_iterator()
-      input_hooks.append(_DatasetInitializerHook(iterator))
-      result = iterator.get_next()
-    if isinstance(result, (list, tuple)):
-      # Unconditionally drop the label (the second element of result).
-      result = result[0]
-
+    result, _, hooks = estimator_util.parse_input_fn_result(result)
     self._validate_features_in_predict_input(result)
-    return result, input_hooks
+    return result, hooks
 
   def _validate_features_in_predict_input(self, result):
     if not _has_dataset_or_queue_runner(result):
@@ -984,25 +976,13 @@ class Estimator(object):
 
   def _get_features_and_labels_from_input_fn(self, input_fn, mode):
     """Extracts the `features` and labels from return values of `input_fn`."""
-    input_hooks = []
     if self._distribution is not None and mode == model_fn_lib.ModeKeys.TRAIN:
       result = self._distribution.distribute_dataset(
           lambda: self._call_input_fn(input_fn, mode))
-      iterator = result.make_initializable_iterator()
-      input_hooks.append(_DatasetInitializerHook(iterator))
-      result = iterator.get_next()
     else:
       result = self._call_input_fn(input_fn, mode)
-      if isinstance(result, dataset_ops.Dataset):
-        iterator = result.make_initializable_iterator()
-        input_hooks.append(_DatasetInitializerHook(iterator))
-        result = iterator.get_next()
-    if isinstance(result, (list, tuple)):
-      if len(result) != 2:
-        raise ValueError(
-            'input_fn should return (features, labels) as a len 2 tuple.')
-      return result[0], result[1], input_hooks
-    return result, None, input_hooks
+
+    return estimator_util.parse_input_fn_result(result)
 
   def _extract_batch_length(self, preds_evaluated):
     """Extracts batch length of predictions."""
@@ -1067,9 +1047,15 @@ class Estimator(object):
       mode: ModeKeys
 
     Returns:
-      Either features or (features, labels) where features and labels are:
-        features - `Tensor` or dictionary of string feature name to `Tensor`.
-        labels - `Tensor` or dictionary of `Tensor` with labels.
+      The return value of the passed input_fn, which should be one of:
+
+        * A `tf.data.Dataset` object: Outputs of `Dataset` object must be a
+          tuple (features, labels) with same constraints as below.
+        * A tuple (features, labels): Where `features` is a `Tensor` or a
+          dictionary of string feature name to `Tensor` and `labels` is a
+          `Tensor` or a dictionary of string label name to `Tensor`. Both
+          `features` and `labels` are consumed by `model_fn`. They should
+          satisfy the expectation of `model_fn` from inputs.
 
     Raises:
       ValueError: if input_fn takes invalid arguments.
@@ -1610,19 +1596,6 @@ def _has_dataset_or_queue_runner(maybe_tensor):
   # Now, check queue.
   return ops.get_default_graph().get_collection(ops.GraphKeys.QUEUE_RUNNERS)
 
-
-class _DatasetInitializerHook(training.SessionRunHook):
-
-  def __init__(self, iterator):
-    self._iterator = iterator
-
-  def begin(self):
-    self._initializer = self._iterator.initializer
-
-  def after_create_session(self, session, coord):
-    del coord
-    session.run(self._initializer)
-
 VocabInfo = warm_starting_util.VocabInfo  # pylint: disable=invalid-name
 tf_export('estimator.VocabInfo', allow_multiple_exports=True)(VocabInfo)
 
diff --git a/tensorflow/python/estimator/export/export.py b/tensorflow/python/estimator/export/export.py
index 48ae8cd497..ff19a0a7f4 100644
--- a/tensorflow/python/estimator/export/export.py
+++ b/tensorflow/python/estimator/export/export.py
@@ -404,6 +404,42 @@ def build_raw_supervised_input_receiver_fn(features,
   return supervised_input_receiver_fn
 
 
+def build_supervised_input_receiver_fn_from_input_fn(input_fn, **input_fn_args):
+  """Get a function that returns a SupervisedInputReceiver matching an input_fn.
+
+  Note that this function calls the input_fn in a local graph in order to
+  extract features and labels. Placeholders are then created from those
+  features and labels in the default graph.
+
+  Args:
+    input_fn: An Estimator input_fn, which is a function that returns one of:
+
+      * A `tf.data.Dataset` object: Outputs of `Dataset` object must be a
+        tuple (features, labels) with same constraints as below.
+      * A tuple (features, labels): Where `features` is a `Tensor` or a
+        dictionary of string feature name to `Tensor` and `labels` is a
+        `Tensor` or a dictionary of string label name to `Tensor`. Both
+        `features` and `labels` are consumed by `model_fn`. They should
+        satisfy the expectation of `model_fn` from inputs.
+
+    **input_fn_args: set of kwargs to be passed to the input_fn. Note that
+      these will not be checked or validated here, and any errors raised by
+      the input_fn will be thrown to the top.
+
+  Returns:
+    A function taking no arguments that, when called, returns a
+    SupervisedInputReceiver. This function can be passed in as part of the
+    input_receiver_map when exporting SavedModels from Estimator with multiple
+    modes.
+  """
+  # Wrap the input_fn call in a graph to prevent sullying the default namespace
+  with ops.Graph().as_default():
+    result = input_fn(**input_fn_args)
+    features, labels, _ = util.parse_input_fn_result(result)
+  # Placeholders are created back in the default graph.
+  return build_raw_supervised_input_receiver_fn(features, labels)
+
+
 ### Below utilities are specific to SavedModel exports.
 
 
diff --git a/tensorflow/python/estimator/export/export_test.py b/tensorflow/python/estimator/export/export_test.py
index 0af587f2a8..a7074712c2 100644
--- a/tensorflow/python/estimator/export/export_test.py
+++ b/tensorflow/python/estimator/export/export_test.py
@@ -459,6 +459,41 @@ class ExportTest(test_util.TensorFlowTestCase):
     with self.assertRaises(ValueError):
       export.build_raw_supervised_input_receiver_fn(features, labels)
 
+  def test_build_supervised_input_receiver_fn_from_input_fn(self):
+    def dummy_input_fn():
+      return ({"x": constant_op.constant([[1], [1]]),
+               "y": constant_op.constant(["hello", "goodbye"])},
+              constant_op.constant([[1], [1]]))
+
+    input_receiver_fn = export.build_supervised_input_receiver_fn_from_input_fn(
+        dummy_input_fn)
+
+    with ops.Graph().as_default():
+      input_receiver = input_receiver_fn()
+      self.assertEqual(set(["x", "y"]),
+                       set(input_receiver.features.keys()))
+      self.assertIsInstance(input_receiver.labels, ops.Tensor)
+      self.assertEqual(set(["x", "y", "label"]),
+                       set(input_receiver.receiver_tensors.keys()))
+
+  def test_build_supervised_input_receiver_fn_from_input_fn_args(self):
+    def dummy_input_fn(feature_key="x"):
+      return ({feature_key: constant_op.constant([[1], [1]]),
+               "y": constant_op.constant(["hello", "goodbye"])},
+              {"my_label": constant_op.constant([[1], [1]])})
+
+    input_receiver_fn = export.build_supervised_input_receiver_fn_from_input_fn(
+        dummy_input_fn, feature_key="z")
+
+    with ops.Graph().as_default():
+      input_receiver = input_receiver_fn()
+      self.assertEqual(set(["z", "y"]),
+                       set(input_receiver.features.keys()))
+      self.assertEqual(set(["my_label"]),
+                       set(input_receiver.labels.keys()))
+      self.assertEqual(set(["z", "y", "my_label"]),
+                       set(input_receiver.receiver_tensors.keys()))
+
   def test_build_all_signature_defs_without_receiver_alternatives(self):
     receiver_tensor = array_ops.placeholder(dtypes.string)
     output_1 = constant_op.constant([1.])
diff --git a/tensorflow/python/estimator/util.py b/tensorflow/python/estimator/util.py
index e4e1d37f74..924ca309ff 100644
--- a/tensorflow/python/estimator/util.py
+++ b/tensorflow/python/estimator/util.py
@@ -24,6 +24,7 @@ import time
 
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training import training
 from tensorflow.python.util import compat
 from tensorflow.python.util import function_utils
 
@@ -72,3 +73,59 @@ def get_timestamped_dir(dir_base):
         result_dir, attempts, MAX_DIRECTORY_CREATION_ATTEMPTS))
   raise RuntimeError('Failed to obtain a unique export directory name after '
                      '{} attempts.'.format(MAX_DIRECTORY_CREATION_ATTEMPTS))
+
+
+def parse_input_fn_result(result):
+  """Gets features, labels, and hooks from the result of an Estimator input_fn.
+
+  Args:
+    result: output of an input_fn to an estimator, which should be one of:
+
+      * A `tf.data.Dataset` object: Outputs of `Dataset` object must be a
+        tuple (features, labels) with same constraints as below.
+      * A tuple (features, labels): Where `features` is a `Tensor` or a
+        dictionary of string feature name to `Tensor` and `labels` is a
+        `Tensor` or a dictionary of string label name to `Tensor`. Both
+        `features` and `labels` are consumed by `model_fn`. They should
+        satisfy the expectation of `model_fn` from inputs.
+
+  Returns:
+    Tuple of features, labels, and input_hooks, where features are as described
+    above, labels are as described above or None, and input_hooks are a list
+    of SessionRunHooks to be included when running.
+
+  Raises:
+    ValueError: if the result is a list or tuple of length != 2.
+  """
+  input_hooks = []
+  try:
+    # We can't just check whether this is a tf.data.Dataset instance here,
+    # as this is plausibly a PerDeviceDataset. Try treating as a dataset first.
+    iterator = result.make_initializable_iterator()
+  except AttributeError:
+    # Not a dataset or dataset-like-object. Move along.
+    pass
+  else:
+    input_hooks.append(_DatasetInitializerHook(iterator))
+    result = iterator.get_next()
+
+  if isinstance(result, (list, tuple)):
+    if len(result) != 2:
+      raise ValueError(
+          'input_fn should return (features, labels) as a len 2 tuple.')
+    return result[0], result[1], input_hooks
+  return result, None, input_hooks
+
+
+class _DatasetInitializerHook(training.SessionRunHook):
+  """Creates a SessionRunHook that initializes the passed iterator."""
+
+  def __init__(self, iterator):
+    self._iterator = iterator
+
+  def begin(self):
+    self._initializer = self._iterator.initializer
+
+  def after_create_session(self, session, coord):
+    del coord
+    session.run(self._initializer)
diff --git a/tensorflow/python/estimator/util_test.py b/tensorflow/python/estimator/util_test.py
new file mode 100644
index 0000000000..d7e0610779
--- /dev/null
+++ b/tensorflow/python/estimator/util_test.py
@@ -0,0 +1,102 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Tests for util.py."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.estimator import util
+from tensorflow.python.framework import constant_op
+from tensorflow.python.platform import test
+from tensorflow.python.training import training
+
+
+class UtilTest(test.TestCase):
+  """Tests for miscellaneous Estimator utils."""
+
+  def test_parse_input_fn_result_tuple(self):
+    def _input_fn():
+      features = constant_op.constant(np.arange(100))
+      labels = constant_op.constant(np.arange(100, 200))
+      return features, labels
+
+    features, labels, hooks = util.parse_input_fn_result(_input_fn())
+
+    with self.test_session() as sess:
+      vals = sess.run([features, labels])
+
+    self.assertAllEqual(vals[0], np.arange(100))
+    self.assertAllEqual(vals[1], np.arange(100, 200))
+    self.assertEqual(hooks, [])
+
+  def test_parse_input_fn_result_dataset(self):
+    def _input_fn():
+      features = np.expand_dims(np.arange(100), 0)
+      labels = np.expand_dims(np.arange(100, 200), 0)
+      return dataset_ops.Dataset.from_tensor_slices((features, labels))
+
+    features, labels, hooks = util.parse_input_fn_result(_input_fn())
+
+    with training.MonitoredSession(hooks=hooks) as sess:
+      vals = sess.run([features, labels])
+
+    self.assertAllEqual(vals[0], np.arange(100))
+    self.assertAllEqual(vals[1], np.arange(100, 200))
+    self.assertIsInstance(hooks[0], util._DatasetInitializerHook)
+
+  def test_parse_input_fn_result_features_only(self):
+    def _input_fn():
+      return constant_op.constant(np.arange(100))
+
+    features, labels, hooks = util.parse_input_fn_result(_input_fn())
+
+    with self.test_session() as sess:
+      vals = sess.run([features])
+
+    self.assertAllEqual(vals[0], np.arange(100))
+    self.assertEqual(labels, None)
+    self.assertEqual(hooks, [])
+
+  def test_parse_input_fn_result_features_only_dataset(self):
+    def _input_fn():
+      features = np.expand_dims(np.arange(100), 0)
+      return dataset_ops.Dataset.from_tensor_slices(features)
+
+    features, labels, hooks = util.parse_input_fn_result(_input_fn())
+
+    with training.MonitoredSession(hooks=hooks) as sess:
+      vals = sess.run([features])
+
+    self.assertAllEqual(vals[0], np.arange(100))
+    self.assertEqual(labels, None)
+    self.assertIsInstance(hooks[0], util._DatasetInitializerHook)
+
+  def test_parse_input_fn_result_invalid(self):
+    def _input_fn():
+      features = np.expand_dims(np.arange(100), 0)
+      labels = np.expand_dims(np.arange(100, 200), 0)
+      return dataset_ops.Dataset.from_tensor_slices((features, labels, labels))
+
+    with self.assertRaisesRegexp(ValueError, 'input_fn should return'):
+      util.parse_input_fn_result(_input_fn())
+
+
+if __name__ == '__main__':
+  test.main()
-- 
GitLab


From 1e007dfddd5c20f89300a2e3669f56db47e2154c Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 30 May 2018 16:27:26 -0700
Subject: [PATCH 080/610] Add SerialDeviceBatchScheduler which offers similar
 performance to the AdaptiveSharedBatchScheduler, but increased reliability
 and stability.

ASBS (AdaptiveSharedBatchScheduler) assumes request latency can be minimized at a specific number of batch processing threads. Under reasonable load, this is true and ASBS performs well, but under low load, latency is basically unaffected by the number of threads, and ASBS can learn a wide variety of 'optimal' values.  If load resumes suddenly, these values can give very poor latencies.  In most cases, ASBS will recover, eventually rediscovering the correct value, but we have observed other cases where the latency is so large and noisy that ASBS can't get a good signal to guide its learning and the number of threads remains stuck at the bad value.

In addition, the incremental learning nature of this algorithm means that ASBS is always exploring to some extent, which can give rise to periods of non-optimal latency. This is most significant at high utilization where the wrong number of threads can potentially overload the system.

ASBS uses latency as a proxy for keeping the TensorFlow processing pipeline optimally loaded. SDBS (SerialDeviceBatchScheduler), on the other hand, uses a direct measurement of the pipeline fullness, and adjusts its number of batch processing threads accordingly. This solves the exploration problem. SDBS solves the low load problem by not adjusting its thread count when the threads pass some idleness threshold.

PiperOrigin-RevId: 198638918
---
 tensorflow/core/kernels/batching_util/BUILD   |  21 +
 .../serial_device_batch_scheduler.h           | 548 ++++++++++++++++++
 .../serial_device_batch_scheduler_test.cc     | 394 +++++++++++++
 3 files changed, 963 insertions(+)
 create mode 100644 tensorflow/core/kernels/batching_util/serial_device_batch_scheduler.h
 create mode 100644 tensorflow/core/kernels/batching_util/serial_device_batch_scheduler_test.cc

diff --git a/tensorflow/core/kernels/batching_util/BUILD b/tensorflow/core/kernels/batching_util/BUILD
index de05c647d6..e292ff200a 100644
--- a/tensorflow/core/kernels/batching_util/BUILD
+++ b/tensorflow/core/kernels/batching_util/BUILD
@@ -126,6 +126,27 @@ tf_cc_test(
     ],
 )
 
+cc_library(
+    name = "serial_device_batch_scheduler",
+    hdrs = ["serial_device_batch_scheduler.h"],
+    deps = [
+        ":batch_scheduler",
+        "//tensorflow/core:lib",
+    ],
+)
+
+tf_cc_test(
+    name = "serial_device_batch_scheduler_test",
+    srcs = ["serial_device_batch_scheduler_test.cc"],
+    deps = [
+        ":fake_clock_env",
+        ":serial_device_batch_scheduler",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
 cc_library(
     name = "basic_batch_scheduler",
     hdrs = ["basic_batch_scheduler.h"],
diff --git a/tensorflow/core/kernels/batching_util/serial_device_batch_scheduler.h b/tensorflow/core/kernels/batching_util/serial_device_batch_scheduler.h
new file mode 100644
index 0000000000..518f2ff8a9
--- /dev/null
+++ b/tensorflow/core/kernels/batching_util/serial_device_batch_scheduler.h
@@ -0,0 +1,548 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_SERIAL_DEVICE_BATCH_SCHEDULER_H_
+#define TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_SERIAL_DEVICE_BATCH_SCHEDULER_H_
+
+#include <algorithm>
+#include <functional>
+#include <memory>
+#include <random>
+#include <unordered_map>
+#include <vector>
+
+#include "tensorflow/core/kernels/batching_util/batch_scheduler.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/platform/cpu_info.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/thread_annotations.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+namespace serving {
+namespace internal {
+template <typename TaskType>
+class SDBSBatch;
+
+template <typename TaskType>
+class SDBSQueue;
+}  // namespace internal
+
+// EXPERIMENTAL: API MAY BE SUBJECTED TO SUDDEN CHANGES.
+//
+// Shared batch scheduler designed for batches which are processed by a serial
+// device (e.g. GPU, TPU). When batch processing involves a mix of
+// parallelizable cpu work and non-parallelizable on-device work, overall
+// latency can be minimized by producing batches at a (load dependent) rate
+// which keeps the serial device uniformly busy.
+//
+// SerialDeviceBatchScheduler (SDBS) controls the batching rate by limiting the
+// allowed number of concurrently processed batches. Too large a limit causes
+// batches to pile up behind the serial device, adding to the overall batch
+// latency. Too small a limit underutilizes the serial device and harms latency
+// by forcing batches to wait longer to be processed. Feedback from the device
+// (i.e. avg number of batches directly pending on the device) is used to set
+// the correct limit.
+//
+// SDBS groups requests into per model batches which are processed when a batch
+// processing thread becomes available. SDBS prioritizes batches primarily by
+// age (i.e. the batch's oldest request) along with a configurable preference
+// for scheduling larger batches first.
+
+
+template <typename TaskType>
+class SerialDeviceBatchScheduler : public std::enable_shared_from_this<
+                                       SerialDeviceBatchScheduler<TaskType>> {
+ public:
+  ~SerialDeviceBatchScheduler();
+
+  struct Options {
+    // The name to use for the pool of batch threads.
+    string thread_pool_name = {"batch_threads"};
+    // Maximum number of batch processing threads.
+    int64 num_batch_threads = port::NumSchedulableCPUs();
+    // Although batch selection is primarily based on age, this parameter
+    // specifies a preference for larger batches.  A full batch will be
+    // scheduled before an older, nearly empty batch as long as the age gap is
+    // less than full_batch_scheduling_boost_micros.  The optimal value for this
+    // parameter should be on the order of the batch processing latency, but
+    // must be chosen carefully, as too large a value will harm tail latency.
+    int64 full_batch_scheduling_boost_micros = 0;
+    // The environment to use (typically only overridden by test code).
+    Env* env = Env::Default();
+    // Initial limit for number of batches being concurrently processed.
+    int64 initial_in_flight_batches_limit = 3;
+    // Returns the current number of batches directly waiting to be processed
+    // by the serial device (i.e. GPU, TPU).
+    std::function<int64()> get_pending_on_serial_device;
+    // Desired average number of batches directly waiting to be processed by the
+    // serial device. Small numbers of O(1) should deliver the best latency.
+    double target_pending = 2;
+    // Number of batches between potential adjustments of
+    // in_flight_batches_limit.  Larger numbers will reduce noise, but will be
+    // less responsive to sudden changes in workload.
+    int64 batches_to_average_over = 1000;
+  };
+
+  // Ownership is shared between the caller of Create() and any queues created
+  // via AddQueue().
+  static Status Create(
+      const Options& options,
+      std::shared_ptr<SerialDeviceBatchScheduler<TaskType>>* scheduler);
+
+  struct QueueOptions {
+    // Maximum size of each batch.
+    int max_batch_size = 1000;
+    // Maximum number of enqueued (i.e. non-scheduled) batches.
+    int max_enqueued_batches = 10;
+  };
+
+  using BatchProcessor = std::function<void(std::unique_ptr<Batch<TaskType>>)>;
+
+  // Adds queue (and its callback) to be managed by this scheduler.
+  Status AddQueue(const QueueOptions& options,
+                  BatchProcessor process_batch_callback,
+                  std::unique_ptr<BatchScheduler<TaskType>>* queue);
+
+  double in_flight_batches_limit() {
+    mutex_lock l(mu_);
+    return in_flight_batches_limit_;
+  }
+
+  double recent_low_traffic_ratio() {
+    mutex_lock l(mu_);
+    return recent_low_traffic_ratio_;
+  }
+
+ private:
+  // access to AddBatch(), RemoveQueue(), env().
+  friend class internal::SDBSQueue<TaskType>;
+
+  explicit SerialDeviceBatchScheduler(const Options& options);
+
+  // Continuously retrieves and processes batches.
+  void ProcessBatches();
+
+  // Notifies scheduler of non-empty batch which is eligible for processing.
+  void AddBatch(const internal::SDBSBatch<TaskType>* batch);
+
+  // Removes queue from scheduler.
+  void RemoveQueue(const internal::SDBSQueue<TaskType>* queue);
+
+  Env* env() const { return options_.env; }
+
+  const Options options_;
+
+  // Collection of batches added by AddBatch. Owned by scheduler until they are
+  // released for processing.
+  std::vector<const internal::SDBSBatch<TaskType>*> batches_ GUARDED_BY(mu_);
+
+  // Unowned queues and callbacks added by AddQueue.
+  std::unordered_map<const internal::SDBSQueue<TaskType>*, BatchProcessor>
+      queues_and_callbacks_ GUARDED_BY(mu_);
+
+  // Responsible for running the batch processing callbacks.
+  std::unique_ptr<thread::ThreadPool> batch_thread_pool_;
+
+  // Limit on number of batches which can be concurrently processed.
+  int64 in_flight_batches_limit_ GUARDED_BY(mu_);
+
+  // Number of batch processing threads.
+  int64 processing_threads_ GUARDED_BY(mu_) = 0;
+
+  // Number of batches processed since the last in_flight_batches_limit_
+  // adjustment.
+  int64 batch_count_ GUARDED_BY(mu_) = 0;
+
+  // Number of times since the last in_flight_batches_limit_ adjustment when a
+  // processing thread was available but there were no batches to process.
+  int64 no_batch_count_ GUARDED_BY(mu_) = 0;
+
+  // Sum of batches pending on the serial device since the last
+  // in_flight_batches_limit_ adjustment.
+  int64 pending_sum_ = 0;
+
+  // Sum of batch latencies since the last in_flight_batches_limit_ adjustment.
+  int64 batch_latency_sum_ = 0;
+
+  // Average period between which two consecutive batches begin processing.
+  int64 batch_period_micros_ = 0;
+
+  // Moving average tracking the fraction of recent in_flight_batches_limit_
+  // adjustments where the external traffic was not high enough to provide
+  // useful feedback for an adjustment.
+  double recent_low_traffic_ratio_ = 0;
+
+  mutex mu_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(SerialDeviceBatchScheduler);
+};
+
+//////////////////////////////////////////////////////////
+// Implementation details follow. API users need not read.
+
+namespace internal {
+// Consolidates tasks into batches, passing them off to the
+// SerialDeviceBatchScheduler for processing.
+template <typename TaskType>
+class SDBSQueue : public BatchScheduler<TaskType> {
+ public:
+  using QueueOptions =
+      typename SerialDeviceBatchScheduler<TaskType>::QueueOptions;
+
+  SDBSQueue(std::shared_ptr<SerialDeviceBatchScheduler<TaskType>> scheduler,
+            const QueueOptions& options);
+
+  ~SDBSQueue() override;
+
+  // Adds task to current batch. Fails if the task size is larger than the batch
+  // size or if the current batch is full and this queue's number of outstanding
+  // batches is at its maximum.
+  Status Schedule(std::unique_ptr<TaskType>* task) override;
+
+  // Number of tasks waiting to be scheduled.
+  size_t NumEnqueuedTasks() const override;
+
+  // Number of size 1 tasks which could currently be scheduled without failing.
+  size_t SchedulingCapacity() const override;
+
+  // Notifies queue that a batch is about to be scheduled; the queue should not
+  // place any more tasks in this batch.
+  void ReleaseBatch(const SDBSBatch<TaskType>* batch);
+
+  size_t max_task_size() const override { return options_.max_batch_size; }
+
+ private:
+  std::shared_ptr<SerialDeviceBatchScheduler<TaskType>> scheduler_;
+  const QueueOptions options_;
+  // Owned by scheduler_.
+  SDBSBatch<TaskType>* current_batch_ GUARDED_BY(mu_) = nullptr;
+  int64 num_enqueued_batches_ GUARDED_BY(mu_) = 0;
+  int64 num_enqueued_tasks_ GUARDED_BY(mu_) = 0;
+  mutable mutex mu_;
+  TF_DISALLOW_COPY_AND_ASSIGN(SDBSQueue);
+};
+
+// Batch which remembers when and by whom it was created.
+template <typename TaskType>
+class SDBSBatch : public Batch<TaskType> {
+ public:
+  SDBSBatch(SDBSQueue<TaskType>* queue, int64 creation_time_micros)
+      : queue_(queue), creation_time_micros_(creation_time_micros) {}
+
+  ~SDBSBatch() override {}
+
+  SDBSQueue<TaskType>* queue() const { return queue_; }
+
+  int64 creation_time_micros() const { return creation_time_micros_; }
+
+ private:
+  SDBSQueue<TaskType>* queue_;
+  const int64 creation_time_micros_;
+  TF_DISALLOW_COPY_AND_ASSIGN(SDBSBatch);
+};
+}  // namespace internal
+
+// ---------------- SerialDeviceBatchScheduler ----------------
+
+template <typename TaskType>
+Status SerialDeviceBatchScheduler<TaskType>::Create(
+    const Options& options,
+    std::shared_ptr<SerialDeviceBatchScheduler<TaskType>>* scheduler) {
+  if (options.num_batch_threads < 1) {
+    return errors::InvalidArgument("num_batch_threads must be positive; was ",
+                                   options.num_batch_threads);
+  }
+  if (options.initial_in_flight_batches_limit < 1) {
+    return errors::InvalidArgument(
+        "initial_in_flight_batches_limit must be positive; was ",
+        options.initial_in_flight_batches_limit);
+  }
+  if (options.initial_in_flight_batches_limit > options.num_batch_threads) {
+    return errors::InvalidArgument(
+        "initial_in_flight_batches_limit (",
+        options.initial_in_flight_batches_limit,
+        ") should not be larger than num_batch_threads (",
+        options.num_batch_threads, ")");
+  }
+  if (options.full_batch_scheduling_boost_micros < 0) {
+    return errors::InvalidArgument(
+        "full_batch_scheduling_boost_micros can't be negative; was ",
+        options.full_batch_scheduling_boost_micros);
+  }
+  if (options.batches_to_average_over < 1) {
+    return errors::InvalidArgument(
+        "batches_to_average_over should be "
+        "greater than or equal to 1; was ",
+        options.batches_to_average_over);
+  }
+  if (options.target_pending <= 0) {
+    return errors::InvalidArgument(
+        "target_pending should be larger than zero; was ",
+        options.target_pending);
+  }
+  if (!options.get_pending_on_serial_device) {
+    return errors::InvalidArgument(
+        "get_pending_on_serial_device must be "
+        "specified");
+  }
+  scheduler->reset(new SerialDeviceBatchScheduler<TaskType>(options));
+  return Status::OK();
+}
+
+template <typename TaskType>
+SerialDeviceBatchScheduler<TaskType>::SerialDeviceBatchScheduler(
+    const Options& options)
+    : options_(options),
+      in_flight_batches_limit_(options.initial_in_flight_batches_limit),
+      processing_threads_(options.initial_in_flight_batches_limit) {
+  batch_thread_pool_.reset(new thread::ThreadPool(
+      env(), options.thread_pool_name, options.num_batch_threads));
+  for (int i = 0; i < processing_threads_; i++) {
+    batch_thread_pool_->Schedule(
+        std::bind(&SerialDeviceBatchScheduler<TaskType>::ProcessBatches, this));
+  }
+}
+
+template <typename TaskType>
+SerialDeviceBatchScheduler<TaskType>::~SerialDeviceBatchScheduler() {
+  // Signal processing threads to exit.
+  {
+    mutex_lock l(mu_);
+    processing_threads_ = 0;
+  }
+  // Hangs until all threads finish.
+  batch_thread_pool_.reset();
+}
+
+template <typename TaskType>
+Status SerialDeviceBatchScheduler<TaskType>::AddQueue(
+    const QueueOptions& options, BatchProcessor process_batch_callback,
+    std::unique_ptr<BatchScheduler<TaskType>>* queue) {
+  if (options.max_batch_size <= 0) {
+    return errors::InvalidArgument("max_batch_size must be positive; was ",
+                                   options.max_batch_size);
+  }
+  if (options.max_enqueued_batches <= 0) {
+    return errors::InvalidArgument(
+        "max_enqueued_batches must be positive; was ",
+        options.max_enqueued_batches);
+  }
+  internal::SDBSQueue<TaskType>* SDBS_queue_raw;
+  queue->reset(SDBS_queue_raw = new internal::SDBSQueue<TaskType>(
+                   this->shared_from_this(), options));
+  mutex_lock l(mu_);
+  queues_and_callbacks_[SDBS_queue_raw] = process_batch_callback;
+  return Status::OK();
+}
+
+template <typename TaskType>
+void SerialDeviceBatchScheduler<TaskType>::AddBatch(
+    const internal::SDBSBatch<TaskType>* batch) {  // Called by SDBSQueue.
+  mutex_lock l(mu_);
+  batches_.push_back(batch);  // ProcessBatches() later selects from batches_.
+}
+
+template <typename TaskType>
+void SerialDeviceBatchScheduler<TaskType>::RemoveQueue(
+    const internal::SDBSQueue<TaskType>* queue) {  // Called by ~SDBSQueue().
+  mutex_lock l(mu_);
+  queues_and_callbacks_.erase(queue);  // Drops the queue's callback entry.
+}
+
+template <typename TaskType>
+void SerialDeviceBatchScheduler<TaskType>::ProcessBatches() {
+  const int64 kIdleThreadSleepTimeMicros = 1000;  // Used if no period is known.
+  const double kMaxNoBatchRatio = .1;
+  const double kLowTrafficMovingAverageFactor = .1;
+  for (;;) {
+    mu_.lock();
+    if (processing_threads_ < 1 ||  // Count was zeroed (see destructor): exit.
+        processing_threads_ > in_flight_batches_limit_) {  // Surplus thread.
+      processing_threads_--;
+      mu_.unlock();
+      break;
+    }
+    if (batches_.empty()) {
+      no_batch_count_++;  // Feeds the low-traffic check further down.
+      int64 sleep_time = batch_period_micros_ ? batch_period_micros_
+                                              : kIdleThreadSleepTimeMicros;
+      mu_.unlock();
+      env()->SleepForMicroseconds(sleep_time);
+      continue;
+    }
+    auto best_it = batches_.begin();  // Select the batch with the lowest score.
+    double best_score =
+        (*best_it)->creation_time_micros() -
+        options_.full_batch_scheduling_boost_micros * (*best_it)->size() /
+            static_cast<double>((*best_it)->queue()->max_task_size());
+    for (auto it = batches_.begin() + 1; it != batches_.end(); it++) {
+      const double score =
+          (*it)->creation_time_micros() -
+          options_.full_batch_scheduling_boost_micros * (*it)->size() /
+              static_cast<double>((*it)->queue()->max_task_size());
+      if (score < best_score) {
+        best_score = score;
+        best_it = it;
+      }
+    }
+    const internal::SDBSBatch<TaskType>* batch = *best_it;
+    batches_.erase(best_it);
+    // Queue may destroy itself after ReleaseBatch is called.
+    batch->queue()->ReleaseBatch(batch);
+    auto callback = queues_and_callbacks_[batch->queue()];
+    mu_.unlock();  // Run the callback without holding the scheduler lock.
+    int64 start_time = env()->NowMicros();
+    callback(std::unique_ptr<Batch<TaskType>>(  // Callback now owns the batch.
+        const_cast<internal::SDBSBatch<TaskType>*>(batch)));
+    int64 end_time = env()->NowMicros();
+    mu_.lock();  // Re-acquire to update the shared statistics.
+    batch_count_++;
+    batch_latency_sum_ += end_time - start_time;
+    pending_sum_ += options_.get_pending_on_serial_device();
+    if (batch_count_ == options_.batches_to_average_over) {
+      recent_low_traffic_ratio_ *= (1 - kLowTrafficMovingAverageFactor);
+      // Only adjust in_flight_batches_limit_ if external load is large enough
+      // to consistently provide batches. Otherwise we would (mistakenly) assume
+      // that the device is underutilized because in_flight_batches_limit_ is
+      // too small.
+      if (no_batch_count_ < kMaxNoBatchRatio * batch_count_) {
+        double avg_pending = pending_sum_ / static_cast<double>(batch_count_);
+        // Avg processing time / # of concurrent batches gives the avg period
+        // between which two consecutive batches begin processing. Used to set a
+        // reasonable sleep time for idle batch processing threads.
+        batch_period_micros_ =
+            batch_latency_sum_ / batch_count_ / in_flight_batches_limit_;
+        // When the processing pipeline is consistently busy, the average number
+        // of pending batches differs from in_flight_batches_limit_ by a
+        // load-dependent offset. Adjust in_flight_batches_limit_ to maintain
+        // the desired target pending.
+        in_flight_batches_limit_ +=
+            std::round(options_.target_pending - avg_pending);
+        in_flight_batches_limit_ = std::max(in_flight_batches_limit_, 1LL);
+        in_flight_batches_limit_ =
+            std::min(in_flight_batches_limit_, options_.num_batch_threads);
+        // Add extra processing threads if necessary.
+        if (processing_threads_ > 0 &&  // Skip when the count was zeroed.
+            processing_threads_ < in_flight_batches_limit_) {
+          int extra_threads = in_flight_batches_limit_ - processing_threads_;
+          for (int i = 0; i < extra_threads; i++) {
+            batch_thread_pool_->Schedule(std::bind(
+                &SerialDeviceBatchScheduler<TaskType>::ProcessBatches, this));
+          }
+          processing_threads_ = in_flight_batches_limit_;
+        }
+      } else {
+        recent_low_traffic_ratio_ += kLowTrafficMovingAverageFactor;
+      }
+      batch_count_ = 0;  // Start a fresh averaging window.
+      no_batch_count_ = 0;
+      pending_sum_ = 0;
+      batch_latency_sum_ = 0;
+    }
+    mu_.unlock();
+  }
+}
+
+// ---------------- SDBSQueue ----------------
+
+namespace internal {
+template <typename TaskType>
+SDBSQueue<TaskType>::SDBSQueue(
+    std::shared_ptr<SerialDeviceBatchScheduler<TaskType>> scheduler,
+    const QueueOptions& options)  // Stores both arguments; does no other work.
+    : scheduler_(scheduler), options_(options) {}
+
+template <typename TaskType>
+SDBSQueue<TaskType>::~SDBSQueue() {
+  // Poll until num_enqueued_batches_ drops to zero (see ReleaseBatch()).
+  const int kSleepMicros = 1000;  // Sleep between two checks.
+  for (;;) {
+    {
+      mutex_lock l(mu_);
+      if (num_enqueued_batches_ == 0) {
+        break;
+      }
+    }
+    scheduler_->env()->SleepForMicroseconds(kSleepMicros);
+  }
+  scheduler_->RemoveQueue(this);  // Unregister this queue's callback.
+}
+
+template <typename TaskType>
+Status SDBSQueue<TaskType>::Schedule(std::unique_ptr<TaskType>* task) {
+  SDBSBatch<TaskType>* new_batch = nullptr;  // Set only if a batch is created.
+  size_t size = (*task)->size();
+  if (size > options_.max_batch_size) {  // Task could never fit in any batch.
+    return errors::InvalidArgument("Task size ", size,
+                                   " is larger than maximum batch size ",
+                                   options_.max_batch_size);
+  }
+  {
+    mutex_lock l(mu_);
+    // Current batch cannot hold this task; close it and start another if the
+    if (current_batch_ &&  // max_enqueued_batches limit allows.
+        current_batch_->size() + size > options_.max_batch_size) {
+      if (num_enqueued_batches_ >= options_.max_enqueued_batches) {
+        return errors::Unavailable("The batch scheduling queue is full");
+      }
+      current_batch_->Close();
+      current_batch_ = nullptr;
+    }
+    if (!current_batch_) {
+      num_enqueued_batches_++;
+      current_batch_ = new_batch =
+          new SDBSBatch<TaskType>(this, scheduler_->env()->NowMicros());
+    }
+    current_batch_->AddTask(std::move(*task));  // '*task' is moved from here.
+    num_enqueued_tasks_++;
+  }
+  // AddBatch must be called outside of lock, since it may call ReleaseBatch.
+  if (new_batch != nullptr) scheduler_->AddBatch(new_batch);
+  return Status::OK();
+}
+
+template <typename TaskType>
+void SDBSQueue<TaskType>::ReleaseBatch(const SDBSBatch<TaskType>* batch) {
+  mutex_lock l(mu_);
+  num_enqueued_batches_--;  // 'batch' no longer counts as enqueued.
+  num_enqueued_tasks_ -= batch->num_tasks();
+  if (batch == current_batch_) {  // Open batch taken: stop adding tasks to it.
+    current_batch_->Close();
+    current_batch_ = nullptr;
+  }
+}
+
+template <typename TaskType>
+size_t SDBSQueue<TaskType>::NumEnqueuedTasks() const {
+  mutex_lock l(mu_);  // Counter is modified under mu_ by Schedule/ReleaseBatch.
+  return num_enqueued_tasks_;
+}
+
+template <typename TaskType>
+size_t SDBSQueue<TaskType>::SchedulingCapacity() const {
+  mutex_lock l(mu_);
+  const int current_batch_capacity =  // Room left in the open batch, if any.
+      current_batch_ ? options_.max_batch_size - current_batch_->size() : 0;
+  const int spare_batches =  // Batches that may still be created.
+      options_.max_enqueued_batches - num_enqueued_batches_;
+  return spare_batches * options_.max_batch_size + current_batch_capacity;
+}
+}  // namespace internal
+}  // namespace serving
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_KERNELS_BATCHING_UTIL_SERIAL_DEVICE_BATCH_SCHEDULER_H_
diff --git a/tensorflow/core/kernels/batching_util/serial_device_batch_scheduler_test.cc b/tensorflow/core/kernels/batching_util/serial_device_batch_scheduler_test.cc
new file mode 100644
index 0000000000..a2f8f9a03e
--- /dev/null
+++ b/tensorflow/core/kernels/batching_util/serial_device_batch_scheduler_test.cc
@@ -0,0 +1,394 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/batching_util/serial_device_batch_scheduler.h"
+
+#include "tensorflow/core/kernels/batching_util/fake_clock_env.h"
+#include "tensorflow/core/lib/core/notification.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace serving {
+namespace anonymous {
+
+class FakeTask : public BatchTask {  // Test task that only carries a size.
+ public:
+  explicit FakeTask(size_t size) : size_(size) {}
+
+  ~FakeTask() override = default;
+
+  size_t size() const override { return size_; }
+
+ private:
+  const size_t size_;  // Value reported by size().
+
+  TF_DISALLOW_COPY_AND_ASSIGN(FakeTask);
+};
+
+// Creates a FakeTask of size 'task_size' and passes it to
+// 'scheduler->Schedule()'. Returns the resulting status.
+Status ScheduleTask(size_t task_size, BatchScheduler<FakeTask>* scheduler) {
+  std::unique_ptr<FakeTask> task(new FakeTask(task_size));
+  Status status = scheduler->Schedule(&task);
+  // Schedule() should have consumed 'task' iff it returned Status::OK.
+  CHECK_EQ(status.ok(), task == nullptr);
+  return status;
+}
+
+// Creates a thread that waits on 'start' and then advances the fake clock in
+// 'env' in a loop until 'stop' is notified. Useful for allowing objects that
+// sleep on the fake clock to make progress and be destroyed.
+std::unique_ptr<Thread> CreateFakeClockAdvancerThread(
+    test_util::FakeClockEnv* env, Notification* start, Notification* stop) {
+  return std::unique_ptr<Thread>(Env::Default()->StartThread(
+      {}, "FakeClockAdvancerThread", [env, start, stop] {
+        start->WaitForNotification();
+        while (!stop->HasBeenNotified()) {
+          env->AdvanceByMicroseconds(10);  // Fake time moves in 10us steps.
+          Env::Default()->SleepForMicroseconds(10);  // Real sleep between steps.
+        }
+      }));
+}
+
+TEST(SerialDeviceBatchSchedulerTest, BadOptions) {  // Create() must reject each.
+  using Scheduler = SerialDeviceBatchScheduler<FakeTask>;
+  std::shared_ptr<Scheduler> scheduler;
+  Scheduler::Options default_options;
+  default_options.get_pending_on_serial_device = []() { return 0; };
+  Scheduler::Options options = default_options;
+  options.num_batch_threads = 0;  // Zero threads.
+  EXPECT_FALSE(Scheduler::Create(options, &scheduler).ok());
+  options = default_options;
+  options.initial_in_flight_batches_limit = 0;  // Zero initial limit.
+  EXPECT_FALSE(Scheduler::Create(options, &scheduler).ok());
+  options = default_options;
+  options.num_batch_threads = 5;
+  options.initial_in_flight_batches_limit = 8;  // Limit above thread count.
+  EXPECT_FALSE(Scheduler::Create(options, &scheduler).ok());
+  options = default_options;
+  options.batches_to_average_over = -5;  // Negative averaging window.
+  EXPECT_FALSE(Scheduler::Create(options, &scheduler).ok());
+  options = default_options;
+  options.target_pending = 0;  // Zero pending target.
+  EXPECT_FALSE(Scheduler::Create(options, &scheduler).ok());
+  options = Scheduler::Options();  // get_pending_on_serial_device left default.
+  EXPECT_FALSE(Scheduler::Create(options, &scheduler).ok());
+}
+
+TEST(SerialDeviceBatchSchedulerTest, InFlightBatchesLimit) {
+  SerialDeviceBatchScheduler<FakeTask>::Options options;
+  options.num_batch_threads = 3;
+  options.initial_in_flight_batches_limit = 2;  // Fewer than the 3 threads.
+  options.batches_to_average_over = 1000;  // Far more batches than this test.
+  options.get_pending_on_serial_device = []() { return 0; };
+  mutex mu;
+  int processed_batches = 0;  // Guarded by mu.
+  Notification finish_processing;
+  auto queue_callback = [&mu, &processed_batches, &finish_processing](
+                            std::unique_ptr<Batch<FakeTask>> batch) {
+    ASSERT_TRUE(batch->IsClosed());
+    EXPECT_GT(batch->num_tasks(), 0);
+    mu.lock();
+    int batch_num = ++processed_batches;
+    mu.unlock();
+    if (batch_num == 2) {
+      // Give third batch a chance to process if it's going to.
+      Env::Default()->SleepForMicroseconds(1000);
+      finish_processing.Notify();
+    }
+    if (batch_num == 3) {  // Third batch must not start before the release.
+      ASSERT_TRUE(finish_processing.HasBeenNotified());
+    }
+    finish_processing.WaitForNotification();  // Batch 1 blocks here.
+  };
+  std::shared_ptr<SerialDeviceBatchScheduler<FakeTask>> scheduler;
+  TF_ASSERT_OK(
+      SerialDeviceBatchScheduler<FakeTask>::Create(options, &scheduler));
+  std::unique_ptr<BatchScheduler<FakeTask>> queue1;
+  std::unique_ptr<BatchScheduler<FakeTask>> queue2;
+  std::unique_ptr<BatchScheduler<FakeTask>> queue3;
+  TF_ASSERT_OK(scheduler->AddQueue({}, queue_callback, &queue1));
+  TF_ASSERT_OK(scheduler->AddQueue({}, queue_callback, &queue2));
+  TF_ASSERT_OK(scheduler->AddQueue({}, queue_callback, &queue3));
+  // Create 3 batches, only 2 should be processed concurrently.
+  TF_ASSERT_OK(ScheduleTask(100, queue1.get()));
+  TF_ASSERT_OK(ScheduleTask(100, queue2.get()));
+  TF_ASSERT_OK(ScheduleTask(100, queue3.get()));
+}
+
+TEST(SerialDeviceBatchSchedulerTest, PendingOnSerialDevice) {
+  mutex mu;
+  int pending;  // Guarded by mu; batch 1 writes it before the first read.
+  SerialDeviceBatchScheduler<FakeTask>::Options options;
+  options.num_batch_threads = 3;
+  options.initial_in_flight_batches_limit = 1;
+  options.batches_to_average_over = 1;  // Re-evaluate after every batch.
+  options.target_pending = 3;
+  options.get_pending_on_serial_device = [&mu, &pending]() {
+    mutex_lock l(mu);
+    return pending;
+  };
+  std::shared_ptr<SerialDeviceBatchScheduler<FakeTask>> scheduler;
+  TF_ASSERT_OK(
+      SerialDeviceBatchScheduler<FakeTask>::Create(options, &scheduler));
+  // Make sure batch processing thread has gone to sleep.
+  Env::Default()->SleepForMicroseconds(1000);
+  int processed_batches = 0;  // Guarded by mu.
+  Notification start_processing;
+  auto queue_callback = [&mu, &processed_batches, &start_processing, &pending,
+                         &scheduler](std::unique_ptr<Batch<FakeTask>> batch) {
+    // Be careful with mutex mu to avoid potential deadlock with mutex mu_
+    // held in ProcessBatches() and in_flight_batches_limit().
+    int batch_num;
+    {
+      mutex_lock l(mu);
+      batch_num = ++processed_batches;
+    }
+    switch (batch_num) {  // Each case sets the value reported after this batch.
+      case 1:
+        start_processing.WaitForNotification();
+        {
+          mutex_lock l(mu);
+          pending = 2;
+        }
+        break;
+      case 2:
+        // No batches initially --> low traffic --> no adjustment.
+        CHECK_EQ(scheduler->in_flight_batches_limit(), 1);
+        {
+          mutex_lock l(mu);
+          pending = 3;
+        }
+        break;
+      case 3:
+        // Pending at target --> no adjustment.
+        CHECK_EQ(scheduler->in_flight_batches_limit(), 1);
+        {
+          mutex_lock l(mu);
+          pending = 1;
+        }
+        break;
+      case 4:
+        // Small pending --> 2 additional threads added.
+        CHECK_EQ(scheduler->in_flight_batches_limit(), 3);
+        {
+          mutex_lock l(mu);
+          pending = 3;
+        }
+        break;
+      default:
+        break;
+    }
+  };
+  std::unique_ptr<BatchScheduler<FakeTask>> queue;
+  TF_ASSERT_OK(scheduler->AddQueue({}, queue_callback, &queue));
+  // Create 4 batches.
+  for (int i = 0; i < 4; i++) {
+    TF_ASSERT_OK(ScheduleTask(800, queue.get()));
+  }
+  start_processing.Notify();  // Unblocks batch 1 now that all 4 are enqueued.
+}
+
+TEST(SerialDeviceBatchSchedulerTest, FullBatchSchedulingBoostMicros) {
+  test_util::FakeClockEnv env(Env::Default());
+  Notification start_teardown, stop_teardown;
+  std::unique_ptr<Thread> teardown_thread =
+      CreateFakeClockAdvancerThread(&env, &start_teardown, &stop_teardown);
+  {
+    SerialDeviceBatchScheduler<FakeTask>::Options options;
+    options.env = &env;  // Scheduler sleeps and timestamps on the fake clock.
+    options.initial_in_flight_batches_limit = 1;
+    options.batches_to_average_over = 1000;
+    options.full_batch_scheduling_boost_micros = 10;
+    options.get_pending_on_serial_device = []() { return 0; };
+    mutex mu;
+    int processed_batches = 0;  // Guarded by mu.
+    auto queue_callback =
+        [&mu, &processed_batches](std::unique_ptr<Batch<FakeTask>> batch) {
+          ASSERT_TRUE(batch->IsClosed());
+          mutex_lock l(mu);
+          processed_batches++;
+          switch (processed_batches) {  // Expected order: scores -7, -1, 0.
+            case 1:
+              EXPECT_EQ(1000, batch->size());
+              break;
+            case 2:
+              EXPECT_EQ(100, batch->size());
+              break;
+            case 3:
+              EXPECT_EQ(80, batch->size());
+              break;
+            default:
+              EXPECT_TRUE(false) << "Should only have 3 batches";
+          }
+        };
+    std::shared_ptr<SerialDeviceBatchScheduler<FakeTask>> scheduler;
+    TF_ASSERT_OK(
+        SerialDeviceBatchScheduler<FakeTask>::Create(options, &scheduler));
+    // Make sure batch processing thread has gone to sleep.
+    Env::Default()->SleepForMicroseconds(1000);
+    SerialDeviceBatchScheduler<FakeTask>::QueueOptions queue_options;
+    std::unique_ptr<BatchScheduler<FakeTask>> queue1;
+    std::unique_ptr<BatchScheduler<FakeTask>> queue2;
+    std::unique_ptr<BatchScheduler<FakeTask>> queue3;
+    queue_options.max_batch_size = 1000;
+    TF_ASSERT_OK(scheduler->AddQueue(queue_options, queue_callback, &queue1));
+    queue_options.max_batch_size = 1000;
+    TF_ASSERT_OK(scheduler->AddQueue(queue_options, queue_callback, &queue2));
+    queue_options.max_batch_size = 100;
+    TF_ASSERT_OK(scheduler->AddQueue(queue_options, queue_callback, &queue3));
+
+    TF_ASSERT_OK(ScheduleTask(100, queue1.get()));
+    // Batch just scheduled - creation time: 0, fullness: 0.1, sched score: -1
+    env.AdvanceByMicroseconds(3);
+    TF_ASSERT_OK(ScheduleTask(1000, queue2.get()));
+    // Batch just scheduled - creation time: 3, fullness: 1, sched score: -7
+    env.AdvanceByMicroseconds(5);
+    TF_ASSERT_OK(ScheduleTask(80, queue3.get()));
+    // Batch just scheduled - creation time: 8, fullness: .8, sched score: 0
+    // Release the batch processing thread.
+    env.AdvanceByMicroseconds(1000);
+    start_teardown.Notify();  // Keep the fake clock moving during teardown.
+  }
+  stop_teardown.Notify();
+}
+
+TEST(SerialDeviceBatchSchedulerTest, DeleteQueue) {
+  SerialDeviceBatchScheduler<FakeTask>::Options options;
+  options.initial_in_flight_batches_limit = 1;
+  options.batches_to_average_over = 1000;
+  options.get_pending_on_serial_device = []() { return 0; };
+  mutex mu;
+  int processed_batches = 0;  // Guarded by mu.
+  Notification finish_processing;
+  auto queue_callback = [&mu, &processed_batches, &finish_processing](
+                            std::unique_ptr<Batch<FakeTask>> batch) {
+    ASSERT_TRUE(batch->IsClosed());
+    EXPECT_GT(batch->num_tasks(), 0);
+    finish_processing.WaitForNotification();  // Hold batches until released.
+    mu.lock();
+    processed_batches++;
+    mu.unlock();
+  };
+  std::shared_ptr<SerialDeviceBatchScheduler<FakeTask>> scheduler;
+  TF_ASSERT_OK(
+      SerialDeviceBatchScheduler<FakeTask>::Create(options, &scheduler));
+  std::unique_ptr<BatchScheduler<FakeTask>> queue;
+  TF_ASSERT_OK(scheduler->AddQueue({}, queue_callback, &queue));
+
+  // Enqueue 2 tasks, should result in 2 batches.
+  for (int i = 0; i < 2; i++) {
+    TF_ASSERT_OK(ScheduleTask(800, queue.get()));
+  }
+  std::unique_ptr<Thread> queue_deleter(Env::Default()->StartThread(
+      {}, "QueueDeleterThread", [&queue, &mu, &processed_batches] {
+        // Delete queue, should be kept alive until empty.
+        queue.reset();
+        mutex_lock l(mu);
+        EXPECT_EQ(processed_batches, 2);  // Both batches ran before deletion.
+      }));
+  // Give queue_deleter thread time to delete queue.
+  Env::Default()->SleepForMicroseconds(1000);
+  finish_processing.Notify();
+}
+
+TEST(SerialDeviceBatchSchedulerTest, DeleteScheduler) {
+  SerialDeviceBatchScheduler<FakeTask>::Options options;
+  options.initial_in_flight_batches_limit = 1;
+  options.batches_to_average_over = 1000;
+  options.get_pending_on_serial_device = []() { return 0; };
+  mutex mu;
+  int processed_batches = 0;  // Guarded by mu.
+  Notification start_processing;
+  Notification finish_processing;
+  auto queue_callback =
+      [&mu, &processed_batches, &start_processing,
+       &finish_processing](std::unique_ptr<Batch<FakeTask>> batch) {
+        ASSERT_TRUE(batch->IsClosed());
+        EXPECT_GT(batch->num_tasks(), 0);
+        start_processing.WaitForNotification();  // Hold until reset() is done.
+        mutex_lock l(mu);
+        processed_batches++;
+        if (processed_batches == 2) {  // Last batch: let the test finish.
+          finish_processing.Notify();
+        }
+      };
+
+  std::shared_ptr<SerialDeviceBatchScheduler<FakeTask>> scheduler;
+  TF_ASSERT_OK(
+      SerialDeviceBatchScheduler<FakeTask>::Create(options, &scheduler));
+  std::unique_ptr<BatchScheduler<FakeTask>> queue;
+  TF_ASSERT_OK(scheduler->AddQueue({}, queue_callback, &queue));
+
+  // Enqueue 2 tasks, should result in 2 batches.
+  for (int i = 0; i < 2; i++) {
+    TF_ASSERT_OK(ScheduleTask(800, queue.get()));
+  }
+  // Drop the test's reference; the queue holds its own shared_ptr to it.
+  scheduler.reset();
+  start_processing.Notify();
+  finish_processing.WaitForNotification();
+}
+
+TEST(SerialDeviceBatchSchedulerTest, QueueCapacityInfo) {
+  SerialDeviceBatchScheduler<FakeTask>::Options options;
+  options.initial_in_flight_batches_limit = 1;
+  options.batches_to_average_over = 1000;
+  options.full_batch_scheduling_boost_micros = 1000;
+  options.get_pending_on_serial_device = []() { return 0; };
+  mutex mu;
+  int processed_batches = 0;  // Guarded by mu.
+  Notification finish_processing;
+  auto queue_callback = [&mu, &processed_batches, &finish_processing](
+                            std::unique_ptr<Batch<FakeTask>> batch) {
+    ASSERT_TRUE(batch->IsClosed());
+    EXPECT_GT(batch->num_tasks(), 0);
+    mu.lock();
+    int batch_num = ++processed_batches;
+    mu.unlock();
+    if (batch_num == 1) {  // First batch occupies the only processing thread.
+      finish_processing.WaitForNotification();
+    }
+  };
+  std::shared_ptr<SerialDeviceBatchScheduler<FakeTask>> scheduler;
+  TF_ASSERT_OK(
+      SerialDeviceBatchScheduler<FakeTask>::Create(options, &scheduler));
+  std::unique_ptr<BatchScheduler<FakeTask>> queue1;
+  std::unique_ptr<BatchScheduler<FakeTask>> queue2;
+  TF_ASSERT_OK(scheduler->AddQueue({}, queue_callback, &queue1));
+  TF_ASSERT_OK(scheduler->AddQueue({}, queue_callback, &queue2));
+
+  // Blocker task, should schedule first.
+  TF_ASSERT_OK(ScheduleTask(800, queue1.get()));
+  TF_ASSERT_OK(ScheduleTask(100, queue2.get()));
+
+  EXPECT_EQ(queue2->NumEnqueuedTasks(), 1);
+  EXPECT_EQ(queue2->SchedulingCapacity(), 9 * 1000 + 900);  // 9 spare + 900.
+  // Enqueue 2 more tasks, should fall in same batch.
+  TF_ASSERT_OK(ScheduleTask(100, queue2.get()));
+  TF_ASSERT_OK(ScheduleTask(200, queue2.get()));
+  EXPECT_EQ(queue2->NumEnqueuedTasks(), 3);
+  EXPECT_EQ(queue2->SchedulingCapacity(), 9 * 1000 + 600);  // Batch holds 400.
+  // Enqueue 1 more task, should create new batch.
+  TF_ASSERT_OK(ScheduleTask(700, queue2.get()));
+  EXPECT_EQ(queue2->NumEnqueuedTasks(), 4);
+  EXPECT_EQ(queue2->SchedulingCapacity(), 8 * 1000 + 300);  // New batch: 700.
+  finish_processing.Notify();
+}
+}  // namespace anonymous
+}  // namespace serving
+}  // namespace tensorflow
-- 
GitLab


From 82daf99029cce7a8001fffc14b533c930e88cfa6 Mon Sep 17 00:00:00 2001
From: Sanjoy Das <sanjoy@google.com>
Date: Wed, 30 May 2018 16:29:25 -0700
Subject: [PATCH 081/610] Always delete old while loop after LICM

Right now the old while loop can stick around if it had side effects, which is
incorrect.

PiperOrigin-RevId: 198639203
---
 tensorflow/compiler/xla/service/BUILD         |  1 +
 tensorflow/compiler/xla/service/while_util.cc | 10 +++--
 tensorflow/compiler/xla/service/while_util.h  | 12 ++++--
 .../compiler/xla/service/while_util_test.cc   | 43 +++++++++++++++++++
 tensorflow/compiler/xla/util.h                |  7 +++
 5 files changed, 66 insertions(+), 7 deletions(-)

diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index 4d653a0196..cd3d55e4f9 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -2920,6 +2920,7 @@ tf_cc_test(
     deps = [
         ":while_util",
         "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla/service:hlo_matchers",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/compiler/xla/tools/parser:hlo_parser",
diff --git a/tensorflow/compiler/xla/service/while_util.cc b/tensorflow/compiler/xla/service/while_util.cc
index ed20b36292..473eab2ea8 100644
--- a/tensorflow/compiler/xla/service/while_util.cc
+++ b/tensorflow/compiler/xla/service/while_util.cc
@@ -117,9 +117,13 @@ WhileUtil::MakeInstructionsLiveIn(
   HloInstruction* new_while = containing_computation->AddInstruction(
       HloInstruction::CreateWhile(new_while_shape, new_while_condition,
                                   new_while_body, new_while_init));
-  TF_RETURN_IF_ERROR(containing_computation->ReplaceInstruction(
-      while_instr, TupleUtil::ExtractPrefix(
-                       new_while, while_instr->shape().tuple_shapes_size())));
+
+  // We want to get rid of the old while instruction even if it has side
+  // effecting operations so we do a manual HloComputation::RemoveInstruction
+  // instead of relying on HloComputation::ReplaceInstruction.
+  TF_RETURN_IF_ERROR(while_instr->ReplaceAllUsesWith(TupleUtil::ExtractPrefix(
+      new_while, while_instr->shape().tuple_shapes_size())));
+  TF_RETURN_IF_ERROR(containing_computation->RemoveInstruction(while_instr));
 
   HloInstruction* while_body_param = new_while_body->parameter_instruction(0);
   std::vector<HloInstruction*> live_in_instructions;
diff --git a/tensorflow/compiler/xla/service/while_util.h b/tensorflow/compiler/xla/service/while_util.h
index 322d27b88c..e67636d80f 100644
--- a/tensorflow/compiler/xla/service/while_util.h
+++ b/tensorflow/compiler/xla/service/while_util.h
@@ -38,17 +38,21 @@ class WhileUtil {
   };
 
   // Replaces `while_instr` with a new while instruction that is equivalent to
-  // `while_instr`, except that it has all of the HLO instructions in
+  // `while_instr` except that it has all of the HLO instructions in
   // `instructions` as live-in, loop invariant values.  These new live in values
   // are represented as new elements appended to the parameter of the while
   // loop, which must be of tuple shape.  GetTupleElement instructions computing
   // each new live in value is returned in the `while_body_live_in_values`
   // vector.
   //
-  // Precondition: `while_instr` must have a tuple shaped state.
+  // Deletes `while_instr` after replacing it.
   //
-  // Every instruction in `instructions` must be contained in the computation
-  // that contains `while_instr`.
+  // Preconditions:
+  //
+  //   `while_instr` must have a tuple-shaped state.
+  //
+  //   Every instruction in `instructions` must be contained in the computation
+  //   that contains `while_instr`.
   static StatusOr<MakeInstructionsLiveInResult> MakeInstructionsLiveIn(
       HloInstruction* while_instr,
       tensorflow::gtl::ArraySlice<HloInstruction*> instructions);
diff --git a/tensorflow/compiler/xla/service/while_util_test.cc b/tensorflow/compiler/xla/service/while_util_test.cc
index 974bc542a3..bcc545c61d 100644
--- a/tensorflow/compiler/xla/service/while_util_test.cc
+++ b/tensorflow/compiler/xla/service/while_util_test.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_matchers.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
+#include "tensorflow/compiler/xla/util.h"
 
 namespace xla {
 namespace {
@@ -163,5 +164,47 @@ ENTRY main {
   ASSERT_EQ(gte_list.size(), 1);
   EXPECT_EQ((*gte_list.begin())->name(), "gte.0");
 }
+
+TEST(WhileUtilTest, AlwaysRemovePreviousWhileBody) {
+  const char* const hlo_string = R"(
+HloModule WhileWithSideEffects
+
+body {
+  param.b = (s32[], s32[]) parameter(0)
+  gte.0 = s32[] get-tuple-element(param.b), index=0
+  gte.1 = s32[] get-tuple-element(param.b), index=1
+  add = s32[] add(gte.0, gte.1)
+  ROOT tuple = (s32[], s32[]) tuple(gte.0, add)
+}
+
+cond {
+  param.c = (s32[], s32[]) parameter(0)
+  ROOT condition = pred[] infeed()
+}
+
+ENTRY main {
+  init = (s32[], s32[]) parameter(0)
+  to_make_live_in = f32[100] parameter(1)
+  ROOT while = (s32[], s32[]) while(init), condition=cond, body=body
+}
+)";  // NOTE(review): the infeed in cond is presumably the side effect.
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          tools::Parse(hlo_string));
+
+  HloComputation* main = module->GetComputationWithName("main");
+  HloInstruction* while_instr = main->root_instruction();  // The ROOT while.
+  HloInstruction* to_make_live_in = main->parameter_instruction(1);
+
+  TF_ASSERT_OK_AND_ASSIGN(  // Result is unused; only success is required.
+      WhileUtil::MakeInstructionsLiveInResult make_live_in_result,
+      WhileUtil::MakeInstructionsLiveIn(while_instr,
+                                        /*instructions=*/{to_make_live_in}));
+
+  auto is_while = [](const HloInstruction* instr) {
+    return instr->opcode() == HloOpcode::kWhile;
+  };
+  EXPECT_EQ(c_count_if(main->instructions(), is_while), 1);  // Old one is gone.
+}
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/util.h b/tensorflow/compiler/xla/util.h
index 7303640726..b4f45cc972 100644
--- a/tensorflow/compiler/xla/util.h
+++ b/tensorflow/compiler/xla/util.h
@@ -526,6 +526,13 @@ typename std::decay<T>::type c_accumulate(const Sequence& sequence, T&& init,
                          std::forward<BinaryOp>(binary_op));
 }
 
+template <typename C, typename Pred>  // Container-wide form of std::count_if.
+typename std::iterator_traits<  // Return type: the iterator's difference_type.
+    decltype(std::begin(std::declval<C>()))>::difference_type
+c_count_if(const C& c, Pred&& pred) {
+  return std::count_if(std::begin(c), std::end(c), std::forward<Pred>(pred));
+}
+
 template <typename C, typename Value>
 int64 FindIndex(const C& c, Value&& value) {
   auto it = c_find(c, std::forward<Value>(value));
-- 
GitLab


From a0c40500cce2ebb7bee552005bdcd3a8ab470172 Mon Sep 17 00:00:00 2001
From: Ruoxin Sang <rxsang@google.com>
Date: Wed, 30 May 2018 16:38:59 -0700
Subject: [PATCH 082/610] Regard a path as a directory if it ends with "/" in
 GCS. This implies the assumption that if a real GCS object has file name
 ending with "/", it is always a directory mark rather than an object carrying
 actual contents.

PiperOrigin-RevId: 198640604
---
 .../core/platform/cloud/gcs_file_system.cc    | 34 ++++++++------
 .../platform/cloud/gcs_file_system_test.cc    | 46 +++++++++++++++++++
 2 files changed, 67 insertions(+), 13 deletions(-)

diff --git a/tensorflow/core/platform/cloud/gcs_file_system.cc b/tensorflow/core/platform/cloud/gcs_file_system.cc
index 632bb32063..5f612b5f53 100644
--- a/tensorflow/core/platform/cloud/gcs_file_system.cc
+++ b/tensorflow/core/platform/cloud/gcs_file_system.cc
@@ -965,11 +965,16 @@ Status GcsFileSystem::FileExists(const string& fname) {
       return Status::OK();
     }
   }
-  bool result;
-  TF_RETURN_IF_ERROR(ObjectExists(fname, bucket, object, &result));
-  if (result) {
-    return Status::OK();
+
+  // Check if the object exists.
+  GcsFileStat stat;
+  const Status status = StatForObject(fname, bucket, object, &stat);
+  if (status.code() != errors::Code::NOT_FOUND) {
+    return status;
   }
+
+  // Check if the folder exists.
+  bool result;
   TF_RETURN_IF_ERROR(FolderExists(fname, &result));
   if (result) {
     return Status::OK();
@@ -982,11 +987,11 @@ Status GcsFileSystem::ObjectExists(const string& fname, const string& bucket,
   if (!result) {
     return errors::Internal("'result' cannot be nullptr.");
   }
-  GcsFileStat not_used_stat;
-  const Status status = StatForObject(fname, bucket, object, &not_used_stat);
+  GcsFileStat stat;
+  const Status status = StatForObject(fname, bucket, object, &stat);
   switch (status.code()) {
     case errors::Code::OK:
-      *result = true;
+      *result = !stat.base.is_directory;
       return Status::OK();
     case errors::Code::NOT_FOUND:
       *result = false;
@@ -1040,7 +1045,14 @@ Status GcsFileSystem::UncachedStatForObject(const string& fname,
           << "; mtime_nsec: " << stat->base.mtime_nsec
           << "; updated: " << updated;
 
-  stat->base.is_directory = false;
+  if (str_util::EndsWith(fname, "/")) {
+    // In GCS a path can be both a directory and a file, but it is uncommon for
+    // other file systems. To avoid the ambiguity, if a path ends with "/" in
+    // GCS, we always regard it as a directory mark or a virtual directory.
+    stat->base.is_directory = true;
+  } else {
+    stat->base.is_directory = false;
+  }
   return Status::OK();
 }
 
@@ -1059,11 +1071,7 @@ Status GcsFileSystem::StatForObject(const string& fname, const string& bucket,
       [this, &bucket, &object](const string& fname, GcsFileStat* stat) {
         return UncachedStatForObject(fname, bucket, object, stat);
       }));
-  if (stat->base.is_directory) {
-    return errors::NotFound(fname, " is a directory.");
-  } else {
-    return Status::OK();
-  }
+  return Status::OK();
 }
 
 Status GcsFileSystem::BucketExists(const string& bucket, bool* result) {
diff --git a/tensorflow/core/platform/cloud/gcs_file_system_test.cc b/tensorflow/core/platform/cloud/gcs_file_system_test.cc
index 6a28d9162f..e791ae5a19 100644
--- a/tensorflow/core/platform/cloud/gcs_file_system_test.cc
+++ b/tensorflow/core/platform/cloud/gcs_file_system_test.cc
@@ -1137,6 +1137,28 @@ TEST(GcsFileSystemTest, FileExists_StatCache) {
   }
 }
 
+TEST(GcsFileSystemTest, FileExists_DirectoryMark) {
+  std::vector<HttpRequest*> requests({new FakeHttpRequest(
+      "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
+      "dir%2F?fields=size%2Cgeneration%2Cupdated\n"
+      "Auth Token: fake_token\n"
+      "Timeouts: 5 1 10\n",
+      strings::StrCat("{\"size\": \"5\",\"generation\": \"1\","
+                      "\"updated\": \"2016-04-29T23:15:24.896Z\"}"))});
+  GcsFileSystem fs(
+      std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+      std::unique_ptr<HttpRequest::Factory>(
+          new FakeHttpRequestFactory(&requests)),
+      0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+      3600 /* stat cache max age */, 0 /* stat cache max entries */,
+      0 /* matching paths cache max age */,
+      0 /* matching paths cache max entries */, 0 /* initial retry delay */,
+      kTestTimeoutConfig, nullptr /* gcs additional header */);
+
+  TF_EXPECT_OK(fs.FileExists("gs://bucket/dir/"));
+  TF_EXPECT_OK(fs.IsDirectory("gs://bucket/dir/"));
+}
+
 TEST(GcsFileSystemTest, GetChildren_NoItems) {
   std::vector<HttpRequest*> requests({new FakeHttpRequest(
       "Uri: https://www.googleapis.com/storage/v1/b/bucket/o?"
@@ -2407,6 +2429,30 @@ TEST(GcsFileSystemTest, Stat_Cache_Flush) {
   }
 }
 
+TEST(GcsFileSystemTest, Stat_FilenameEndingWithSlash) {
+  std::vector<HttpRequest*> requests({new FakeHttpRequest(
+      "Uri: https://www.googleapis.com/storage/v1/b/bucket/o/"
+      "dir%2F?fields=size%2Cgeneration%2Cupdated\n"
+      "Auth Token: fake_token\n"
+      "Timeouts: 5 1 10\n",
+      strings::StrCat("{\"size\": \"5\",\"generation\": \"1\","
+                      "\"updated\": \"2016-04-29T23:15:24.896Z\"}"))});
+  GcsFileSystem fs(std::unique_ptr<AuthProvider>(new FakeAuthProvider),
+                   std::unique_ptr<HttpRequest::Factory>(
+                       new FakeHttpRequestFactory(&requests)),
+                   0 /* block size */, 0 /* max bytes */, 0 /* max staleness */,
+                   0 /* stat cache max age */, 0 /* stat cache max entries */,
+                   0 /* matching paths cache max age */,
+                   0 /* matching paths cache max entries */,
+                   0 /* initial retry delay*/, kTestTimeoutConfig,
+                   nullptr /* gcs additional header */);
+
+  FileStatistics stat;
+  TF_EXPECT_OK(fs.Stat("gs://bucket/dir/", &stat));
+  EXPECT_EQ(5, stat.length);
+  EXPECT_TRUE(stat.is_directory);
+}
+
 TEST(GcsFileSystemTest, IsDirectory_NotFound) {
   std::vector<HttpRequest*> requests(
       {new FakeHttpRequest(
-- 
GitLab


From 089571430135531664dbc12344d060d3252f38fa Mon Sep 17 00:00:00 2001
From: Sanjoy Das <sanjoy@google.com>
Date: Wed, 30 May 2018 16:54:00 -0700
Subject: [PATCH 083/610] [TF:XLA] Bump open source llvm revision to r333547

PiperOrigin-RevId: 198642698
---
 tensorflow/workspace.bzl | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index f4b935cbfe..16c1846e17 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -453,11 +453,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   tf_http_archive(
       name = "llvm",
       urls = [
-          "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/d3b4e8171138b4d39106fb3bea1b9b8d2bbd4001.tar.gz",
-          "https://github.com/llvm-mirror/llvm/archive/d3b4e8171138b4d39106fb3bea1b9b8d2bbd4001.tar.gz",
+          "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/bf13d093f13a295d71080614c3036ada591201d5.tar.gz",
+          "https://github.com/llvm-mirror/llvm/archive/bf13d093f13a295d71080614c3036ada591201d5.tar.gz",
       ],
-      sha256 = "03db53e502dd4fbdbbf1c470776315eeff665180ade32859cfb6c1e996bbf2a5",
-      strip_prefix = "llvm-d3b4e8171138b4d39106fb3bea1b9b8d2bbd4001",
+      sha256 = "3c5b4538a4df95090693bf6b758e861afc5b8c599592368f9dc57901f7560bd0",
+      strip_prefix = "llvm-bf13d093f13a295d71080614c3036ada591201d5",
       build_file = clean_dep("//third_party/llvm:llvm.BUILD"),
   )
 
-- 
GitLab


From 49535c9da686ea24f4e755e90fdaaa97f9f91b9d Mon Sep 17 00:00:00 2001
From: Justin Lebar <jlebar@google.com>
Date: Wed, 30 May 2018 17:00:50 -0700
Subject: [PATCH 084/610] [XLA] Switch replay_computation to use LocalClient.

This lets replay_computation build an executable once and run it
multiple times.  This is particularly important because in XLA:GPU, the
first run of an executable does some autotuning and therefore is
unrepresentative.

This change removes --xla_hlo_profile_last_run, because I don't see how
to support it in LocalClient -- LocalClient wants the do-profile bit to
be set when we *compile*.  (There may not be an easy fix for this; it
worked with regular Client because we were recompiling every time we
ran.)

PiperOrigin-RevId: 198643577
---
 .../compiler/xla/client/local_client.cc       |  5 ++
 tensorflow/compiler/xla/client/local_client.h |  5 ++
 .../compiler/xla/service/local_service.cc     | 11 +++
 .../compiler/xla/service/local_service.h      |  5 ++
 .../compiler/xla/tools/replay_computation.cc  | 90 ++++++++++---------
 5 files changed, 75 insertions(+), 41 deletions(-)

diff --git a/tensorflow/compiler/xla/client/local_client.cc b/tensorflow/compiler/xla/client/local_client.cc
index a7c55c6b2b..f9003373a6 100644
--- a/tensorflow/compiler/xla/client/local_client.cc
+++ b/tensorflow/compiler/xla/client/local_client.cc
@@ -304,6 +304,11 @@ StatusOr<std::unique_ptr<Literal>> LocalClient::ShapedBufferToLiteral(
                                                                  shaped_buffer);
 }
 
+StatusOr<const ShapedBuffer*> LocalClient::GlobalDataToShapedBuffer(
+    const GlobalDataHandle& data, int replica_number) {
+  return local_service_->GlobalDataToShapedBuffer(data, replica_number);
+}
+
 Status LocalClient::TransferToInfeedLocal(const Literal& literal,
                                           int device_ordinal) {
   TF_ASSIGN_OR_RETURN(se::StreamExecutor * executor,
diff --git a/tensorflow/compiler/xla/client/local_client.h b/tensorflow/compiler/xla/client/local_client.h
index 3f23e52fc2..5b408cc6b2 100644
--- a/tensorflow/compiler/xla/client/local_client.h
+++ b/tensorflow/compiler/xla/client/local_client.h
@@ -136,6 +136,11 @@ class LocalClient : public Client {
   StatusOr<std::unique_ptr<Literal>> ShapedBufferToLiteral(
       const ShapedBuffer& shaped_buffer);
 
+  // Converts a GlobalDataHandle into a pointer to a ShapedBuffer that's valid
+  // as long as the handle is valid.
+  StatusOr<const ShapedBuffer*> GlobalDataToShapedBuffer(
+      const GlobalDataHandle& data, int replica_number);
+
   // Transfer the given literal to the infeed queue of the given device.
   // TODO(b/69670845): Remove the 'Local' from the name when LocalClient does
   // not inherit from Client and there is no possibility of confusion with
diff --git a/tensorflow/compiler/xla/service/local_service.cc b/tensorflow/compiler/xla/service/local_service.cc
index 0fa4061738..41aef3920c 100644
--- a/tensorflow/compiler/xla/service/local_service.cc
+++ b/tensorflow/compiler/xla/service/local_service.cc
@@ -260,4 +260,15 @@ StatusOr<int> LocalService::ReplicaNumberToDeviceOrdinal(int replica_number) {
       /*computation_count=*/1);
 }
 
+StatusOr<const ShapedBuffer*> LocalService::GlobalDataToShapedBuffer(
+    const GlobalDataHandle& data, int replica_number) {
+  TF_ASSIGN_OR_RETURN(auto buffers, allocation_tracker_.Resolve(data));
+  if (replica_number >= buffers.size()) {
+    return InvalidArgument(
+        "replica_number %d out of range; must be less than num_replicas = %zu.",
+        replica_number, buffers.size());
+  }
+  return buffers[replica_number];
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/local_service.h b/tensorflow/compiler/xla/service/local_service.h
index 06567cabd6..b55f119b3e 100644
--- a/tensorflow/compiler/xla/service/local_service.h
+++ b/tensorflow/compiler/xla/service/local_service.h
@@ -70,6 +70,11 @@ class LocalService : public Service {
   // the "easy" case where a single replica is a single device.
   StatusOr<int> ReplicaNumberToDeviceOrdinal(int replica_number);
 
+  // Converts a GlobalDataHandle into a pointer to a ShapedBuffer that's valid
+  // as long as the handle is valid.
+  StatusOr<const ShapedBuffer*> GlobalDataToShapedBuffer(
+      const GlobalDataHandle& data, int replica_number);
+
  private:
   explicit LocalService(const ServiceOptions& options,
                         std::unique_ptr<Backend> backend);
diff --git a/tensorflow/compiler/xla/tools/replay_computation.cc b/tensorflow/compiler/xla/tools/replay_computation.cc
index fc7e8002c7..be094b7890 100644
--- a/tensorflow/compiler/xla/tools/replay_computation.cc
+++ b/tensorflow/compiler/xla/tools/replay_computation.cc
@@ -68,7 +68,6 @@ struct Options {
   bool use_fake_data = false;
   bool print_result = true;
   int num_runs = 1;
-  bool xla_hlo_profile_last_run = false;
 };
 
 // Invokes the given computation passing arbitrary data for every (unbound)
@@ -80,21 +79,35 @@ struct Options {
 //
 // If neither generate_fake_infeed is true nor a fake_infeed_shape is provided,
 // no infeed is performed.
-StatusOr<std::unique_ptr<Literal>> ReplayComputation(const HloSnapshot& module,
-                                                     Client* client,
-                                                     const Options& opts) {
+StatusOr<Literal> ReplayComputation(const HloSnapshot& module,
+                                    LocalClient* client, const Options& opts) {
   XlaComputation computation(module.hlo().hlo_module());
 
-  std::vector<std::unique_ptr<GlobalData>> arguments;
+  // Build the `argument_ptrs` vector, which contains ShapedBuffer*s to our
+  // arguments.  This is a bit involved, because we may have to convert from
+  // GlobalData to ShapedBuffer*, and we have to manage the lifetime of all our
+  // objects.
+  std::vector<ScopedShapedBuffer> scoped_shaped_buffer_arguments;
+  std::vector<std::unique_ptr<GlobalData>> global_data_arguments;
+  std::vector<const ShapedBuffer*> argument_ptrs;
   if (opts.use_fake_data) {
-    arguments = MakeFakeArgumentsOrDie(computation, client);
+    global_data_arguments = MakeFakeArgumentsOrDie(computation, client);
+    for (const auto& data : global_data_arguments) {
+      argument_ptrs.push_back(
+          client->GlobalDataToShapedBuffer(data->handle(), /*replica_number=*/0)
+              .ValueOrDie());
+    }
   } else {  // use recorded data if available
     for (const auto& proto : module.arguments()) {
       TF_ASSIGN_OR_RETURN(std::unique_ptr<xla::Literal> literal,
                           Literal::CreateFromProto(proto));
-      TF_ASSIGN_OR_RETURN(std::unique_ptr<GlobalData> data,
-                          client->TransferToServer(*literal));
-      arguments.push_back(std::move(data));
+      TF_ASSIGN_OR_RETURN(
+          ScopedShapedBuffer data,
+          client->LiteralToShapedBuffer(*literal, /*device_ordinal=*/0));
+      scoped_shaped_buffer_arguments.push_back(std::move(data));
+    }
+    for (const auto& argument : scoped_shaped_buffer_arguments) {
+      argument_ptrs.push_back(&argument);
     }
   }
 
@@ -149,43 +162,41 @@ StatusOr<std::unique_ptr<Literal>> ReplayComputation(const HloSnapshot& module,
     });
   }
 
-  std::vector<GlobalData*> execute_arguments;
-  execute_arguments.reserve(arguments.size());
-  for (auto& argument : arguments) {
-    execute_arguments.push_back(argument.get());
+  std::vector<const Shape*> argument_layouts;
+  for (const auto& param : computation.proto().program_shape().parameters()) {
+    argument_layouts.push_back(&param);
   }
+  std::unique_ptr<LocalExecutable> executable =
+      client->Compile(computation, argument_layouts, ExecutableBuildOptions())
+          .ValueOrDie();
 
   // Run the computation num_runs times, and return the result from the last
   // execution.
-  std::unique_ptr<Literal> result;
+  StreamExecutorMemoryAllocator allocator(
+      client->platform(),
+      {client->platform()->ExecutorForDevice(0).ValueOrDie()});
+  tensorflow::gtl::optional<ScopedShapedBuffer> result;
   for (int i = 0; i < opts.num_runs; ++i) {
     ExecutionProfile profile;
-    ExecutionOptions execution_options = CreateDefaultExecutionOptions();
-    if (opts.xla_hlo_profile_last_run && i == opts.num_runs - 1) {
-      execution_options.mutable_debug_options()->set_xla_hlo_profile(true);
-    }
+    ExecutableRunOptions run_options;
+    run_options.set_execution_profile(&profile);
+    run_options.set_allocator(&allocator);
 
-    if (opts.print_result) {
-      TF_ASSIGN_OR_RETURN(
-          result, client->ExecuteAndTransfer(computation, execute_arguments,
-                                             &execution_options, &profile));
-    } else {
-      // If we're not printing the result, execute the computation but don't
-      // bother retrieving the result.  This can be a significant speedup.
-      TF_RETURN_IF_ERROR(client
-                             ->Execute(computation, execute_arguments,
-                                       &execution_options, &profile)
-                             .status());
-    }
+    TF_ASSIGN_OR_RETURN(result, executable->Run(argument_ptrs, run_options));
     LOG(INFO) << "Execution took "
               << static_cast<double>(profile.compute_time_ns()) / 1e9 << "s";
   }
 
-  return std::move(result);
+  // Check that --num_runs > 0, otherwise *result below will fail with an
+  // unhelpful error (because the loop didn't run any iterations).
+  CHECK_GT(opts.num_runs, 0) << "--num_runs must be > 0";
+  TF_ASSIGN_OR_RETURN(std::unique_ptr<Literal> result_literal,
+                      client->ShapedBufferToLiteral(*result));
+  return std::move(*result_literal);
 }
 
 int RealMain(tensorflow::gtl::ArraySlice<char*> args, const Options& opts) {
-  Client* client = ClientLibrary::LocalClientOrDie();
+  LocalClient* client = ClientLibrary::LocalClientOrDie();
   tensorflow::Env* env = tensorflow::Env::Default();
   int exit_status = EXIT_SUCCESS;
   for (char* arg : args) {
@@ -202,8 +213,8 @@ int RealMain(tensorflow::gtl::ArraySlice<char*> args, const Options& opts) {
       CHECK(opts.use_fake_data)
           << "HloProto input must be handled with --use_fake_data";
     }
-    StatusOr<std::unique_ptr<Literal>> result_status =
-        ReplayComputation(snapshot, client, opts);
+
+    StatusOr<Literal> result_status = ReplayComputation(snapshot, client, opts);
     if (!result_status.ok()) {
       fprintf(stderr, "%s: error: %s\n", arg,
               result_status.status().ToString().c_str());
@@ -211,12 +222,12 @@ int RealMain(tensorflow::gtl::ArraySlice<char*> args, const Options& opts) {
       continue;
     }
 
-    std::unique_ptr<Literal> result = result_status.ConsumeValueOrDie();
-    if (result != nullptr) {
+    if (opts.print_result) {
+      Literal result = std::move(result_status).ValueOrDie();
       fprintf(stdout, "%s: %s :: %s:%s\n", arg,
               snapshot.hlo().hlo_module().name().c_str(),
-              ShapeUtil::HumanString(result->shape()).c_str(),
-              result->ToString().c_str());
+              ShapeUtil::HumanString(result.shape()).c_str(),
+              result.ToString().c_str());
       if (snapshot.has_result()) {
         std::unique_ptr<Literal> literal =
             Literal::CreateFromProto(snapshot.result()).ConsumeValueOrDie();
@@ -249,9 +260,6 @@ int main(int argc, char** argv) {
       tensorflow::Flag("generate_fake_infeed", &opts.generate_fake_infeed,
                        "Whether a fake infeed shape should be generated "
                        "derived from the computation"),
-      tensorflow::Flag(
-          "xla_hlo_profile_last_run", &opts.xla_hlo_profile_last_run,
-          "Pass --xla_hlo_profile the last time we run the computation."),
   };
   xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
   bool parse_ok = tensorflow::Flags::Parse(&argc, argv, flag_list);
-- 
GitLab


From 2a484497062677f5cf0205ee3b9c28a64f03fe04 Mon Sep 17 00:00:00 2001
From: Chris Ying <chrisying@google.com>
Date: Wed, 30 May 2018 17:38:13 -0700
Subject: [PATCH 085/610] Fix bug with renorm + virtual_batch_size.

PiperOrigin-RevId: 198648273
---
 .../python/keras/layers/normalization.py      | 26 +++++++++----------
 1 file changed, 12 insertions(+), 14 deletions(-)

diff --git a/tensorflow/python/keras/layers/normalization.py b/tensorflow/python/keras/layers/normalization.py
index c0dc5220f1..7743d00c0f 100644
--- a/tensorflow/python/keras/layers/normalization.py
+++ b/tensorflow/python/keras/layers/normalization.py
@@ -574,28 +574,26 @@ class BatchNormalization(Layer):
                                      lambda: variance,
                                      lambda: moving_variance)
 
+      if self.virtual_batch_size is not None:
+        # This isn't strictly correct since in ghost batch norm, you are
+        # supposed to sequentially update the moving_mean and moving_variance
+        # with each sub-batch. However, since the moving statistics are only
+        # used during evaluation, it is more efficient to just update in one
+        # step and should not make a significant difference in the result.
+        new_mean = math_ops.reduce_mean(mean, axis=1, keepdims=True)
+        new_variance = math_ops.reduce_mean(variance, axis=1, keepdims=True)
+      else:
+        new_mean, new_variance = mean, variance
+
       if self.renorm:
         r, d, new_mean, new_variance = self._renorm_correction_and_moments(
-            mean, variance, training)
+            new_mean, new_variance, training)
         # When training, the normalized values (say, x) will be transformed as
         # x * gamma + beta without renorm, and (x * r + d) * gamma + beta
         # = x * (r * gamma) + (d * gamma + beta) with renorm.
         r = _broadcast(array_ops.stop_gradient(r, name='renorm_r'))
         d = _broadcast(array_ops.stop_gradient(d, name='renorm_d'))
         scale, offset = _compose_transforms(r, d, scale, offset)
-      else:
-        new_mean, new_variance = mean, variance
-
-      if self.virtual_batch_size is not None:
-        # This isn't strictly correct since in ghost batch norm, you are
-        # supposed to sequentially update the moving_mean and moving_variance
-        # with each sub-batch. However, since the moving statistics are only
-        # used during evaluation, it is more efficient to just update in one
-        # step and should not make a significant difference in the result.
-        new_mean = math_ops.reduce_mean(new_mean,
-                                        axis=1, keepdims=True)
-        new_variance = math_ops.reduce_mean(new_variance,
-                                            axis=1, keepdims=True)
 
       def _do_update(var, value):
         if in_eager_mode and not self.trainable:
-- 
GitLab


From 316549d36f6ab3d250ce9e33b768bbfb1a4d7362 Mon Sep 17 00:00:00 2001
From: Nupur Garg <nupurgarg@google.com>
Date: Wed, 30 May 2018 17:54:02 -0700
Subject: [PATCH 086/610] Enable TOCO pip command line binding.

PiperOrigin-RevId: 198649827
---
 tensorflow/contrib/lite/python/BUILD          |  19 +-
 .../lite/python/convert_saved_model.py        | 118 ++++----
 .../lite/python/convert_saved_model_test.py   |  55 +++-
 tensorflow/contrib/lite/python/lite.py        | 187 +++++++++---
 tensorflow/contrib/lite/python/lite_test.py   | 180 +++++++++++-
 .../contrib/lite/python/tflite_convert.py     | 273 ++++++++++++++++++
 .../contrib/lite/toco/g3doc/python_api.md     |  49 ++--
 tensorflow/contrib/lite/toco/python/BUILD     |   6 -
 .../contrib/lite/toco/python/toco_wrapper.py  |  40 ---
 tensorflow/tools/pip_package/BUILD            |   4 +-
 .../tools/pip_package/build_pip_package.sh    |   4 +-
 tensorflow/tools/pip_package/setup.py         |   3 +-
 12 files changed, 749 insertions(+), 189 deletions(-)
 create mode 100644 tensorflow/contrib/lite/python/tflite_convert.py
 delete mode 100644 tensorflow/contrib/lite/toco/python/toco_wrapper.py

diff --git a/tensorflow/contrib/lite/python/BUILD b/tensorflow/contrib/lite/python/BUILD
index a40e512045..7e6ff6c0a8 100644
--- a/tensorflow/contrib/lite/python/BUILD
+++ b/tensorflow/contrib/lite/python/BUILD
@@ -36,6 +36,16 @@ py_test(
     ],
 )
 
+py_binary(
+    name = "tflite_convert",
+    srcs = ["tflite_convert.py"],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+    deps = [
+        ":lite",
+    ],
+)
+
 py_library(
     name = "lite",
     srcs = ["lite.py"],
@@ -125,6 +135,7 @@ py_library(
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
     deps = [
+        ":convert",
         "//tensorflow/contrib/saved_model:saved_model_py",
         "//tensorflow/python:graph_util",
         "//tensorflow/python:platform",
@@ -164,11 +175,3 @@ py_test(
         "//tensorflow/python/saved_model",
     ],
 )
-
-# Transitive dependencies of this target will be included in the pip package.
-py_library(
-    name = "tf_lite_py_pip",
-    deps = [
-        ":convert_saved_model",
-    ],
-)
diff --git a/tensorflow/contrib/lite/python/convert_saved_model.py b/tensorflow/contrib/lite/python/convert_saved_model.py
index 54fec9d61f..b952a72aab 100644
--- a/tensorflow/contrib/lite/python/convert_saved_model.py
+++ b/tensorflow/contrib/lite/python/convert_saved_model.py
@@ -18,31 +18,15 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.contrib.lite.python.convert import tensor_name
 from tensorflow.contrib.saved_model.python.saved_model import reader
 from tensorflow.contrib.saved_model.python.saved_model import signature_def_utils
 from tensorflow.core.framework import types_pb2
 from tensorflow.python.client import session
 from tensorflow.python.framework import graph_util as tf_graph_util
 from tensorflow.python.framework import ops
-from tensorflow.python.platform import gfile
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.saved_model import loader
-from tensorflow.python.saved_model import signature_constants
-from tensorflow.python.saved_model import tag_constants
-
-
-def _write_and_flush_file(file_path, data_str):
-  """Writes data to file path.
-
-  Args:
-    file_path: Full path of the file to store data in.
-    data_str: Data represented as a string.
-
-  Returns: None.
-  """
-  with gfile.Open(file_path, "wb") as data_file:
-    data_file.write(data_str)
-    data_file.flush()
 
 
 def _log_tensor_details(tensor_info):
@@ -167,29 +151,10 @@ def _get_tensors(graph, signature_def_tensor_names=None,
   """
   tensors = []
   if user_tensor_names:
-    # Get the list of all of the tensors with and without the tensor index.
-    all_tensor_names = [
-        tensor.name for op in graph.get_operations() for tensor in op.outputs
-    ]
-    all_tensor_names_only = [name.split(":")[0] for name in all_tensor_names]
-
     # Sort the tensor names.
     user_tensor_names = sorted(user_tensor_names)
 
-    # Get the tensors associated with the tensor names.
-    tensors = []
-    invalid_tensors = []
-    for name in user_tensor_names:
-      if name not in all_tensor_names_only:
-        invalid_tensors.append(name)
-      else:
-        idx = all_tensor_names_only.index(name)
-        tensors.append(graph.get_tensor_by_name(all_tensor_names[idx]))
-
-    # Throw ValueError if any user input names are not valid tensors.
-    if invalid_tensors:
-      raise ValueError("Invalid tensors '{}' were found.".format(
-          ",".join(invalid_tensors)))
+    tensors = get_tensors_from_tensor_names(graph, user_tensor_names)
   elif signature_def_tensor_names:
     tensors = [
         graph.get_tensor_by_name(name)
@@ -204,6 +169,58 @@ def _get_tensors(graph, signature_def_tensor_names=None,
   return tensors
 
 
+def get_tensors_from_tensor_names(graph, tensor_names):
+  """Gets the Tensors associated with the `tensor_names` in the provided graph.
+
+  Args:
+    graph: TensorFlow Graph.
+    tensor_names: List of strings that represent names of tensors in the graph.
+
+  Returns:
+    A list of Tensor objects in the same order the names are provided.
+
+  Raises:
+    ValueError:
+      tensor_names contains an invalid tensor name.
+  """
+  # Get the list of all of the tensors.
+  tensor_name_to_tensor = {
+      tensor_name(tensor): tensor for op in graph.get_operations()
+      for tensor in op.values()
+  }
+
+  # Get the tensors associated with tensor_names.
+  tensors = []
+  invalid_tensors = []
+  for name in tensor_names:
+    tensor = tensor_name_to_tensor.get(name)
+    if tensor is None:
+      invalid_tensors.append(name)
+    else:
+      tensors.append(tensor)
+
+  # Throw ValueError if any user input names are not valid tensors.
+  if invalid_tensors:
+    raise ValueError("Invalid tensors '{}' were found.".format(
+        ",".join(invalid_tensors)))
+  return tensors
+
+
+def set_tensor_shapes(tensors, shapes):
+  """Sets Tensor shape for each tensor if the shape is defined.
+
+  Args:
+    tensors: List of TensorFlow ops.Tensor objects.
+    shapes: Dict of strings representing input tensor names to list of
+      integers representing input shapes (e.g., {"foo": [1, 16, 16, 3]}).
+  """
+  if shapes:
+    for tensor in tensors:
+      shape = shapes.get(tensor.name)
+      if shape is not None:
+        tensor.set_shape(shapes[tensor.name])
+
+
 def freeze_saved_model(saved_model_dir, input_arrays, input_shapes,
                        output_arrays, tag_set, signature_key):
   """Converts a SavedModel to a frozen graph.
@@ -211,15 +228,14 @@ def freeze_saved_model(saved_model_dir, input_arrays, input_shapes,
   Args:
     saved_model_dir: SavedModel directory to convert.
     input_arrays: List of input tensors to freeze graph with. Uses input arrays
-      from SignatureDef when none are provided. (default None)
-    input_shapes: Map of strings representing input tensor names to list of
+      from SignatureDef when none are provided.
+    input_shapes: Dict of strings representing input tensor names to list of
       integers representing input shapes (e.g., {"foo": : [1, 16, 16, 3]}).
       Automatically determined when input shapes is None (e.g., {"foo" : None}).
-      (default None)
     output_arrays: List of output tensors to freeze graph with. Uses output
-      arrays from SignatureDef when none are provided. (default None)
+      arrays from SignatureDef when none are provided.
     tag_set: Set of tags identifying the MetaGraphDef within the SavedModel to
-      analyze. All tags in the tag set must be present. (default "serve")
+      analyze. All tags in the tag set must be present.
     signature_key: Key identifying SignatureDef containing inputs and outputs.
 
   Returns:
@@ -233,14 +249,7 @@ def freeze_saved_model(saved_model_dir, input_arrays, input_shapes,
       signature_key is not in the MetaGraphDef.
       input_shapes does not match the length of input_arrays.
       input_arrays or output_arrays are not valid.
-      Unable to load Session.
   """
-  # Set default values for inputs if they are set to None.
-  if signature_key is None:
-    signature_key = signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
-  if tag_set is None:
-    tag_set = set([tag_constants.SERVING])
-
   # Read SignatureDef.
   meta_graph = _get_meta_graph_def(saved_model_dir, tag_set)
   signature_def = _get_signature_def(meta_graph, signature_key)
@@ -255,19 +264,10 @@ def freeze_saved_model(saved_model_dir, input_arrays, input_shapes,
     # TODO(zhixianyan): Use TFLite supported Op list to filter outputs.
     in_tensors = _get_tensors(graph, inputs, input_arrays)
     out_tensors = _get_tensors(graph, outputs, output_arrays)
-
-    # Gets fully defined tensor shape.
-    for tensor in in_tensors:
-      if (input_shapes and tensor.name in input_shapes and
-          input_shapes[tensor.name] is not None):
-        shape = input_shapes[tensor.name]
-      else:
-        shape = tensor.get_shape().as_list()
-      tensor.set_shape(shape)
+    set_tensor_shapes(in_tensors, input_shapes)
 
     output_names = [node.split(":")[0] for node in outputs]
     frozen_graph_def = tf_graph_util.convert_variables_to_constants(
         sess, graph.as_graph_def(), output_names)
 
     return frozen_graph_def, in_tensors, out_tensors
-  raise ValueError("Unable to load Session.")
diff --git a/tensorflow/contrib/lite/python/convert_saved_model_test.py b/tensorflow/contrib/lite/python/convert_saved_model_test.py
index f69381d0e6..80e5dc6e46 100644
--- a/tensorflow/contrib/lite/python/convert_saved_model_test.py
+++ b/tensorflow/contrib/lite/python/convert_saved_model_test.py
@@ -41,9 +41,58 @@ from tensorflow.python.ops.losses import losses
 from tensorflow.python.platform import test
 from tensorflow.python.saved_model import saved_model
 from tensorflow.python.saved_model import signature_constants
+from tensorflow.python.saved_model import tag_constants
 from tensorflow.python.training import training as train
 
 
+class TensorFunctionsTest(test_util.TensorFlowTestCase):
+
+  def testGetTensorsValid(self):
+    in_tensor = array_ops.placeholder(
+        shape=[1, 16, 16, 3], dtype=dtypes.float32)
+    _ = in_tensor + in_tensor
+    sess = session.Session()
+
+    tensors = convert_saved_model.get_tensors_from_tensor_names(
+        sess.graph, ["Placeholder"])
+    self.assertEqual("Placeholder:0", tensors[0].name)
+
+  def testGetTensorsInvalid(self):
+    in_tensor = array_ops.placeholder(
+        shape=[1, 16, 16, 3], dtype=dtypes.float32)
+    _ = in_tensor + in_tensor
+    sess = session.Session()
+
+    with self.assertRaises(ValueError) as error:
+      convert_saved_model.get_tensors_from_tensor_names(sess.graph,
+                                                        ["invalid-input"])
+    self.assertEqual("Invalid tensors 'invalid-input' were found.",
+                     str(error.exception))
+
+  def testSetTensorShapeValid(self):
+    tensor = array_ops.placeholder(shape=[None, 3, 5], dtype=dtypes.float32)
+    self.assertEqual([None, 3, 5], tensor.shape.as_list())
+
+    convert_saved_model.set_tensor_shapes([tensor],
+                                          {"Placeholder:0": [5, 3, 5]})
+    self.assertEqual([5, 3, 5], tensor.shape.as_list())
+
+  def testSetTensorShapeInvalid(self):
+    tensor = array_ops.placeholder(shape=[None, 3, 5], dtype=dtypes.float32)
+    self.assertEqual([None, 3, 5], tensor.shape.as_list())
+
+    convert_saved_model.set_tensor_shapes([tensor],
+                                          {"invalid-input": [5, 3, 5]})
+    self.assertEqual([None, 3, 5], tensor.shape.as_list())
+
+  def testSetTensorShapeEmpty(self):
+    tensor = array_ops.placeholder(shape=[None, 3, 5], dtype=dtypes.float32)
+    self.assertEqual([None, 3, 5], tensor.shape.as_list())
+
+    convert_saved_model.set_tensor_shapes([tensor], {})
+    self.assertEqual([None, 3, 5], tensor.shape.as_list())
+
+
 class FreezeSavedModelTest(test_util.TensorFlowTestCase):
 
   def _createSimpleSavedModel(self, shape):
@@ -93,6 +142,10 @@ class FreezeSavedModelTest(test_util.TensorFlowTestCase):
                          output_arrays=None,
                          tag_set=None,
                          signature_key=None):
+    if tag_set is None:
+      tag_set = set([tag_constants.SERVING])
+    if signature_key is None:
+      signature_key = signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
     graph_def, in_tensors, out_tensors = convert_saved_model.freeze_saved_model(
         saved_model_dir=saved_model_dir,
         input_arrays=input_arrays,
@@ -390,7 +443,7 @@ class FreezeSavedModelTestTrainGraph(test_util.TensorFlowTestCase):
         input_arrays=None,
         input_shapes=None,
         output_arrays=["Softmax"],
-        tag_set=None,
+        tag_set=set([tag_constants.SERVING]),
         signature_key=signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY)
 
     self.assertTrue(result)
diff --git a/tensorflow/contrib/lite/python/lite.py b/tensorflow/contrib/lite/python/lite.py
index f7f2d40a02..6510d74177 100644
--- a/tensorflow/contrib/lite/python/lite.py
+++ b/tensorflow/contrib/lite/python/lite.py
@@ -33,15 +33,22 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from google.protobuf import text_format as _text_format
+from google.protobuf.message import DecodeError
 from tensorflow.contrib.lite.python import lite_constants as constants
 from tensorflow.contrib.lite.python.convert import tensor_name
 from tensorflow.contrib.lite.python.convert import toco_convert
 from tensorflow.contrib.lite.python.convert import toco_convert_protos  # pylint: disable=unused-import
 from tensorflow.contrib.lite.python.convert_saved_model import freeze_saved_model
+from tensorflow.contrib.lite.python.convert_saved_model import get_tensors_from_tensor_names
+from tensorflow.contrib.lite.python.convert_saved_model import set_tensor_shapes
 from tensorflow.contrib.lite.python.interpreter import Interpreter  # pylint: disable=unused-import
 from tensorflow.contrib.lite.python.op_hint import convert_op_hints_to_stubs  # pylint: disable=unused-import
 from tensorflow.contrib.lite.python.op_hint import OpHint  # pylint: disable=unused-import
+from tensorflow.core.framework import graph_pb2 as _graph_pb2
+from tensorflow.python.client import session as _session
 from tensorflow.python.framework import graph_util as tf_graph_util
+from tensorflow.python.framework.importer import import_graph_def
 from tensorflow.python.ops.variables import global_variables_initializer
 from tensorflow.python.saved_model import signature_constants
 from tensorflow.python.saved_model import tag_constants
@@ -55,13 +62,15 @@ class TocoConverter(object):
 
   Attributes:
 
-    inference_type: Currently must be `{FLOAT, QUANTIZED_UINT8}`.
-      (default FLOAT)
-    output_format: Type of data to write (currently must be TFLITE or
-      GRAPHVIZ_DOT). (default TFLITE)
+    inference_type: Target data type of arrays in the output file. Currently
+      must be `{FLOAT, QUANTIZED_UINT8}`.  (default FLOAT)
+    output_format: Output file format. Currently must be `{TFLITE,
+      GRAPHVIZ_DOT}`. (default TFLITE)
     quantized_input_stats: The mean and std deviation of training data for each
       input tensor. Only needed if `inference_type` is `QUANTIZED_UINT8`.
-      (default None)
+      Dict of strings representing input tensor names to a tuple of floats
+      (mean, std_dev) with the quantization stats (e.g., {"foo" : (0., 1.)}).
+      (default {})
     drop_control_dependency: Boolean indicating whether to drop control
       dependencies silently. This is due to TFLite not supporting control
       dependencies. (default True)
@@ -70,11 +79,17 @@ class TocoConverter(object):
 
   Example usage:
 
-    # Converting a frozen graph.
+    # Converting a GraphDef from session.
     converter = lite.TocoConverter.from_session(sess, in_tensors, out_tensors)
     tflite_model = converter.convert()
     open("converted_model.tflite", "wb").write(tflite_model)
 
+    # Converting a GraphDef from file.
+    converter = lite.TocoConverter.from_flatbuffer_file(
+      graph_def_file, input_arrays, output_arrays)
+    tflite_model = converter.convert()
+    open("converted_model.tflite", "wb").write(tflite_model)
+
     # Converting a SavedModel.
     converter = lite.TocoConverter.from_saved_model(saved_model_dir)
     tflite_model = converter.convert()
@@ -95,16 +110,12 @@ class TocoConverter(object):
     self._output_tensors = output_tensors
     self.inference_type = constants.FLOAT
     self.output_format = constants.TFLITE
-    self.quantized_input_stats = None
+    self.quantized_input_stats = {}
     self.drop_control_dependency = True
     self.allow_custom_ops = False
 
   @classmethod
-  def from_session(cls,
-                   sess,
-                   input_tensors,
-                   output_tensors,
-                   freeze_variables=False):
+  def from_session(cls, sess, input_tensors, output_tensors):
     """Creates a TocoConverter class from a TensorFlow Session.
 
     Args:
@@ -112,56 +123,102 @@ class TocoConverter(object):
       input_tensors: List of input tensors. Type and shape are computed using
         `foo.get_shape()` and `foo.dtype`.
       output_tensors: List of output tensors (only .name is used from this).
-      freeze_variables: Boolean indicating whether the variables need to be
-        converted into constants via the freeze_graph.py script.
-        (default False)
 
     Returns:
       TocoConverter class.
     """
+    graph_def = _freeze_graph(sess, output_tensors)
+    return cls(graph_def, input_tensors, output_tensors)
+
+  @classmethod
+  def from_flatbuffer_file(cls,
+                           graph_def_file,
+                           input_arrays,
+                           output_arrays,
+                           input_shapes=None):
+    """Creates a TocoConverter class from a file containing a GraphDef.
+
+    Args:
+      graph_def_file: Full filepath of file containing TensorFlow GraphDef.
+      input_arrays: List of names of the input tensors of the graph.
+      output_arrays: List of names of the output tensors of the graph.
+      input_shapes: Dict of strings representing input tensor names to list of
+        integers representing input shapes (e.g., {"foo" : [1, 16, 16, 3]}).
+        Automatically determined when input shapes is None (e.g., {"foo" :
+        None}). (default None)
 
-    # Get GraphDef.
-    if freeze_variables:
+    Returns:
+      TocoConverter class.
+
+    Raises:
+      ValueError:
+        Unable to parse input file.
+        The graph is not frozen.
+        input_arrays or output_arrays contains an invalid tensor name.
+    """
+    with _session.Session() as sess:
       sess.run(global_variables_initializer())
-      output_arrays = [tensor_name(tensor) for tensor in output_tensors]
-      graph_def = tf_graph_util.convert_variables_to_constants(
-          sess, sess.graph_def, output_arrays)
-    else:
-      graph_def = sess.graph_def
 
-    # Create TocoConverter class.
-    return cls(graph_def, input_tensors, output_tensors)
+      # Read GraphDef from file: try binary proto first, then text proto.
+      graph_def = _graph_pb2.GraphDef()
+      with open(graph_def_file, "rb") as f:
+        file_content = f.read()
+      try:
+        graph_def.ParseFromString(file_content)
+      except (_text_format.ParseError, DecodeError):
+        try:
+          print("Ignore 'tcmalloc: large alloc' warnings.")
+          _text_format.Merge(file_content, graph_def)
+        except (_text_format.ParseError, DecodeError):
+          raise ValueError(
+              "Unable to parse input file '{}'.".format(graph_def_file))
+      with sess.graph.as_default():
+        import_graph_def(graph_def, name="")
+
+      # Get input and output tensors.
+      input_tensors = get_tensors_from_tensor_names(sess.graph, input_arrays)
+      output_tensors = get_tensors_from_tensor_names(sess.graph, output_arrays)
+      set_tensor_shapes(input_tensors, input_shapes)
+
+      # Check if graph is frozen.
+      if not _is_frozen_graph(sess):
+        raise ValueError("Please freeze the graph using freeze_graph.py")
+
+      # Create TocoConverter class.
+      return cls(sess.graph_def, input_tensors, output_tensors)
 
   @classmethod
-  def from_saved_model(
-      cls,
-      saved_model_dir,
-      input_arrays=None,
-      input_shapes=None,
-      output_arrays=None,
-      tag_set=None,
-      signature_key=signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY):
+  def from_saved_model(cls,
+                       saved_model_dir,
+                       input_arrays=None,
+                       input_shapes=None,
+                       output_arrays=None,
+                       tag_set=None,
+                       signature_key=None):
     """Creates a TocoConverter class from a SavedModel.
 
     Args:
       saved_model_dir: SavedModel directory to convert.
       input_arrays: List of input tensors to freeze graph with. Uses input
         arrays from SignatureDef when none are provided. (default None)
-      input_shapes: Map of strings representing input tensor names to list of
-        integers representing input shapes (e.g., {"foo": : [1, 16, 16, 3]}).
+      input_shapes: Dict of strings representing input tensor names to list of
+        integers representing input shapes (e.g., {"foo" : [1, 16, 16, 3]}).
         Automatically determined when input shapes is None (e.g., {"foo" :
         None}). (default None)
       output_arrays: List of output tensors to freeze graph with. Uses output
         arrays from SignatureDef when none are provided. (default None)
       tag_set: Set of tags identifying the MetaGraphDef within the SavedModel to
-        analyze. All tags in the tag set must be present. (default "serve")
+        analyze. All tags in the tag set must be present. (default {"serve"})
       signature_key: Key identifying SignatureDef containing inputs and outputs.
+        (default DEFAULT_SERVING_SIGNATURE_DEF_KEY)
 
     Returns:
       TocoConverter class.
     """
     if tag_set is None:
       tag_set = set([tag_constants.SERVING])
+    if signature_key is None:
+      signature_key = signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
 
     result = freeze_saved_model(saved_model_dir, input_arrays, input_shapes,
                                 output_arrays, tag_set, signature_key)
@@ -189,6 +246,24 @@ class TocoConverter(object):
       elif shape[0] is None:
         self._set_batch_size(batch_size=1)
 
+    # Get quantization stats. Ensures there is one stat per name if the stats
+    # are specified.
+    if self.quantized_input_stats:
+      quantized_stats = []
+      invalid_stats = []
+      for tensor in self._input_tensors:
+        name = tensor_name(tensor)
+        if name in self.quantized_input_stats:
+          quantized_stats.append(self.quantized_input_stats[name])
+        else:
+          invalid_stats.append(name)
+
+      if invalid_stats:
+        raise ValueError("Quantization input stats are not available for input "
+                         "tensors '{0}'.".format(",".join(invalid_stats)))
+    else:
+      quantized_stats = None
+
     # Converts model.
     result = toco_convert(
         input_data=self._graph_def,
@@ -197,7 +272,7 @@ class TocoConverter(object):
         inference_type=self.inference_type,
         input_format=constants.TENSORFLOW_GRAPHDEF,
         output_format=self.output_format,
-        quantized_input_stats=self.quantized_input_stats,
+        quantized_input_stats=quantized_stats,
         drop_control_dependency=self.drop_control_dependency)
     return result
 
@@ -212,3 +287,43 @@ class TocoConverter(object):
       shape = tensor.get_shape().as_list()
       shape[0] = batch_size
       tensor.set_shape(shape)
+
+
+def _is_frozen_graph(sess):
+  """Determines if the graph in `sess` is frozen.
+
+  A graph counts as frozen when it has no variable ops: neither reference
+  variables (Variable*) nor resource variables (VarHandleOp, *VariableOp).
+
+  Args:
+    sess: TensorFlow Session (only `sess.graph.get_operations()` is used).
+
+  Returns:
+    True if no variable ops are found, False otherwise.
+  """
+  resource_ops = ("VariableOp", "VarHandleOp")
+  return not any(
+      op.type.startswith("Variable") or op.type.endswith(resource_ops)
+      for op in sess.graph.get_operations())
+
+
+def _freeze_graph(sess, output_tensors):
+  """Returns a frozen GraphDef for the graph held by `sess`.
+
+  An already frozen graph is returned as is. Otherwise the global variables
+  initializer is run and the variables are converted to constants.
+
+  Args:
+    sess: TensorFlow Session.
+    output_tensors: List of output tensors (only .name is used from this).
+
+  Returns:
+    Frozen GraphDef.
+  """
+  # Nothing to freeze: hand back the session's GraphDef untouched.
+  if _is_frozen_graph(sess):
+    return sess.graph_def
+  sess.run(global_variables_initializer())
+  output_names = [tensor_name(out_tensor) for out_tensor in output_tensors]
+  return tf_graph_util.convert_variables_to_constants(
+      sess, sess.graph_def, output_names)
diff --git a/tensorflow/contrib/lite/python/lite_test.py b/tensorflow/contrib/lite/python/lite_test.py
index 2f3105f3e6..28386ecb1a 100644
--- a/tensorflow/contrib/lite/python/lite_test.py
+++ b/tensorflow/contrib/lite/python/lite_test.py
@@ -29,8 +29,10 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import variable_scope
+from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
 from tensorflow.python.saved_model import saved_model
+from tensorflow.python.training.training_util import write_graph
 
 
 class FromSessionTest(test_util.TensorFlowTestCase):
@@ -65,16 +67,22 @@ class FromSessionTest(test_util.TensorFlowTestCase):
     self.assertEqual((0., 0.), output_details[0]['quantization'])
 
   def testQuantization(self):
-    in_tensor = array_ops.placeholder(
-        shape=[1, 16, 16, 3], dtype=dtypes.float32, name='input')
+    in_tensor_1 = array_ops.placeholder(
+        shape=[1, 16, 16, 3], dtype=dtypes.float32, name='inputA')
+    in_tensor_2 = array_ops.placeholder(
+        shape=[1, 16, 16, 3], dtype=dtypes.float32, name='inputB')
     out_tensor = array_ops.fake_quant_with_min_max_args(
-        in_tensor + in_tensor, min=0., max=1., name='output')
+        in_tensor_1 + in_tensor_2, min=0., max=1., name='output')
     sess = session.Session()
 
     # Convert model and ensure model is not None.
-    converter = lite.TocoConverter.from_session(sess, [in_tensor], [out_tensor])
+    converter = lite.TocoConverter.from_session(
+        sess, [in_tensor_1, in_tensor_2], [out_tensor])
     converter.inference_type = lite_constants.QUANTIZED_UINT8
-    converter.quantized_input_stats = [(0., 1.)]  # mean, std_dev
+    converter.quantized_input_stats = {
+        'inputA': (0., 1.),
+        'inputB': (0., 1.)
+    }  # mean, std_dev
     tflite_model = converter.convert()
     self.assertTrue(tflite_model)
 
@@ -83,13 +91,19 @@ class FromSessionTest(test_util.TensorFlowTestCase):
     interpreter.allocate_tensors()
 
     input_details = interpreter.get_input_details()
-    self.assertEqual(1, len(input_details))
-    self.assertEqual('input', input_details[0]['name'])
+    self.assertEqual(2, len(input_details))
+    self.assertEqual('inputA', input_details[0]['name'])
     self.assertEqual(np.uint8, input_details[0]['dtype'])
     self.assertTrue(([1, 16, 16, 3] == input_details[0]['shape']).all())
     self.assertEqual((1., 0.),
                      input_details[0]['quantization'])  # scale, zero_point
 
+    self.assertEqual('inputB', input_details[1]['name'])
+    self.assertEqual(np.uint8, input_details[1]['dtype'])
+    self.assertTrue(([1, 16, 16, 3] == input_details[1]['shape']).all())
+    self.assertEqual((1., 0.),
+                     input_details[1]['quantization'])  # scale, zero_point
+
     output_details = interpreter.get_output_details()
     self.assertEqual(1, len(output_details))
     self.assertEqual('output', output_details[0]['name'])
@@ -97,6 +111,26 @@ class FromSessionTest(test_util.TensorFlowTestCase):
     self.assertTrue(([1, 16, 16, 3] == output_details[0]['shape']).all())
     self.assertTrue(output_details[0]['quantization'][0] > 0)  # scale
 
+  def testQuantizationInvalid(self):
+    """convert() raises ValueError when an input tensor has no stats."""
+    in_tensor_1 = array_ops.placeholder(
+        shape=[1, 16, 16, 3], dtype=dtypes.float32, name='inputA')
+    in_tensor_2 = array_ops.placeholder(
+        shape=[1, 16, 16, 3], dtype=dtypes.float32, name='inputB')
+    out_tensor = array_ops.fake_quant_with_min_max_args(
+        in_tensor_1 + in_tensor_2, min=0., max=1., name='output')
+    sess = session.Session()
+    # Only 'inputA' gets stats, so convert() must report 'inputB' as missing.
+    converter = lite.TocoConverter.from_session(
+        sess, [in_tensor_1, in_tensor_2], [out_tensor])
+    converter.inference_type = lite_constants.QUANTIZED_UINT8
+    converter.quantized_input_stats = {'inputA': (0., 1.)}  # mean, std_dev
+    with self.assertRaises(ValueError) as error:
+      converter.convert()
+    self.assertEqual(
+        'Quantization input stats are not available for input tensors '
+        '\'inputB\'.', str(error.exception))
+
   def testBatchSizeInvalid(self):
     in_tensor = array_ops.placeholder(
         shape=[None, 16, 16, 3], dtype=dtypes.float32)
@@ -152,8 +186,7 @@ class FromSessionTest(test_util.TensorFlowTestCase):
     sess = session.Session()
 
     # Convert model and ensure model is not None.
-    converter = lite.TocoConverter.from_session(
-        sess, [in_tensor], [out_tensor], freeze_variables=True)
+    converter = lite.TocoConverter.from_session(sess, [in_tensor], [out_tensor])
     tflite_model = converter.convert()
     self.assertTrue(tflite_model)
 
@@ -188,6 +221,135 @@ class FromSessionTest(test_util.TensorFlowTestCase):
     self.assertTrue(graphviz_output)
 
 
+class FromFlatbufferFile(test_util.TensorFlowTestCase):
+
+  def testFloat(self):
+    """Converts a GraphDef file ('model.pb') and checks the model's I/O."""
+    in_tensor = array_ops.placeholder(
+        shape=[1, 16, 16, 3], dtype=dtypes.float32)
+    _ = in_tensor + in_tensor
+    sess = session.Session()
+    # Write graph to file.
+    graph_def_file = os.path.join(self.get_temp_dir(), 'model.pb')
+    write_graph(sess.graph_def, '', graph_def_file, False)
+
+    # Convert model and ensure model is not None.
+    converter = lite.TocoConverter.from_flatbuffer_file(
+        graph_def_file, ['Placeholder'], ['add'])
+    tflite_model = converter.convert()
+    self.assertTrue(tflite_model)
+
+    # Check values from converted model.
+    interpreter = Interpreter(model_content=tflite_model)
+    interpreter.allocate_tensors()
+
+    input_details = interpreter.get_input_details()
+    self.assertEqual(1, len(input_details))
+    self.assertEqual('Placeholder', input_details[0]['name'])
+    self.assertEqual(np.float32, input_details[0]['dtype'])
+    self.assertTrue(([1, 16, 16, 3] == input_details[0]['shape']).all())
+    self.assertEqual((0., 0.), input_details[0]['quantization'])
+
+    output_details = interpreter.get_output_details()
+    self.assertEqual(1, len(output_details))
+    self.assertEqual('add', output_details[0]['name'])
+    self.assertEqual(np.float32, output_details[0]['dtype'])
+    self.assertTrue(([1, 16, 16, 3] == output_details[0]['shape']).all())
+    self.assertEqual((0., 0.), output_details[0]['quantization'])
+
+  def testFloatWithShapesArray(self):
+    """Converts a GraphDef file when input_shapes is passed explicitly."""
+    in_tensor = array_ops.placeholder(
+        shape=[1, 16, 16, 3], dtype=dtypes.float32)
+    _ = in_tensor + in_tensor
+    sess = session.Session()
+    # Write graph to file.
+    graph_def_file = os.path.join(self.get_temp_dir(), 'model.pb')
+    write_graph(sess.graph_def, '', graph_def_file, False)
+
+    # Convert model and ensure model is not None.
+    converter = lite.TocoConverter.from_flatbuffer_file(
+        graph_def_file, ['Placeholder'], ['add'],
+        input_shapes={'Placeholder': [1, 16, 16, 3]})
+    tflite_model = converter.convert()
+    self.assertTrue(tflite_model)
+
+    # Check values from converted model.
+    interpreter = Interpreter(model_content=tflite_model)
+    interpreter.allocate_tensors()
+
+    input_details = interpreter.get_input_details()
+    self.assertEqual(1, len(input_details))
+    self.assertTrue(([1, 16, 16, 3] == input_details[0]['shape']).all())
+
+  def testFreezeGraph(self):
+    """A GraphDef file whose graph contains a variable is rejected."""
+    in_tensor = array_ops.placeholder(
+        shape=[1, 16, 16, 3], dtype=dtypes.float32)
+    var = variable_scope.get_variable(
+        'weights', shape=[1, 16, 16, 3], dtype=dtypes.float32)
+    _ = in_tensor + var
+    sess = session.Session()
+    # Write graph to file.
+    graph_def_file = os.path.join(self.get_temp_dir(), 'model.pb')
+    write_graph(sess.graph_def, '', graph_def_file, False)
+
+    # Ensure the graph with variables cannot be converted.
+    with self.assertRaises(ValueError) as error:
+      lite.TocoConverter.from_flatbuffer_file(graph_def_file, ['Placeholder'],
+                                              ['add'])
+    self.assertEqual('Please freeze the graph using freeze_graph.py',
+                     str(error.exception))
+
+  def testPbtxt(self):
+    """Converts a GraphDef written to 'model.pbtxt' and checks its I/O."""
+    in_tensor = array_ops.placeholder(
+        shape=[1, 16, 16, 3], dtype=dtypes.float32)
+    _ = in_tensor + in_tensor
+    sess = session.Session()
+    # Write graph to file.
+    graph_def_file = os.path.join(self.get_temp_dir(), 'model.pbtxt')
+    write_graph(sess.graph_def, '', graph_def_file, True)
+
+    # Convert model and ensure model is not None.
+    converter = lite.TocoConverter.from_flatbuffer_file(
+        graph_def_file, ['Placeholder'], ['add'])
+    tflite_model = converter.convert()
+    self.assertTrue(tflite_model)
+
+    # Check values from converted model.
+    interpreter = Interpreter(model_content=tflite_model)
+    interpreter.allocate_tensors()
+
+    input_details = interpreter.get_input_details()
+    self.assertEqual(1, len(input_details))
+    self.assertEqual('Placeholder', input_details[0]['name'])
+    self.assertEqual(np.float32, input_details[0]['dtype'])
+    self.assertTrue(([1, 16, 16, 3] == input_details[0]['shape']).all())
+    self.assertEqual((0., 0.), input_details[0]['quantization'])
+
+    output_details = interpreter.get_output_details()
+    self.assertEqual(1, len(output_details))
+    self.assertEqual('add', output_details[0]['name'])
+    self.assertEqual(np.float32, output_details[0]['dtype'])
+    self.assertTrue(([1, 16, 16, 3] == output_details[0]['shape']).all())
+    self.assertEqual((0., 0.), output_details[0]['quantization'])
+
+  def testInvalidFile(self):
+    """A file that does not hold a parseable GraphDef raises ValueError."""
+    graph_def_file = os.path.join(self.get_temp_dir(), 'invalid_file')
+    with gfile.Open(graph_def_file, 'wb') as temp_file:
+      temp_file.write('bad data')
+      temp_file.flush()
+    # Attempts to convert the invalid model.
+    with self.assertRaises(ValueError) as error:
+      lite.TocoConverter.from_flatbuffer_file(graph_def_file, ['Placeholder'],
+                                              ['add'])
+    self.assertEqual(
+        'Unable to parse input file \'{}\'.'.format(graph_def_file),
+        str(error.exception))
+
+
 class FromSavedModelTest(test_util.TensorFlowTestCase):
 
   def _createSavedModel(self, shape):
diff --git a/tensorflow/contrib/lite/python/tflite_convert.py b/tensorflow/contrib/lite/python/tflite_convert.py
new file mode 100644
index 0000000000..79be5cdc56
--- /dev/null
+++ b/tensorflow/contrib/lite/python/tflite_convert.py
@@ -0,0 +1,273 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Python command line interface for running TOCO."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import os
+import sys
+
+from tensorflow.contrib.lite.python import lite
+from tensorflow.contrib.lite.toco import toco_flags_pb2 as _toco_flags_pb2
+from tensorflow.contrib.lite.toco import types_pb2 as _types_pb2
+from tensorflow.python.platform import app
+
+
+def _parse_array(values):
+  """Splits a comma-separated flag value into a list (None when empty)."""
+  return values.split(",") if values else None
+
+
+def _parse_int_array(values):
+  """Parses comma-separated integers into a list of ints (None when empty)."""
+  return [int(item) for item in values.split(",")] if values else None
+
+
+def _parse_set(values):
+  """Splits a comma-separated flag value into a set (None when empty)."""
+  return set(values.split(",")) if values else None
+
+
+def _get_toco_converter(flags):
+  """Builds the TocoConverter matching the model source given in the flags.
+
+  --graph_def_file takes precedence over --saved_model_dir when both are set.
+
+  Args:
+    flags: argparse.Namespace object containing TFLite flags.
+
+  Returns:
+    TocoConverter object.
+  """
+  # Arguments common to both TocoConverter factory methods.
+  names = _parse_array(flags.input_arrays)
+  shapes = None
+  if flags.input_shapes:
+    # Shapes are colon-separated and pair up positionally with the names.
+    parsed = [_parse_int_array(dims) for dims in flags.input_shapes.split(":")]
+    shapes = dict(zip(names, parsed))
+  kwargs = {
+      "input_arrays": names,
+      "input_shapes": shapes,
+      "output_arrays": _parse_array(flags.output_arrays),
+  }
+
+  # Select the factory method and add its source-specific arguments.
+  if flags.graph_def_file:
+    factory = lite.TocoConverter.from_flatbuffer_file
+    kwargs["graph_def_file"] = flags.graph_def_file
+  elif flags.saved_model_dir:
+    factory = lite.TocoConverter.from_saved_model
+    kwargs.update(
+        saved_model_dir=flags.saved_model_dir,
+        tag_set=_parse_set(flags.saved_model_tag_set),
+        signature_key=flags.saved_model_signature_key)
+
+  return factory(**kwargs)
+
+
+def _convert_model(flags):
+  """Converts the model described by the flags and writes the output file.
+
+  Args:
+    flags: argparse.Namespace object.
+  """
+  converter = _get_toco_converter(flags)
+  if flags.inference_type:
+    converter.inference_type = _types_pb2.IODataType.Value(flags.inference_type)
+  if flags.output_format:
+    converter.output_format = _toco_flags_pb2.FileFormat.Value(
+        flags.output_format)
+
+  if flags.mean_values and flags.std_dev_values:
+    # Stats are (mean, std_dev) floats, so fractional values must be accepted.
+    input_arrays = _parse_array(flags.input_arrays)
+    std_dev_values = [float(val) for val in flags.std_dev_values.split(",")]
+    mean_values = [float(val) for val in flags.mean_values.split(",")]
+    converter.quantized_input_stats = dict(
+        zip(input_arrays, zip(mean_values, std_dev_values)))
+
+  # Compare against None so that an explicitly supplied False is honored.
+  if flags.drop_control_dependency is not None:
+    converter.drop_control_dependency = flags.drop_control_dependency
+  if flags.allow_custom_ops is not None:
+    converter.allow_custom_ops = flags.allow_custom_ops
+
+  output_data = converter.convert()
+  with open(flags.output_file, "wb") as f:
+    f.write(output_data)
+
+
+def _check_flags(flags, unparsed):
+  """Validates the command line flags before a conversion is attempted.
+
+  Unparsed flags only produce printed warnings. Parsed flags that are
+  inconsistent with each other raise an error.
+
+  Args:
+    flags: argparse.Namespace object containing TFLite flags.
+    unparsed: List of unparsed flags.
+
+  Raises:
+    ValueError: Invalid flags.
+  """
+  # Warn about unparsed flags and hint at flags renamed since previous TOCO.
+  if unparsed:
+    print("tflite_convert: warning: Unable to parse following flags "
+          "'{}'".format(",".join(unparsed)))
+    for unknown_flag in unparsed:
+      if "--input_file=" in unknown_flag:
+        print("tflite_convert: warning: Use --graph_def_file instead of "
+              "--input_file")
+      if "--std_values=" in unknown_flag:
+        print("tflite_convert: warning: Use --std_dev_values instead of "
+              "--std_values")
+
+  # A GraphDef file needs both array lists to be given explicitly.
+  if flags.graph_def_file and not (flags.input_arrays and flags.output_arrays):
+    raise ValueError("--input_arrays and --output_arrays are required with "
+                     "--graph_def_file")
+
+  if flags.input_shapes:
+    if not flags.input_arrays:
+      raise ValueError("--input_shapes must be used with --input_arrays")
+    if flags.input_arrays.count(",") != flags.input_shapes.count(":"):
+      raise ValueError("--input_shapes and --input_arrays must have the same "
+                       "number of items")
+
+  if flags.std_dev_values or flags.mean_values:
+    if not (flags.std_dev_values and flags.mean_values):
+      raise ValueError("--std_dev_values and --mean_values must be used "
+                       "together")
+    if not flags.input_arrays:
+      raise ValueError("--std_dev_values and --mean_values must be used with "
+                       "--input_arrays")
+    num_items = flags.input_arrays.count(",")
+    per_flag = [flags.std_dev_values.count(","), flags.mean_values.count(",")]
+    if per_flag != [num_items, num_items]:
+      raise ValueError("--std_dev_values, --mean_values, and --input_arrays "
+                       "must have the same number of items")
+
+
+def run_main(_):
+  """Parses the tflite_convert command line flags and runs the conversion."""
+  parser = argparse.ArgumentParser(
+      description=("Command line tool to run TensorFlow Lite Optimizing "
+                   "Converter (TOCO)."))
+
+  # Output file flag.
+  parser.add_argument(
+      "--output_file",
+      type=str,
+      help="Full filepath of the output file.",
+      required=True)
+
+  # Input file flags.
+  input_file_group = parser.add_mutually_exclusive_group(required=True)
+  input_file_group.add_argument(
+      "--graph_def_file",
+      type=str,
+      help="Full filepath of file containing TensorFlow GraphDef.")
+  input_file_group.add_argument(
+      "--saved_model_dir",
+      type=str,
+      help="Full filepath of directory containing the SavedModel.")
+
+  # Model format flags.
+  parser.add_argument(
+      "--output_format",
+      type=str,
+      choices=["TFLITE", "GRAPHVIZ_DOT"],
+      help="Output file format.")
+  parser.add_argument(
+      "--inference_type",
+      type=str,
+      choices=["FLOAT", "QUANTIZED_UINT8"],
+      help="Target data type of arrays in the output file.")
+
+  # Input and output arrays flags.
+  parser.add_argument(
+      "--input_arrays",
+      type=str,
+      help="Names of the input arrays, comma-separated.")
+  parser.add_argument(
+      "--input_shapes",
+      type=str,
+      help="Shapes corresponding to --input_arrays, colon-separated.")
+  parser.add_argument(
+      "--output_arrays",
+      type=str,
+      help="Names of the output arrays, comma-separated.")
+
+  # SavedModel related flags.
+  parser.add_argument(
+      "--saved_model_tag_set",
+      type=str,
+      help=("Set of tags identifying the MetaGraphDef within the SavedModel "
+            "to analyze. All tags must be present. (default \"serve\")"))
+  parser.add_argument(
+      "--saved_model_signature_key",
+      type=str,
+      help=("Key identifying SignatureDef containing inputs and outputs. "
+            "(default DEFAULT_SERVING_SIGNATURE_DEF_KEY)"))
+
+  # Quantization flags.
+  parser.add_argument(
+      "--std_dev_values",
+      type=str,
+      help=("Standard deviation of training data for each input tensor, "
+            "comma-separated. Used for quantization. (default None)"))
+  parser.add_argument(
+      "--mean_values",
+      type=str,
+      help=("Mean of training data for each input tensor, comma-separated. "
+            "Used for quantization. (default None)"))
+
+  # Graph manipulation flags (bool("False") is True, so parse the text).
+  parser.add_argument(
+      "--drop_control_dependency",
+      type=lambda val: val.lower() in ("true", "t", "1", "yes", "y"),
+      help=("Boolean indicating whether to drop control dependencies silently. "
+            "This is due to TensorFlow Lite not supporting control "
+            "dependencies. (default True)"))
+  parser.add_argument(
+      "--allow_custom_ops",
+      type=lambda val: val.lower() in ("true", "t", "1", "yes", "y"),
+      help=("Boolean indicating whether to allow custom operations. When false "
+            "any unknown operation is an error. When true, custom ops are "
+            "created for any op that is unknown. The developer will need to "
+            "provide these to the TensorFlow Lite runtime with a custom "
+            "resolver. (default False)"))
+
+  tflite_flags, unparsed = parser.parse_known_args(args=sys.argv[1:])
+  try:
+    _check_flags(tflite_flags, unparsed)
+  except ValueError as e:
+    parser.print_usage()
+    file_name = os.path.basename(sys.argv[0])
+    sys.stderr.write("{0}: error: {1}\n".format(file_name, str(e)))
+    sys.exit(1)
+  _convert_model(tflite_flags)
+
+
+def main():
+  app.run(main=run_main, argv=sys.argv[:1])  # run_main parses sys.argv[1:].
+
+
+if __name__ == "__main__":
+  main()
diff --git a/tensorflow/contrib/lite/toco/g3doc/python_api.md b/tensorflow/contrib/lite/toco/g3doc/python_api.md
index 29a83bd26f..e5f6a0b500 100644
--- a/tensorflow/contrib/lite/toco/g3doc/python_api.md
+++ b/tensorflow/contrib/lite/toco/g3doc/python_api.md
@@ -12,8 +12,8 @@ Table of contents:
 *   [High-level overview](#high-level-overview)
 *   [API](#api)
 *   [Basic examples](#basic)
-    *   [Exporting a GraphDef with constants](#basic-graphdef-const)
-    *   [Exporting a GraphDef with variables](#basic-graphdef-var)
+    *   [Exporting a GraphDef from tf.Session](#basic-graphdef-sess)
+    *   [Exporting a GraphDef from file](#basic-graphdef-file)
     *   [Exporting a SavedModel](#basic-savedmodel)
 *   [Complex examples](#complex)
     *   [Exporting a quantized GraphDef](#complex-quant)
@@ -50,17 +50,17 @@ possible.
 The following section shows examples of how to convert a basic float-point model
 from each of the supported data formats into a TensorFlow Lite FlatBuffers.
 
-### Exporting a GraphDef with constants <a name="basic-graphdef-const"></a>
+### Exporting a GraphDef from tf.Session <a name="basic-graphdef-sess"></a>
 
-The following example shows how to convert a TensorFlow GraphDef with constants
-into a TensorFlow Lite FlatBuffer.
+The following example shows how to convert a TensorFlow GraphDef into a
+TensorFlow Lite FlatBuffer from a `tf.Session` object.
 
 ```python
 import tensorflow as tf
 
 img = tf.placeholder(name="img", dtype=tf.float32, shape=(1, 64, 64, 3))
-const = tf.constant([1., 2., 3.]) + tf.constant([1., 4., 4.])
-val = img + const
+var = tf.get_variable("weights", dtype=tf.float32, shape=(1, 64, 64, 3))
+val = img + var
 out = tf.identity(val, name="out")
 
 with tf.Session() as sess:
@@ -69,25 +69,28 @@ with tf.Session() as sess:
   open("converted_model.tflite", "wb").write(tflite_model)
 ```
 
-### Exporting a GraphDef with variables <a name="basic-graphdef-var"></a>
+### Exporting a GraphDef from file <a name="basic-graphdef-file"></a>
 
-If a model has variables, they need to be turned into constants through a
-process known as freezing. It can be accomplished by setting `freeze_variables`
-to `True` as shown in the example below.
+The following example shows how to convert a TensorFlow GraphDef into a
+TensorFlow Lite FlatBuffer when the GraphDef is stored in a file. Both `.pb` and
+`.pbtxt` files are accepted.
+
+The example uses
+[Mobilenet_1.0_224](https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_1.0_224_frozen.tgz).
+The function only supports GraphDefs frozen via
+[freeze_graph.py](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/tools/freeze_graph.py).
 
 ```python
 import tensorflow as tf
 
-img = tf.placeholder(name="img", dtype=tf.float32, shape=(1, 64, 64, 3))
-var = tf.get_variable("weights", dtype=tf.float32, shape=(1, 64, 64, 3))
-val = img + var
-out = tf.identity(val, name="out")
+graph_def_file = "/path/to/Downloads/mobilenet_v1_1.0_224/frozen_graph.pb"
+input_arrays = ["input"]
+output_arrays = ["MobilenetV1/Predictions/Softmax"]
 
-with tf.Session() as sess:
-  converter = tf.contrib.lite.TocoConverter.from_session(sess, [img], [out],
-                                                        freeze_variables=True)
-  tflite_model = converter.convert()
-  open("converted_model.tflite", "wb").write(tflite_model)
+converter = tf.contrib.lite.TocoConverter.from_flatbuffer_file(
+  graph_def_file, input_arrays, output_arrays)
+tflite_model = converter.convert()
+open("converted_model.tflite", "wb").write(tflite_model)
 ```
 
 ### Exporting a SavedModel <a name="basic-savedmodel"></a>
@@ -111,8 +114,8 @@ available by running `help(tf.contrib.lite.TocoConverter)`.
 ## Complex examples <a name="complex"></a>
 
 For models where the default value of the attributes is not sufficient, the
-variables values should be set before calling `convert()`. In order to call any
-constants use `tf.contrib.lite.constants.<CONSTANT_NAME>` as seen below with
+attribute's values should be set before calling `convert()`. In order to call
+any constants use `tf.contrib.lite.constants.<CONSTANT_NAME>` as seen below with
 `QUANTIZED_UINT8`. Run `help(tf.contrib.lite.TocoConverter)` in the Python
 terminal for detailed documentation on the attributes.
 
@@ -135,7 +138,7 @@ out = tf.fake_quant_with_min_max_args(val, min=0., max=1., name="output")
 with tf.Session() as sess:
   converter = tf.contrib.lite.TocoConverter.from_session(sess, [img], [out])
   converter.inference_type = tf.contrib.lite.constants.QUANTIZED_UINT8
-  converter.quantized_input_stats = [(0., 1.)]  # mean, std_dev
+  converter.quantized_input_stats = {"img" : (0., 1.)}  # mean, std_dev
   tflite_model = converter.convert()
   open("converted_model.tflite", "wb").write(tflite_model)
 ```
diff --git a/tensorflow/contrib/lite/toco/python/BUILD b/tensorflow/contrib/lite/toco/python/BUILD
index 8cac568bd7..a954f1d6ba 100644
--- a/tensorflow/contrib/lite/toco/python/BUILD
+++ b/tensorflow/contrib/lite/toco/python/BUILD
@@ -41,12 +41,6 @@ py_binary(
     ],
 )
 
-py_binary(
-    name = "toco_wrapper",
-    srcs = ["toco_wrapper.py"],
-    srcs_version = "PY2AND3",
-)
-
 tf_py_test(
     name = "toco_from_protos_test",
     srcs = ["toco_from_protos_test.py"],
diff --git a/tensorflow/contrib/lite/toco/python/toco_wrapper.py b/tensorflow/contrib/lite/toco/python/toco_wrapper.py
deleted file mode 100644
index 6d6b500d7e..0000000000
--- a/tensorflow/contrib/lite/toco/python/toco_wrapper.py
+++ /dev/null
@@ -1,40 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Wrapper for runninmg toco binary embedded in pip site-package.
-
-NOTE: this mainly exists since PIP setup.py cannot install binaries to bin/.
-It can only install Python "console-scripts." This will work as a console
-script. See tools/pip_package/setup.py (search for CONSOLE_SCRIPTS).
-"""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import sys
-
-
-def main():
-  # Pip installs the binary in aux-bin off of main site-package install.
-  # Just find it and exec, passing all arguments in the process.
-  # TODO(aselle): it is unfortunate to use all of tensorflow to lookup binary.
-  print("""TOCO from pip install is currently not working on command line.
-Please use the python TOCO API or use
-bazel run tensorflow/contrib/lite:toco -- <args> from a TensorFlow source dir.
-""")
-  sys.exit(1)
-  # TODO(aselle): Replace this when we find a way to run toco without
-  # blowing up executable size.
-  # binary = os.path.join(tf.__path__[0], 'aux-bin/toco')
-  # os.execvp(binary, sys.argv)
diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index 677ea65edd..e113565f45 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -173,9 +173,7 @@ sh_binary(
         "//conditions:default": COMMON_PIP_DEPS + [
             ":simple_console",
             "//tensorflow/contrib/lite/python:interpreter_test_data",
-            "//tensorflow/contrib/lite/python:tf_lite_py_pip",
-            "//tensorflow/contrib/lite/toco:toco",
-            "//tensorflow/contrib/lite/toco/python:toco_wrapper",
+            "//tensorflow/contrib/lite/python:tflite_convert",
             "//tensorflow/contrib/lite/toco/python:toco_from_protos",
         ],
     }) + if_mkl(["//third_party/mkl:intel_binary_blob"]) + if_tensorrt([
diff --git a/tensorflow/tools/pip_package/build_pip_package.sh b/tensorflow/tools/pip_package/build_pip_package.sh
index 1a83c6e757..0c4065bc77 100755
--- a/tensorflow/tools/pip_package/build_pip_package.sh
+++ b/tensorflow/tools/pip_package/build_pip_package.sh
@@ -148,9 +148,7 @@ function main() {
     fi
     mkdir "${TMPDIR}/tensorflow/aux-bin"
     # Install toco as a binary in aux-bin.
-    # TODO(aselle): Re-enable this when we find a way to do it without doubling
-    # the whl size (over the limit).
-    # cp bazel-bin/tensorflow/contrib/lite/toco/toco ${TMPDIR}/tensorflow/aux-bin/
+    cp bazel-bin/tensorflow/contrib/lite/python/tflite_convert ${TMPDIR}/tensorflow/aux-bin/
   fi
 
   # protobuf pip package doesn't ship with header files. Copy the headers
diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py
index 70e6662763..d25a9e77b1 100644
--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@@ -95,7 +95,8 @@ if sys.version_info < (3, 4):
 CONSOLE_SCRIPTS = [
     'freeze_graph = tensorflow.python.tools.freeze_graph:run_main',
     'toco_from_protos = tensorflow.contrib.lite.toco.python.toco_from_protos:main',
-    'toco = tensorflow.contrib.lite.toco.python.toco_wrapper:main',
+    'tflite_convert = tensorflow.contrib.lite.python.tflite_convert:main',
+    'toco = tensorflow.contrib.lite.python.tflite_convert:main',
     'saved_model_cli = tensorflow.python.tools.saved_model_cli:main',
     # We need to keep the TensorBoard command, even though the console script
     # is now declared by the tensorboard pip package. If we remove the
-- 
GitLab


From c86a47448534b135cdba106b59aee2492889ff75 Mon Sep 17 00:00:00 2001
From: Justin Lebar <jlebar@google.com>
Date: Wed, 30 May 2018 17:59:50 -0700
Subject: [PATCH 087/610] [XLA] Add parsers for Window and
 ConvolutionDimensionNumbers.

Also modify relevant ToString functions so we can have the property
Parse(ToString(x)) == x.

PiperOrigin-RevId: 198650340
---
 .../compiler/xla/service/hlo_instruction.cc   | 79 +++++++------------
 .../compiler/xla/service/hlo_instruction.h    |  6 +-
 tensorflow/compiler/xla/tools/parser/BUILD    |  1 +
 .../compiler/xla/tools/parser/hlo_parser.cc   | 63 ++++++++++++---
 .../compiler/xla/tools/parser/hlo_parser.h    | 11 ++-
 .../xla/tools/parser/hlo_parser_test.cc       | 21 +++++
 6 files changed, 117 insertions(+), 64 deletions(-)

diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc
index dc351e9968..c55e5cf793 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction.cc
@@ -2299,7 +2299,9 @@ std::vector<string> HloInstruction::ExtraAttributesToString(
   }
 
   if (convolution_dimension_numbers_ != nullptr) {
-    extra.push_back(ConvolutionDimensionNumbersToString());
+    extra.push_back(StrCat(
+        "dim_labels=",
+        ConvolutionDimensionNumbersToString(*convolution_dimension_numbers_)));
   }
   if (dot_dimension_numbers_ != nullptr) {
     extra.push_back(DotDimensionNumbersToString());
@@ -3419,42 +3421,8 @@ string RandomDistributionToString(const RandomDistribution& distribution) {
   return tensorflow::str_util::Lowercase(RandomDistribution_Name(distribution));
 }
 
-StatusOr<RandomDistribution> StringToRandomDistribution(const string& name) {
-  static std::unordered_map<string, RandomDistribution>* map = [] {
-    static auto* map = new std::unordered_map<string, RandomDistribution>;
-    for (int i = 0; i < RandomDistribution_ARRAYSIZE; i++) {
-      if (RandomDistribution_IsValid(i)) {
-        auto value = static_cast<RandomDistribution>(i);
-        (*map)[RandomDistributionToString(value)] = value;
-      }
-    }
-    return map;
-  }();
-  auto found = map->find(tensorflow::str_util::Lowercase(name));
-  if (found == map->end()) {
-    return InvalidArgument("Unknown distribution");
-  }
-  return found->second;
-}
-
-std::ostream& operator<<(std::ostream& os, HloInstruction::FusionKind kind) {
-  return os << ToString(kind);
-}
-
-string HloInstruction::ConvolutionDimensionNumbersToString() const {
-  string result;
-  if (convolution_dimension_numbers_ == nullptr) {
-    return result;
-  }
-  const ConvolutionDimensionNumbers& dnums = *convolution_dimension_numbers_;
-  // Show the given dimension labels in order of major to minor based on the
-  // shape's layout.
-  const auto append_dims = [&](const std::vector<string>& dims,
-                               const Shape& shape) {
-    CHECK_EQ(dims.size(), ShapeUtil::Rank(shape));
-    StrAppend(&result, Join(dims, ""));
-  };
-
+string ConvolutionDimensionNumbersToString(
+    const ConvolutionDimensionNumbers& dnums) {
   // lhs_dims[i] is the symbol of the logical dimension i for the lhs
   // operand. E.g. if batch has dimension number 2, then lhs_dims[2] == "b".
   std::vector<string> lhs_dims(2 + dnums.input_spatial_dimensions().size());
@@ -3478,19 +3446,8 @@ string HloInstruction::ConvolutionDimensionNumbersToString() const {
     output_dims[dnums.output_spatial_dimensions(i)] = StrCat(i);
   }
 
-  result += "dim_labels=";
-  append_dims(lhs_dims, operand(0)->shape());
-  result += "_";
-  append_dims(rhs_dims, operand(1)->shape());
-  result += "->";
-
-  // A convolution can be represented as a kConvolution HLO or as a CustomCall
-  // that returns a tuple, the first element of which is the result of the
-  // convolution.
-  Shape this_shape =
-      ShapeUtil::IsTuple(shape()) ? shape().tuple_shapes(0) : shape();
-  append_dims(output_dims, this_shape);
-  return result;
+  return StrCat(Join(lhs_dims, ""), "_", Join(rhs_dims, ""), "->",
+                Join(output_dims, ""));
 }
 
 string HloInstruction::DotDimensionNumbersToString() const {
@@ -3516,6 +3473,28 @@ string HloInstruction::DotDimensionNumbersToString() const {
   return Join(result, ", ");
 }
 
+StatusOr<RandomDistribution> StringToRandomDistribution(const string& name) {
+  static std::unordered_map<string, RandomDistribution>* map = [] {
+    static auto* map = new std::unordered_map<string, RandomDistribution>;
+    for (int i = 0; i < RandomDistribution_ARRAYSIZE; i++) {
+      if (RandomDistribution_IsValid(i)) {
+        auto value = static_cast<RandomDistribution>(i);
+        (*map)[RandomDistributionToString(value)] = value;
+      }
+    }
+    return map;
+  }();
+  auto found = map->find(tensorflow::str_util::Lowercase(name));
+  if (found == map->end()) {
+    return InvalidArgument("Unknown distribution");
+  }
+  return found->second;
+}
+
+std::ostream& operator<<(std::ostream& os, HloInstruction::FusionKind kind) {
+  return os << ToString(kind);
+}
+
 string HloInstruction::GatherDimensionNumbersToString() const {
   CHECK_NE(gather_dimension_numbers_.get(), nullptr);
   string output_window_dims =
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h
index 6df97c40ba..8119c35066 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.h
+++ b/tensorflow/compiler/xla/service/hlo_instruction.h
@@ -1313,9 +1313,6 @@ class HloInstruction {
     return fft_length_;
   }
 
-  // Returns the dump string of the convolution dimension numbers.
-  string ConvolutionDimensionNumbersToString() const;
-
   // Returns data on the dimension numbers used for a dot operation.
   const DotDimensionNumbers& dot_dimension_numbers() const {
     CHECK(dot_dimension_numbers_ != nullptr);
@@ -1749,6 +1746,9 @@ StatusOr<HloInstruction::FusionKind> StringToFusionKind(
 string PaddingConfigToString(const PaddingConfig& padding);
 string OpMetadataToString(const OpMetadata& metadata);
 string RandomDistributionToString(const RandomDistribution& distribution);
+string ConvolutionDimensionNumbersToString(
+    const ConvolutionDimensionNumbers& dnums);
+
 StatusOr<RandomDistribution> StringToRandomDistribution(const string& name);
 
 std::ostream& operator<<(std::ostream& os, HloInstruction::FusionKind kind);
diff --git a/tensorflow/compiler/xla/tools/parser/BUILD b/tensorflow/compiler/xla/tools/parser/BUILD
index 0fa4b98d0a..76f35afd53 100644
--- a/tensorflow/compiler/xla/tools/parser/BUILD
+++ b/tensorflow/compiler/xla/tools/parser/BUILD
@@ -65,6 +65,7 @@ tf_cc_test(
     srcs = ["hlo_parser_test.cc"],
     deps = [
         ":hlo_parser",
+        "//tensorflow/compiler/xla:window_util",
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
diff --git a/tensorflow/compiler/xla/tools/parser/hlo_parser.cc b/tensorflow/compiler/xla/tools/parser/hlo_parser.cc
index 134978d21f..3c1d63ab86 100644
--- a/tensorflow/compiler/xla/tools/parser/hlo_parser.cc
+++ b/tensorflow/compiler/xla/tools/parser/hlo_parser.cc
@@ -56,10 +56,10 @@ class HloParser {
   // Returns the error information.
   string GetError() const { return Join(error_, "\n"); }
 
-  // Stand alone parsing for sharding. The parser string is supposed to
-  // contain the body of the sharding, i.e. just the rhs of the "sharding={...}"
-  // attribute string.
+  // Stand alone parsing utils for various aggregate data types.
   StatusOr<HloSharding> ParseShardingOnly();
+  StatusOr<Window> ParseWindowOnly();
+  StatusOr<ConvolutionDimensionNumbers> ParseConvolutionDimensionNumbersOnly();
 
  private:
   // ParseXXX returns false if an error occurred.
@@ -169,7 +169,9 @@ class HloParser {
   bool ParseComputationName(HloComputation** value);
   // Parses a list of names and finds the corresponding hlo instructions.
   bool ParseInstructionNames(std::vector<HloInstruction*>* instructions);
-  bool ParseWindow(Window* window);
+  // Pass expect_outer_curlies == true when parsing a Window in the context of a
+  // larger computation.  Pass false when parsing a stand-alone Window string.
+  bool ParseWindow(Window* window, bool expect_outer_curlies);
   bool ParseConvolutionDimensionNumbers(ConvolutionDimensionNumbers* dnums);
   bool ParsePaddingConfig(PaddingConfig* padding);
   bool ParseMetadata(OpMetadata* metadata);
@@ -1933,7 +1935,7 @@ bool HloParser::ParseAttributeHelper(
       }
       case AttrTy::kWindow: {
         Window result;
-        if (!ParseWindow(&result)) {
+        if (!ParseWindow(&result, /*expect_outer_curlies=*/true)) {
           return false;
         }
         static_cast<optional<Window>*>(attr_out_ptr)->emplace(result);
@@ -2051,9 +2053,10 @@ bool HloParser::ParseComputationName(HloComputation** value) {
 // ::= '{' size stride? pad? lhs_dilate? rhs_dilate? '}'
 // The subattributes can appear in any order. 'size=' is required, others are
 // optional.
-bool HloParser::ParseWindow(Window* window) {
+bool HloParser::ParseWindow(Window* window, bool expect_outer_curlies) {
   LocTy loc = lexer_.GetLoc();
-  if (!ParseToken(TokKind::kLbrace, "expected '{' to start window attribute")) {
+  if (expect_outer_curlies &&
+      !ParseToken(TokKind::kLbrace, "expected '{' to start window attribute")) {
     return false;
   }
 
@@ -2063,7 +2066,9 @@ bool HloParser::ParseWindow(Window* window) {
   std::vector<int64> lhs_dilate;
   std::vector<int64> rhs_dilate;
   std::vector<int64> rhs_reversal;
-  while (lexer_.GetKind() != TokKind::kRbrace) {
+  const auto end_token =
+      expect_outer_curlies ? TokKind::kRbrace : TokKind::kEof;
+  while (lexer_.GetKind() != end_token) {
     LocTy attr_loc = lexer_.GetLoc();
     string field_name;
     if (!ParseAttributeName(&field_name)) {
@@ -2127,7 +2132,8 @@ bool HloParser::ParseWindow(Window* window) {
     window->mutable_dimensions(i)->set_window_reversal(
         rhs_reversal.empty() ? false : (rhs_reversal[i] == 1));
   }
-  return ParseToken(TokKind::kRbrace, "expected '}' to end window attribute");
+  return !expect_outer_curlies ||
+         ParseToken(TokKind::kRbrace, "expected '}' to end window attribute");
 }
 
 // This is the inverse of HloInstruction::ConvolutionDimensionNumbersToString.
@@ -2692,6 +2698,32 @@ StatusOr<HloSharding> HloParser::ParseShardingOnly() {
   return HloSharding::FromProto(op_sharding);
 }
 
+StatusOr<Window> HloParser::ParseWindowOnly() {
+  lexer_.Lex();
+  Window window;
+  if (!ParseWindow(&window, /*expect_outer_curlies=*/false)) {
+    return InvalidArgument("Syntax error:\n%s", GetError().c_str());
+  }
+  if (lexer_.GetKind() != TokKind::kEof) {
+    return InvalidArgument("Syntax error:\nExtra content after window");
+  }
+  return window;
+}
+
+StatusOr<ConvolutionDimensionNumbers>
+HloParser::ParseConvolutionDimensionNumbersOnly() {
+  lexer_.Lex();
+  ConvolutionDimensionNumbers dnums;
+  if (!ParseConvolutionDimensionNumbers(&dnums)) {
+    return InvalidArgument("Syntax error:\n%s", GetError().c_str());
+  }
+  if (lexer_.GetKind() != TokKind::kEof) {
+    return InvalidArgument(
+        "Syntax error:\nExtra content after convolution dnums");
+  }
+  return dnums;
+}
+
 }  // namespace
 
 StatusOr<std::unique_ptr<HloModule>> Parse(StringPiece str,
@@ -2714,5 +2746,18 @@ StatusOr<HloSharding> ParseSharding(tensorflow::StringPiece str) {
   return parser.ParseShardingOnly();
 }
 
+StatusOr<Window> ParseWindow(tensorflow::StringPiece str) {
+  HloModuleConfig config;
+  HloParser parser(str, config);
+  return parser.ParseWindowOnly();
+}
+
+StatusOr<ConvolutionDimensionNumbers> ParseConvolutionDimensionNumbers(
+    tensorflow::StringPiece str) {
+  HloModuleConfig config;
+  HloParser parser(str, config);
+  return parser.ParseConvolutionDimensionNumbersOnly();
+}
+
 }  // namespace tools
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tools/parser/hlo_parser.h b/tensorflow/compiler/xla/tools/parser/hlo_parser.h
index f7854f403e..902c45cebc 100644
--- a/tensorflow/compiler/xla/tools/parser/hlo_parser.h
+++ b/tensorflow/compiler/xla/tools/parser/hlo_parser.h
@@ -36,10 +36,17 @@ StatusOr<std::unique_ptr<HloModule>> Parse(tensorflow::StringPiece str,
 // format, parses the string and creates a HloModule with default config.
 StatusOr<std::unique_ptr<HloModule>> Parse(tensorflow::StringPiece str);
 
-// Parse sharding from str. str is supposed to contain the body of the
-// sharding, i.e. just the rhs of the "sharding={...}" attribute string.
+// Parses the result of HloSharding::ToString(), e.g. "{replicated}".
 StatusOr<HloSharding> ParseSharding(tensorflow::StringPiece str);
 
+// Parses the result of window_util::ToString(const Window&).
+StatusOr<Window> ParseWindow(tensorflow::StringPiece str);
+
+// Parses the result of ConvolutionDimensionNumbersToString(), e.g.
+// "b0f_0io->b0f".
+StatusOr<ConvolutionDimensionNumbers> ParseConvolutionDimensionNumbers(
+    tensorflow::StringPiece str);
+
 }  // namespace tools
 }  // namespace xla
 
diff --git a/tensorflow/compiler/xla/tools/parser/hlo_parser_test.cc b/tensorflow/compiler/xla/tools/parser/hlo_parser_test.cc
index 183b1121cd..f7a27cf9cc 100644
--- a/tensorflow/compiler/xla/tools/parser/hlo_parser_test.cc
+++ b/tensorflow/compiler/xla/tools/parser/hlo_parser_test.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
 
 #include <string>
+#include "tensorflow/compiler/xla/window_util.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/strings/str_util.h"
@@ -1349,6 +1350,26 @@ ENTRY entry {
       "was parsing 8:39: error: instruction does not exist: aparam");
 }
 
+TEST_F(HloParserTest, ParseSharding) {
+  const string original = "{maximal device=42}";
+  TF_ASSERT_OK_AND_ASSIGN(HloSharding sharding, ParseSharding(original));
+  EXPECT_EQ(sharding.ToString(), original);
+}
+
+TEST_F(HloParserTest, ParseWindow) {
+  Window original = window_util::MakeWindow({1, 2, 3});
+  TF_ASSERT_OK_AND_ASSIGN(Window parsed,
+                          ParseWindow(window_util::ToString(original)))
+  EXPECT_EQ(window_util::ToString(original), window_util::ToString(parsed));
+}
+
+TEST_F(HloParserTest, ParseConvolutionDimensionNumbers) {
+  const string original = "b0f_0io->b0f";
+  TF_ASSERT_OK_AND_ASSIGN(ConvolutionDimensionNumbers dnums,
+                          ParseConvolutionDimensionNumbers(original));
+  EXPECT_EQ(original, ConvolutionDimensionNumbersToString(dnums));
+}
+
 }  // namespace
 }  // namespace tools
 }  // namespace xla
-- 
GitLab


From 69340bdffcc1507e39880decfb467f8d68981a86 Mon Sep 17 00:00:00 2001
From: Ruoxin Sang <rxsang@google.com>
Date: Wed, 30 May 2018 18:11:10 -0700
Subject: [PATCH 088/610] Remove code returning bad status when the input
 pointer is nullptr in internal functions. That would be a programming error,
 and we have full control of internal functions, so it is OK to crash if it
 happens.

PiperOrigin-RevId: 198651749
---
 .../core/platform/cloud/gcs_file_system.cc    | 28 -------------------
 1 file changed, 28 deletions(-)

diff --git a/tensorflow/core/platform/cloud/gcs_file_system.cc b/tensorflow/core/platform/cloud/gcs_file_system.cc
index 5f612b5f53..d3a1489b9c 100644
--- a/tensorflow/core/platform/cloud/gcs_file_system.cc
+++ b/tensorflow/core/platform/cloud/gcs_file_system.cc
@@ -129,9 +129,6 @@ constexpr char kInitialTokens[] = "GCS_INITIAL_TOKENS";
 
 // TODO: DO NOT use a hardcoded path
 Status GetTmpFilename(string* filename) {
-  if (!filename) {
-    return errors::Internal("'filename' cannot be nullptr.");
-  }
 #ifndef _WIN32
   char buffer[] = "/tmp/gcs_filesystem_XXXXXX";
   int fd = mkstemp(buffer);
@@ -158,9 +155,6 @@ Status GetTmpFilename(string* filename) {
 /// object is empty.
 Status ParseGcsPath(StringPiece fname, bool empty_object_ok, string* bucket,
                     string* object) {
-  if (!bucket || !object) {
-    return errors::Internal("bucket and object cannot be null.");
-  }
   StringPiece scheme, bucketp, objectp;
   io::ParseURI(fname, &scheme, &bucketp, &objectp);
   if (scheme != "gs") {
@@ -448,9 +442,6 @@ class GcsWritableFile : public WritableFile {
   }
 
   Status GetCurrentFileSize(uint64* size) {
-    if (size == nullptr) {
-      return errors::Internal("'size' cannot be nullptr");
-    }
     const auto tellp = outfile_.tellp();
     if (tellp == static_cast<std::streampos>(-1)) {
       return errors::Internal(
@@ -462,9 +453,6 @@ class GcsWritableFile : public WritableFile {
 
   /// Initiates a new resumable upload session.
   Status CreateNewUploadSession(string* session_uri) {
-    if (session_uri == nullptr) {
-      return errors::Internal("'session_uri' cannot be nullptr.");
-    }
     uint64 file_size;
     TF_RETURN_IF_ERROR(GetCurrentFileSize(&file_size));
 
@@ -498,9 +486,6 @@ class GcsWritableFile : public WritableFile {
   /// uploaded size in bytes.
   Status RequestUploadSessionStatus(const string& session_uri, bool* completed,
                                     uint64* uploaded) {
-    if (completed == nullptr || uploaded == nullptr) {
-      return errors::Internal("'completed' and 'uploaded' cannot be nullptr.");
-    }
     uint64 file_size;
     TF_RETURN_IF_ERROR(GetCurrentFileSize(&file_size));
 
@@ -984,9 +969,6 @@ Status GcsFileSystem::FileExists(const string& fname) {
 
 Status GcsFileSystem::ObjectExists(const string& fname, const string& bucket,
                                    const string& object, bool* result) {
-  if (!result) {
-    return errors::Internal("'result' cannot be nullptr.");
-  }
   GcsFileStat stat;
   const Status status = StatForObject(fname, bucket, object, &stat);
   switch (status.code()) {
@@ -1058,9 +1040,6 @@ Status GcsFileSystem::UncachedStatForObject(const string& fname,
 
 Status GcsFileSystem::StatForObject(const string& fname, const string& bucket,
                                     const string& object, GcsFileStat* stat) {
-  if (!stat) {
-    return errors::Internal("'stat' cannot be nullptr.");
-  }
   if (object.empty()) {
     return errors::InvalidArgument(strings::Printf(
         "'object' must be a non-empty string. (File: %s)", fname.c_str()));
@@ -1075,10 +1054,6 @@ Status GcsFileSystem::StatForObject(const string& fname, const string& bucket,
 }
 
 Status GcsFileSystem::BucketExists(const string& bucket, bool* result) {
-  if (!result) {
-    return errors::Internal("'result' cannot be nullptr.");
-  }
-
   std::unique_ptr<HttpRequest> request;
   TF_RETURN_IF_ERROR(CreateHttpRequest(&request));
   request->SetUri(strings::StrCat(kGcsUriBase, "b/", bucket));
@@ -1097,9 +1072,6 @@ Status GcsFileSystem::BucketExists(const string& bucket, bool* result) {
 }
 
 Status GcsFileSystem::FolderExists(const string& dirname, bool* result) {
-  if (!result) {
-    return errors::Internal("'result' cannot be nullptr.");
-  }
   StatCache::ComputeFunc compute_func = [this](const string& dirname,
                                                GcsFileStat* stat) {
     std::vector<string> children;
-- 
GitLab


From 1479382c92d371843199ec6eb888b05609bf288f Mon Sep 17 00:00:00 2001
From: Sanjoy Das <sanjoy@google.com>
Date: Wed, 30 May 2018 18:35:42 -0700
Subject: [PATCH 089/610] Expose xla_disable_hlo_passes via
 ExecutableBuildOptions.

PiperOrigin-RevId: 198654099
---
 tensorflow/compiler/xla/client/BUILD                     | 1 +
 .../compiler/xla/client/executable_build_options.h       | 9 +++++++++
 tensorflow/compiler/xla/service/local_service.cc         | 6 ++++++
 3 files changed, 16 insertions(+)

diff --git a/tensorflow/compiler/xla/client/BUILD b/tensorflow/compiler/xla/client/BUILD
index aacb394ae5..c4f0c4468f 100644
--- a/tensorflow/compiler/xla/client/BUILD
+++ b/tensorflow/compiler/xla/client/BUILD
@@ -86,6 +86,7 @@ cc_library(
     hdrs = ["executable_build_options.h"],
     deps = [
         "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/service:device_memory_allocator",
         "//tensorflow/core:lib",
diff --git a/tensorflow/compiler/xla/client/executable_build_options.h b/tensorflow/compiler/xla/client/executable_build_options.h
index 11f1098360..393da381fb 100644
--- a/tensorflow/compiler/xla/client/executable_build_options.h
+++ b/tensorflow/compiler/xla/client/executable_build_options.h
@@ -17,6 +17,7 @@ limitations under the License.
 #define TENSORFLOW_COMPILER_XLA_CLIENT_EXECUTABLE_BUILD_OPTIONS_H_
 
 #include "tensorflow/compiler/xla/service/device_memory_allocator.h"
+#include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/gtl/optional.h"
@@ -76,6 +77,13 @@ class ExecutableBuildOptions {
   ExecutableBuildOptions& set_hlo_profile(bool enabled);
   tensorflow::gtl::optional<bool> hlo_profile() const;
 
+  void add_disabled_hlo_pass(tensorflow::StringPiece pass_name) {
+    disabled_hlo_passes_.push_back(std::string(pass_name));
+  }
+  const tensorflow::gtl::ArraySlice<std::string> disabled_hlo_passes() const {
+    return disabled_hlo_passes_;
+  }
+
   // Returns a string representation of the build options, suitable for
   // debugging.
   string ToString() const;
@@ -89,6 +97,7 @@ class ExecutableBuildOptions {
   tensorflow::gtl::optional<string> dump_optimized_hlo_proto_to_;
   tensorflow::gtl::optional<string> dump_per_pass_hlo_proto_to_;
   DeviceMemoryAllocator* device_allocator_ = nullptr;
+  std::vector<std::string> disabled_hlo_passes_;
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/local_service.cc b/tensorflow/compiler/xla/service/local_service.cc
index 41aef3920c..f54b52beae 100644
--- a/tensorflow/compiler/xla/service/local_service.cc
+++ b/tensorflow/compiler/xla/service/local_service.cc
@@ -124,6 +124,12 @@ ExecutionOptions CreateExecutionOptions(
     LayoutUtil::SetToDefaultLayout(
         execution_options.mutable_shape_with_output_layout());
   }
+
+  for (const std::string& disabled_pass : build_options.disabled_hlo_passes()) {
+    execution_options.mutable_debug_options()->add_xla_disable_hlo_passes(
+        disabled_pass);
+  }
+
   return execution_options;
 }
 
-- 
GitLab


From d0f9424e22eb438f3d846fa62feaf331797e62c4 Mon Sep 17 00:00:00 2001
From: HyoukJoong Lee <hyouklee@google.com>
Date: Wed, 30 May 2018 18:43:40 -0700
Subject: [PATCH 090/610] Automated g4 rollback of changelist 195379693

PiperOrigin-RevId: 198654780
---
 .../xla/service/hlo_module_group_metadata.cc        |  7 +++++++
 .../xla/service/hlo_module_group_metadata.h         |  3 +++
 tensorflow/compiler/xla/service/service.cc          | 13 ++++++++++---
 3 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/tensorflow/compiler/xla/service/hlo_module_group_metadata.cc b/tensorflow/compiler/xla/service/hlo_module_group_metadata.cc
index 7d706b5fd0..f6fa45a6b7 100644
--- a/tensorflow/compiler/xla/service/hlo_module_group_metadata.cc
+++ b/tensorflow/compiler/xla/service/hlo_module_group_metadata.cc
@@ -247,6 +247,13 @@ tensorflow::gtl::optional<int64> HloModuleGroupMetadata::GetInstructionDevice(
   return device;
 }
 
+int64 HloModuleGroupMetadata::GetDeviceModulesCount() const {
+  return std::count_if(modules_.begin(), modules_.end(),
+                       [](const HloModule* module) {
+                         return !module->config().is_host_module();
+                       });
+}
+
 Status HloModuleGroupMetadata::RecordInstructions() {
   const auto visitor = [this](HloInstruction* hlo) -> Status {
     if (hlo->opcode() == HloOpcode::kWhile) {
diff --git a/tensorflow/compiler/xla/service/hlo_module_group_metadata.h b/tensorflow/compiler/xla/service/hlo_module_group_metadata.h
index 5f5bf27479..f68d4028dc 100644
--- a/tensorflow/compiler/xla/service/hlo_module_group_metadata.h
+++ b/tensorflow/compiler/xla/service/hlo_module_group_metadata.h
@@ -155,6 +155,9 @@ class HloModuleGroupMetadata {
   tensorflow::gtl::optional<int64> GetInstructionDevice(
       const HloInstruction& instruction) const;
 
+  // Returns the number of modules for devices (excluding the host module).
+  int64 GetDeviceModulesCount() const;
+
   // Returns the companion instructions for the given instruction.
   //
   // Precondition: IsCompanionWhile(instruction) is true.
diff --git a/tensorflow/compiler/xla/service/service.cc b/tensorflow/compiler/xla/service/service.cc
index cb0f76ebe4..5a813dcadc 100644
--- a/tensorflow/compiler/xla/service/service.cc
+++ b/tensorflow/compiler/xla/service/service.cc
@@ -624,9 +624,16 @@ Service::ExecuteParallelAndRegisterResult(
   // profiled.
   std::map<int64, se::Stream*> index_to_profiled_streams;
 
-  TF_ASSIGN_OR_RETURN(DeviceAssignment device_assignment,
-                      backend->computation_placer()->AssignDevices(
-                          options_.number_of_replicas(), executables.size()));
+  // Build DeviceAssignment for all cores based on the provided device handles.
+  DeviceAssignment device_assignment(options_.number_of_replicas(),
+                                     executables.size());
+  for (int64 i = 0; i < executables.size(); i++) {
+    TF_ASSIGN_OR_RETURN(auto replicas, Replicas(*backend, device_handles[i]));
+    CHECK_EQ(replicas.size(), arguments[i].size());
+    for (int64 replica = 0; replica < replicas.size(); ++replica) {
+      device_assignment(replica, i) = replicas[replica]->device_ordinal();
+    }
+  }
 
   for (int64 i = 0; i < executables.size(); i++) {
     // Stream executors for the replicas of the current computation.
-- 
GitLab


From 5be69b0c5e0087acedffe4e94a716c0b5ed320fb Mon Sep 17 00:00:00 2001
From: Allen Lavoie <allenl@google.com>
Date: Wed, 30 May 2018 19:01:58 -0700
Subject: [PATCH 091/610] Add a subclassed Model's attribute-assigned variables
 to Model.weights et al

Makes the Variable.trainable property public, which is sensible if we're discouraging use of the global collection (currently eager execution is using ResourceVariable._trainable in a bunch of places anyway). I'm leaving it read-only for now, since a setter would also need to move the variable in and out of the global trainable-variables collection whenever the value changes.

Same change for checkpointable data structures with respect to gathering extra variables. They'll behave like subclassed Models.

I think this makes more sense than trying to have a distinction between "variables" and "weights". It's also more sensible than collecting everything that would get checkpointed, since that will include Optimizer slot variables and metrics. Collecting those is generally pointless, and accidentally adding them to gradient tapes would be horribly confusing.

PiperOrigin-RevId: 198656079
---
 tensorflow/core/framework/variable.proto      |  3 +
 tensorflow/python/eager/function.py           |  2 +-
 tensorflow/python/eager/graph_callable.py     |  2 +-
 tensorflow/python/eager/pywrap_tfe_src.cc     |  4 +-
 tensorflow/python/keras/engine/network.py     | 52 +++++++++++-------
 .../python/keras/model_subclassing_test.py    | 45 +++++++++++++++
 tensorflow/python/keras/utils/layer_utils.py  | 55 +++++++++++++++++++
 .../resource_variable_ops_test.py             | 19 +++++++
 .../python/kernel_tests/variables_test.py     | 17 ++++++
 .../python/ops/resource_variable_ops.py       |  8 ++-
 tensorflow/python/ops/variable_scope.py       |  6 +-
 tensorflow/python/ops/variables.py            |  7 +++
 .../checkpointable/data_structures.py         | 36 +++++++-----
 .../checkpointable/data_structures_test.py    | 19 +++++++
 .../api/golden/tensorflow.-variable.pbtxt     |  4 ++
 15 files changed, 233 insertions(+), 46 deletions(-)

diff --git a/tensorflow/core/framework/variable.proto b/tensorflow/core/framework/variable.proto
index 93ae423bab..66ba4cba7d 100644
--- a/tensorflow/core/framework/variable.proto
+++ b/tensorflow/core/framework/variable.proto
@@ -26,6 +26,9 @@ message VariableDef {
 
   // Whether to represent this as a ResourceVariable.
   bool is_resource = 5;
+
+  // Whether this variable should be trained.
+  bool trainable = 7;
 }
 
 message SaveSliceInfoDef {
diff --git a/tensorflow/python/eager/function.py b/tensorflow/python/eager/function.py
index 23d87fb394..559063d6ae 100644
--- a/tensorflow/python/eager/function.py
+++ b/tensorflow/python/eager/function.py
@@ -494,7 +494,7 @@ class GraphModeFunction(object):
   def __call__(self, *args):
     """Executes the passed function in eager mode."""
     for v in self._variables:
-      if v._trainable:  # pylint: disable=protected-access
+      if v.trainable:
         tape.watch_variable(v)
 
     tensor_inputs = [x for x in nest.flatten(args) if isinstance(x, ops.Tensor)]
diff --git a/tensorflow/python/eager/graph_callable.py b/tensorflow/python/eager/graph_callable.py
index d9ffcbd203..760a148552 100644
--- a/tensorflow/python/eager/graph_callable.py
+++ b/tensorflow/python/eager/graph_callable.py
@@ -202,7 +202,7 @@ class _InitializingFunctionObject(object):
         v.handle).numpy() for v in self._call_fn.variables]
     if all(x for x in initialized):
       for v in self._call_fn.variables:
-        if v._trainable:  # pylint: disable=protected-access
+        if v.trainable:
           tape.watch_variable(v)
       return self._call_fn(*args)
     elif all(not x for x in initialized):
diff --git a/tensorflow/python/eager/pywrap_tfe_src.cc b/tensorflow/python/eager/pywrap_tfe_src.cc
index 52b90504f3..e3ce0ef9d0 100644
--- a/tensorflow/python/eager/pywrap_tfe_src.cc
+++ b/tensorflow/python/eager/pywrap_tfe_src.cc
@@ -1874,10 +1874,10 @@ PyObject* RecordGradient(PyObject* op_name, PyObject* inputs, PyObject* attrs,
 
 void MaybeWatchVariable(PyObject* input) {
   DCHECK(CheckResourceVariable(input));
-  DCHECK(PyObject_HasAttrString(input, "_trainable"));
+  DCHECK(PyObject_HasAttrString(input, "trainable"));
 
   tensorflow::Safe_PyObjectPtr trainable(
-      PyObject_GetAttrString(input, "_trainable"));
+      PyObject_GetAttrString(input, "trainable"));
   if (trainable.get() == Py_False) return;
   TFE_Py_TapeSetWatchVariable(input);
 }
diff --git a/tensorflow/python/keras/engine/network.py b/tensorflow/python/keras/engine/network.py
index 6db41472b6..f63ca1a207 100644
--- a/tensorflow/python/keras/engine/network.py
+++ b/tensorflow/python/keras/engine/network.py
@@ -36,9 +36,10 @@ from tensorflow.python.keras import backend
 from tensorflow.python.keras.engine import base_layer
 from tensorflow.python.keras.engine import saving
 from tensorflow.python.keras.utils import generic_utils
+from tensorflow.python.keras.utils import layer_utils
 from tensorflow.python.keras.utils import tf_utils
 from tensorflow.python.keras.utils.io_utils import ask_to_proceed_with_overwrite
-from tensorflow.python.keras.utils.layer_utils import print_summary as print_layer_summary
+from tensorflow.python.ops import variables
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training.checkpointable import base as checkpointable
 from tensorflow.python.training.checkpointable import data_structures_base
@@ -94,6 +95,11 @@ class Network(base_layer.Layer):
     self.trainable = True
     self._is_compiled = False
     self._expects_training_arg = False
+    # A list of "extra" variables assigned to attributes of this class, included
+    # in self.weights and self.variables. Always empty for graph networks (but
+    # included in base_init to avoid excessive special casing when retrieving
+    # the value).
+    self._extra_variables = []
 
     self.supports_masking = False
     if not hasattr(self, 'optimizer'):
@@ -347,11 +353,22 @@ class Network(base_layer.Layer):
       # layers). Therefore Model tracks Checkpointable objects itself.
       self._track_checkpointable(
           checkpointable=value, name=name, overwrite=True)
+      if (  # For subclassed models only, users may add extra weights/variables
+            # simply by assigning them to attributes.
+          not self._is_graph_network
+          and isinstance(value, variables.Variable)):
+        self._extra_variables.append(value)
     super(Network, self).__setattr__(name, value)
 
   def add_variable(self, name, shape, dtype=None, initializer=None,
                    regularizer=None, trainable=True, constraint=None):
-    raise NotImplementedError('`add_variable` is not supported on Networks.')
+    if self._is_graph_network:
+      raise NotImplementedError('`add_variable` is not supported on Networks.')
+    else:
+      raise NotImplementedError(
+          '`add_variable` is not supported on Networks. However, you may '
+          'assign variables to attributes and they will show up in the weights '
+          'and variables properties.')
 
   def add_loss(self, *args, **kwargs):
     if context.executing_eagerly():
@@ -589,24 +606,17 @@ class Network(base_layer.Layer):
 
   @property
   def trainable_weights(self):
-    if not self.trainable:
-      return []
-    weights = []
-    for layer in self.layers:
-      weights += layer.trainable_weights
-    return weights
+    return layer_utils.gather_trainable_weights(
+        trainable=self.trainable,
+        sub_layers=self.layers,
+        extra_variables=self._extra_variables)
 
   @property
   def non_trainable_weights(self):
-    weights = []
-    for layer in self.layers:
-      weights += layer.non_trainable_weights
-    if not self.trainable:
-      trainable_weights = []
-      for layer in self.layers:
-        trainable_weights += layer.trainable_weights
-      return trainable_weights + weights
-    return weights
+    return layer_utils.gather_non_trainable_weights(
+        trainable=self.trainable,
+        sub_layers=self.layers,
+        extra_variables=self._extra_variables)
 
   @property
   def input_spec(self):
@@ -1437,10 +1447,10 @@ class Network(base_layer.Layer):
                        'have not yet been created, so no summary can be '
                        'displayed. Build the model first '
                        '(e.g. by calling it on some data).')
-    print_layer_summary(self,
-                        line_length=line_length,
-                        positions=positions,
-                        print_fn=print_fn)
+    layer_utils.print_summary(self,
+                              line_length=line_length,
+                              positions=positions,
+                              print_fn=print_fn)
 
 
 def get_source_inputs(tensor, layer=None, node_index=None):
diff --git a/tensorflow/python/keras/model_subclassing_test.py b/tensorflow/python/keras/model_subclassing_test.py
index 558854ab97..86f7e20bec 100644
--- a/tensorflow/python/keras/model_subclassing_test.py
+++ b/tensorflow/python/keras/model_subclassing_test.py
@@ -622,6 +622,51 @@ class ModelSubclassingTest(test.TestCase):
     self.assertIs(m.isdep, m._checkpoint_dependencies[0].ref)
     self.assertEqual('notdep_var:0', m.notdep_var.name)
 
+  def test_extra_variable(self):
+
+    class ExtraVar(keras.Model):
+
+      def __init__(self):
+        super(ExtraVar, self).__init__()
+        self.dense = keras.layers.Dense(1)
+        self.var = resource_variable_ops.ResourceVariable(1.)
+        self.not_trainable_var = resource_variable_ops.ResourceVariable(
+            2., trainable=False)
+
+      def call(self, inputs):
+        return self.dense(inputs + self.var)
+
+    m = ExtraVar()
+    self.assertTrue(m.trainable)
+    self.assertEqual([m.dense], m.layers)
+    self.assertEqual([m.var, m.not_trainable_var], m.variables)
+    self.assertEqual([m.var], m.trainable_variables)
+    self.assertEqual([m.not_trainable_var], m.non_trainable_variables)
+    m.trainable = False
+    self.assertEqual([m.var, m.not_trainable_var], m.variables)
+    self.assertEqual([], m.trainable_variables)
+    self.assertEqual([m.var, m.not_trainable_var], m.non_trainable_variables)
+    m.trainable = True
+
+    m(array_ops.ones([1, 1]))
+
+    self.assertEqual([m.dense.kernel, m.dense.bias], m.dense.variables)
+    self.assertEqual([m.dense.kernel, m.dense.bias], m.dense.weights)
+
+    self.assertEqual([m.dense.kernel, m.dense.bias, m.var, m.not_trainable_var],
+                     m.variables)
+    self.assertEqual([m.dense.kernel, m.dense.bias, m.var],
+                     m.trainable_variables)
+    self.assertEqual([m.not_trainable_var], m.non_trainable_variables)
+
+    m.dense.trainable = False
+    self.assertEqual(
+        [m.var, m.dense.kernel, m.dense.bias, m.not_trainable_var],
+        m.variables)
+    self.assertEqual([m.var], m.trainable_variables)
+    self.assertEqual([m.dense.kernel, m.dense.bias, m.not_trainable_var],
+                     m.non_trainable_variables)
+
 
 class CustomCallModel(keras.Model):
 
diff --git a/tensorflow/python/keras/utils/layer_utils.py b/tensorflow/python/keras/utils/layer_utils.py
index bd61f8e9cc..88daff0461 100644
--- a/tensorflow/python/keras/utils/layer_utils.py
+++ b/tensorflow/python/keras/utils/layer_utils.py
@@ -201,6 +201,61 @@ def print_summary(model, line_length=None, positions=None, print_fn=None):
   print_fn('_' * line_length)
 
 
+def gather_trainable_weights(trainable, sub_layers, extra_variables):
+  """Lists the trainable weights for an object with sub-layers.
+
+  Args:
+    trainable: Whether the object collecting the variables is trainable.
+    sub_layers: A flat list of Layer objects owned by this object, to collect
+      variables from.
+    extra_variables: Any extra variables to include. Their `.trainable` property
+      is used to categorize them.
+
+  Returns:
+    A list of collected trainable weights/variables.
+  """
+  if not trainable:
+    return []
+  weights = []
+  for layer in sub_layers:
+    weights += layer.trainable_weights
+  trainable_extra_variables = [
+      v for v in extra_variables if v.trainable]
+  return weights + trainable_extra_variables
+
+
+def gather_non_trainable_weights(trainable, sub_layers, extra_variables):
+  """Lists the non-trainable weights for an object with sub-layers.
+
+  Args:
+    trainable: Whether the object collecting the variables is trainable.
+    sub_layers: A flat list of Layer objects owned by this object, to collect
+      variables from.
+    extra_variables: Any extra variables to include. Their `.trainable` property
+      is used to categorize them.
+
+  Returns:
+    A list of collected non-trainable weights/variables.
+  """
+  trainable_extra_variables = []
+  non_trainable_extra_variables = []
+  for v in extra_variables:
+    if v.trainable:
+      trainable_extra_variables.append(v)
+    else:
+      non_trainable_extra_variables.append(v)
+  weights = []
+  for layer in sub_layers:
+    weights += layer.non_trainable_weights
+  if not trainable:
+    trainable_weights = []
+    for layer in sub_layers:
+      trainable_weights += layer.trainable_weights
+    return (trainable_weights + trainable_extra_variables
+            + weights + non_trainable_extra_variables)
+  return weights + non_trainable_extra_variables
+
+
 @tf_export('keras.utils.convert_all_kernels_in_model')
 def convert_all_kernels_in_model(model):
   """Converts all convolution kernels in a model from Theano to TensorFlow.
diff --git a/tensorflow/python/kernel_tests/resource_variable_ops_test.py b/tensorflow/python/kernel_tests/resource_variable_ops_test.py
index 972fbdb3d6..00d517e64e 100644
--- a/tensorflow/python/kernel_tests/resource_variable_ops_test.py
+++ b/tensorflow/python/kernel_tests/resource_variable_ops_test.py
@@ -538,6 +538,25 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
       with self.assertRaises(ValueError):
         sess.run(v.initialized_value())
 
+  def testTrainableInProto(self):
+    with ops.Graph().as_default():
+      non_trainable_variable = resource_variable_ops.ResourceVariable(
+          trainable=False,
+          initial_value=constant_op.constant(10.0))
+      self.assertEqual(
+          False,
+          resource_variable_ops.ResourceVariable(
+              variable_def=non_trainable_variable.to_proto())
+          .trainable)
+      trainable_variable = resource_variable_ops.ResourceVariable(
+          trainable=True,
+          initial_value=constant_op.constant(10.0))
+      self.assertEqual(
+          True,
+          resource_variable_ops.ResourceVariable(
+              variable_def=trainable_variable.to_proto())
+          .trainable)
+
   @test_util.run_in_graph_and_eager_modes()
   def testSparseRead(self):
     with self.test_session():
diff --git a/tensorflow/python/kernel_tests/variables_test.py b/tensorflow/python/kernel_tests/variables_test.py
index 27599868b7..62d596da91 100644
--- a/tensorflow/python/kernel_tests/variables_test.py
+++ b/tensorflow/python/kernel_tests/variables_test.py
@@ -496,6 +496,23 @@ class VariablesTestCase(test.TestCase):
       with self.assertRaises(ValueError):
         sess.run(v.initialized_value())
 
+  def testTrainableInProto(self):
+    with ops.Graph().as_default():
+      non_trainable_variable = variables.Variable(
+          trainable=False,
+          initial_value=constant_op.constant(10.0))
+      self.assertEqual(
+          False,
+          variables.Variable(variable_def=non_trainable_variable.to_proto())
+          .trainable)
+      trainable_variable = variables.Variable(
+          trainable=True,
+          initial_value=constant_op.constant(10.0))
+      self.assertEqual(
+          True,
+          variables.Variable(variable_def=trainable_variable.to_proto())
+          .trainable)
+
   def testLoad(self):
     with self.test_session():
       var = variables.Variable(np.zeros((5, 5), np.float32))
diff --git a/tensorflow/python/ops/resource_variable_ops.py b/tensorflow/python/ops/resource_variable_ops.py
index e37e93ea35..7061b32808 100644
--- a/tensorflow/python/ops/resource_variable_ops.py
+++ b/tensorflow/python/ops/resource_variable_ops.py
@@ -551,6 +551,7 @@ class ResourceVariable(variables.Variable):
                                  import_scope=import_scope))
     else:
       self._initial_value = None
+    self._trainable = getattr(variable_def, "trainable", True)
     if variable_def.snapshot_name:
       snapshot = g.as_graph_element(
           ops.prepend_name_scope(
@@ -735,7 +736,7 @@ class ResourceVariable(variables.Variable):
     return self._save_slice_info
 
   def _read_variable_op(self):
-    if hasattr(self, "_trainable") and self._trainable:
+    if self.trainable:
       tape.watch_variable(self)
     return gen_resource_variable_ops.read_variable_op(self._handle,
                                                       self._dtype)
@@ -760,7 +761,7 @@ class ResourceVariable(variables.Variable):
   def sparse_read(self, indices, name=None):
     """Reads the value of this variable sparsely, using `gather`."""
     with ops.name_scope("Gather" if name is None else name) as name:
-      if self._trainable:
+      if self.trainable:
         tape.watch_variable(self)
       value = gen_resource_variable_ops.resource_gather(
           self._handle, indices, dtype=self._dtype, name=name)
@@ -801,6 +802,7 @@ class ResourceVariable(variables.Variable):
         var_def.snapshot_name = ops.strip_name_scope(self._graph_element.name,
                                                      export_scope)
       var_def.is_resource = True
+      var_def.trainable = self.trainable
       if self._save_slice_info:
         var_def.save_slice_info_def.MergeFrom(
             self._save_slice_info.to_proto(export_scope=export_scope))
@@ -913,7 +915,7 @@ class ResourceVariable(variables.Variable):
     return assign_add_op
 
   def _lazy_read(self, op):
-    if hasattr(self, "_trainable") and self._trainable:
+    if self.trainable:
       tape.watch_variable(self)
     return _UnreadVariable(
         self._handle, self.dtype, self._shape, self._in_graph_mode,
diff --git a/tensorflow/python/ops/variable_scope.py b/tensorflow/python/ops/variable_scope.py
index 8d93d24b14..fa34774622 100644
--- a/tensorflow/python/ops/variable_scope.py
+++ b/tensorflow/python/ops/variable_scope.py
@@ -1261,13 +1261,13 @@ class EagerVariableStore(object):
 
   def trainable_variables(self):
     # pylint: disable=protected-access
-    return sorted([x for x in self._store._vars.values() if x._trainable],
+    return sorted([x for x in self._store._vars.values() if x.trainable],
                   key=lambda x: x.name)
     # pylint: enable=protected-access
 
   def non_trainable_variables(self):
     # pylint: disable=protected-access
-    return sorted([x for x in self._store._vars.values() if not x._trainable],
+    return sorted([x for x in self._store._vars.values() if not x.trainable],
                   key=lambda x: x.name)
     # pylint: enable=protected-access
 
@@ -1296,7 +1296,7 @@ class EagerVariableStore(object):
       new_var = resource_variable_ops.ResourceVariable(
           var.read_value(),
           name=stripped_var_name,
-          trainable=var._trainable)
+          trainable=var.trainable)
       new_store._store._vars[key] = new_var
     return new_store
     # pylint: enable=protected-access
diff --git a/tensorflow/python/ops/variables.py b/tensorflow/python/ops/variables.py
index d88fd836f5..4be9f5eb68 100644
--- a/tensorflow/python/ops/variables.py
+++ b/tensorflow/python/ops/variables.py
@@ -341,6 +341,7 @@ class Variable(checkpointable.CheckpointableBase):
       self._update_uid = initial_value.checkpoint_position.restore_uid
       initial_value = initial_value.wrapped_value
 
+    self._trainable = trainable
     if trainable and ops.GraphKeys.TRAINABLE_VARIABLES not in collections:
       collections = list(collections) + [ops.GraphKeys.TRAINABLE_VARIABLES]
     with ops.init_scope():
@@ -450,6 +451,7 @@ class Variable(checkpointable.CheckpointableBase):
                                  import_scope=import_scope))
     else:
       self._initial_value = None
+    self._trainable = getattr(variable_def, "trainable", True)
     self._snapshot = g.as_graph_element(
         ops.prepend_name_scope(variable_def.snapshot_name,
                                import_scope=import_scope))
@@ -543,6 +545,10 @@ class Variable(checkpointable.CheckpointableBase):
     self._ref().set_shape(shape)
     self.value().set_shape(shape)
 
+  @property
+  def trainable(self):
+    return self._trainable
+
   def eval(self, session=None):
     """In a session, computes and returns the value of this variable.
 
@@ -1050,6 +1056,7 @@ class Variable(checkpointable.CheckpointableBase):
         # For backwards compatibility.
         var_def.initial_value_name = ops.strip_name_scope(
             self._initial_value.name, export_scope)
+      var_def.trainable = self.trainable
       var_def.initializer_name = ops.strip_name_scope(
           self.initializer.name, export_scope)
       var_def.snapshot_name = ops.strip_name_scope(
diff --git a/tensorflow/python/training/checkpointable/data_structures.py b/tensorflow/python/training/checkpointable/data_structures.py
index 62cefa4f20..69ed253fb2 100644
--- a/tensorflow/python/training/checkpointable/data_structures.py
+++ b/tensorflow/python/training/checkpointable/data_structures.py
@@ -22,6 +22,8 @@ import collections
 import six
 
 from tensorflow.python.keras.engine import base_layer
+from tensorflow.python.keras.utils import layer_utils
+from tensorflow.python.ops import variables
 from tensorflow.python.training.checkpointable import base as checkpointable_lib
 from tensorflow.python.training.checkpointable import data_structures_base
 
@@ -41,11 +43,14 @@ class CheckpointableDataStructure(
   def __init__(self):
     self._layers = []
     self.trainable = True
+    self._extra_variables = []
 
   def _track_value(self, value, name):
     """Add a dependency on `value`."""
     if isinstance(value, checkpointable_lib.CheckpointableBase):
       self._track_checkpointable(value, name=name)
+      if isinstance(value, variables.Variable):
+        self._extra_variables.append(value)
     else:
       raise ValueError(
           ("Only checkpointable objects (such as Layers or Optimizers) may be "
@@ -67,29 +72,30 @@ class CheckpointableDataStructure(
 
   @property
   def trainable_weights(self):
-    if not self.trainable:
-      return []
-    weights = []
-    for layer in self.layers:
-      weights += layer.trainable_weights
-    return weights
+    return layer_utils.gather_trainable_weights(
+        trainable=self.trainable,
+        sub_layers=self.layers,
+        extra_variables=self._extra_variables)
 
   @property
   def non_trainable_weights(self):
-    weights = []
-    for layer in self.layers:
-      weights += layer.non_trainable_weights
-    if not self.trainable:
-      trainable_weights = []
-      for layer in self.layers:
-        trainable_weights += layer.trainable_weights
-      return trainable_weights + weights
-    return weights
+    return layer_utils.gather_non_trainable_weights(
+        trainable=self.trainable,
+        sub_layers=self.layers,
+        extra_variables=self._extra_variables)
 
   @property
   def weights(self):
     return self.trainable_weights + self.non_trainable_weights
 
+  @property
+  def trainable_variables(self):
+    return self.trainable_weights
+
+  @property
+  def non_trainable_variables(self):
+    return self.non_trainable_weights
+
   @property
   def variables(self):
     return self.weights
diff --git a/tensorflow/python/training/checkpointable/data_structures_test.py b/tensorflow/python/training/checkpointable/data_structures_test.py
index 31a0e8b622..b05b3a8800 100644
--- a/tensorflow/python/training/checkpointable/data_structures_test.py
+++ b/tensorflow/python/training/checkpointable/data_structures_test.py
@@ -139,6 +139,25 @@ class ListTests(test.TestCase):
           outer.variables[0],
           resource_variable_ops.ResourceVariable)
 
+  def testNonLayerVariables(self):
+    v = resource_variable_ops.ResourceVariable([1.])
+    l = data_structures.List([v])
+    self.assertTrue(l.trainable)
+    self.assertEqual([], l.layers)
+    self.assertEqual([v], l.variables)
+    self.assertEqual([v], l.trainable_weights)
+    self.assertEqual([], l.non_trainable_variables)
+    l.trainable = False
+    self.assertEqual([v], l.variables)
+    self.assertEqual([], l.trainable_variables)
+    self.assertEqual([v], l.non_trainable_variables)
+    l.trainable = True
+    v2 = resource_variable_ops.ResourceVariable(1., trainable=False)
+    l.append(v2)
+    self.assertEqual([v, v2], l.weights)
+    self.assertEqual([v], l.trainable_weights)
+    self.assertEqual([v2], l.non_trainable_weights)
+
   def testHashing(self):
     has_sequences = set([data_structures.List(),
                          data_structures.List()])
diff --git a/tensorflow/tools/api/golden/tensorflow.-variable.pbtxt b/tensorflow/tools/api/golden/tensorflow.-variable.pbtxt
index 8c8912dfab..23b552cc38 100644
--- a/tensorflow/tools/api/golden/tensorflow.-variable.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.-variable.pbtxt
@@ -43,6 +43,10 @@ tf_class {
     name: "shape"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "trainable"
+    mtype: "<type \'property\'>"
+  }
   member_method {
     name: "__init__"
     argspec: "args=[\'self\', \'initial_value\', \'trainable\', \'collections\', \'validate_shape\', \'caching_device\', \'name\', \'variable_def\', \'dtype\', \'expected_shape\', \'import_scope\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'True\', \'None\', \'True\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
-- 
GitLab


From f33d551ea6ed6a46c70cafd3a567933fe1159ddf Mon Sep 17 00:00:00 2001
From: Nick Felt <nickfelt@google.com>
Date: Wed, 30 May 2018 19:27:26 -0700
Subject: [PATCH 092/610] Add GCS_READ_CACHE_DISABLED explicit env var to
 GcsFileSystem

PiperOrigin-RevId: 198658074
---
 tensorflow/core/platform/cloud/gcs_file_system.cc     | 8 ++++++++
 tensorflow/core/platform/cloud/ram_file_block_cache.h | 2 ++
 2 files changed, 10 insertions(+)

diff --git a/tensorflow/core/platform/cloud/gcs_file_system.cc b/tensorflow/core/platform/cloud/gcs_file_system.cc
index d3a1489b9c..22ae6121e0 100644
--- a/tensorflow/core/platform/cloud/gcs_file_system.cc
+++ b/tensorflow/core/platform/cloud/gcs_file_system.cc
@@ -64,6 +64,10 @@ constexpr uint64 HTTP_CODE_RESUME_INCOMPLETE = 308;
 // The environment variable that overrides the size of the readahead buffer.
 // DEPRECATED. Use GCS_BLOCK_SIZE_MB instead.
 constexpr char kReadaheadBufferSize[] = "GCS_READAHEAD_BUFFER_SIZE_BYTES";
+// The environment variable that disables the GCS block cache for reads.
+// This is the explicit alternative to setting BLOCK_SIZE or MAX_SIZE to 0, and
+// takes precedence over either of those environment variables.
+constexpr char kReadCacheDisabled[] = "GCS_READ_CACHE_DISABLED";
 // The environment variable that overrides the block size for aligned reads from
 // GCS. Specified in MB (e.g. "16" = 16 x 1024 x 1024 = 16777216 bytes).
 constexpr char kBlockSize[] = "GCS_READ_CACHE_BLOCK_SIZE_MB";
@@ -623,6 +627,10 @@ GcsFileSystem::GcsFileSystem()
   if (GetEnvVar(kMaxStaleness, strings::safe_strtou64, &value)) {
     max_staleness = value;
   }
+  if (std::getenv(kReadCacheDisabled)) {
+    // Setting either to 0 disables the cache; set both for good measure.
+    block_size = max_bytes = 0;
+  }
   file_block_cache_ = MakeFileBlockCache(block_size, max_bytes, max_staleness);
   // Apply overrides for the stat cache max age and max entries, if provided.
   uint64 stat_cache_max_age = kStatCacheDefaultMaxAge;
diff --git a/tensorflow/core/platform/cloud/ram_file_block_cache.h b/tensorflow/core/platform/cloud/ram_file_block_cache.h
index 2303f9caaa..46fb9a35b8 100644
--- a/tensorflow/core/platform/cloud/ram_file_block_cache.h
+++ b/tensorflow/core/platform/cloud/ram_file_block_cache.h
@@ -60,6 +60,8 @@ class RamFileBlockCache : public FileBlockCache {
       pruning_thread_.reset(env_->StartThread(ThreadOptions(), "TF_prune_FBC",
                                               [this] { Prune(); }));
     }
+    VLOG(1) << "GCS file block cache is "
+            << (IsCacheEnabled() ? "enabled" : "disabled");
   }
 
   ~RamFileBlockCache() override {
-- 
GitLab


From 52a21f5df5ba0c7eeae91e4f818a6f2b989734cb Mon Sep 17 00:00:00 2001
From: Jingyue Wu <jingyue@google.com>
Date: Wed, 30 May 2018 22:00:32 -0700
Subject: [PATCH 093/610] Improve ReshapeIsIdentity to work with symbolic
 shapes.

For example, with this CL, ArithmeticOptimizer can optimize the Reshape below
into a no-op.

  s = Shape(t)
  Reshape(t, Concat(s[0], s[1], s[2], s[3]))

PiperOrigin-RevId: 198668726
---
 .../optimizers/arithmetic_optimizer.cc        | 35 +---------------
 .../optimizers/arithmetic_optimizer_test.cc   | 40 +++++++++++++++++++
 2 files changed, 41 insertions(+), 34 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
index 9c18c45f18..e7f385cbd6 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
@@ -209,40 +209,7 @@ bool ReshapeIsIdentity(const NodeDef& reshape, const NodeDef& input,
     return false;
   }
 
-  const PartialTensorShape& src_shape = input_props[output_pos].shape();
-  const PartialTensorShape& dst_shape = reshape_props[0].shape();
-
-  if (src_shape.unknown_rank() || dst_shape.unknown_rank()) {
-    return false;
-  }
-
-  if (!dst_shape.IsCompatibleWith(src_shape)) {
-    return false;
-  }
-
-  // Returns false when src_shape or dst_shape has >=2 dimensions with unknown
-  // sizes.
-  auto num_unknown_dim_sizes = [](const PartialTensorShape& partial_shape) {
-    auto dim_sizes = partial_shape.dim_sizes();
-    return std::count_if(dim_sizes.begin(), dim_sizes.end(),
-                         [](int dim) { return dim < 0; });
-  };
-  int src_num_unknown_dim_sizes = num_unknown_dim_sizes(src_shape);
-  int dst_num_unknown_dim_sizes = num_unknown_dim_sizes(dst_shape);
-  if (src_num_unknown_dim_sizes > 1 || dst_num_unknown_dim_sizes > 1) {
-    return false;
-  }
-
-  // If dst_num_unknown_dim_sizes != src_num_unknown_dim_sizes we would weaken
-  // shape inference in subsequent passes if we removed this reshape.
-  if (src_num_unknown_dim_sizes != dst_num_unknown_dim_sizes) {
-    return false;
-  }
-
-  // Remove the reshape if both are fully defined or partially defined and the
-  // unknown or symbolic shape appears on the same dimension, i.e., if
-  // IsIdenticalTo returns true.
-  return dst_shape.IsIdenticalTo(src_shape);
+  return ShapesSymbolicallyEqual(input_props[output_pos], reshape_props[0]);
 }
 
 NodeDef* GetTailOfValuePreservingChain(
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
index a908416e45..f678ea7227 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
@@ -989,6 +989,46 @@ TEST_F(ArithmeticOptimizerTest, IdentityReshape) {
   test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
 }
 
+TEST_F(ArithmeticOptimizerTest, IdentityReshapeBetweenSymbolicShapes) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output inputs =
+      ops::Placeholder(s, DT_FLOAT, ops::Placeholder::Shape({-1, 3, -1, -1}));
+  Output inputs_shape = ops::Shape(s, inputs);
+  // The target shape of the reshape is the concatenation of `batch_size`, 3,
+  // `height`, and `width`.
+  Output batch_size = ops::Slice(s, inputs_shape, ops::Const(s, {0}, {1}),
+                                 ops::Const(s, {1}, {1}));
+  Output height = ops::Slice(s, inputs_shape, ops::Const(s, {2}, {1}),
+                             ops::Const(s, {1}, {1}));
+  Output width = ops::Slice(s, inputs_shape, ops::Const(s, {3}, {1}),
+                            ops::Const(s, {1}, {1}));
+  Output target_shape =
+      ops::Concat(s.WithOpName("target_shape"),
+                  {batch_size, ops::Const(s, {3}, {1}), height, width},
+                  ops::Const(s, {0}, {}));
+  Output reshape = ops::Reshape(s, inputs, target_shape);
+  Output outputs = ops::Identity(s.WithOpName("outputs"), reshape);
+
+  GrapplerItem item;
+  item.fetch = {"outputs"};
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  auto x_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({3, 3, 28, 28}));
+  auto tensors_expected =
+      EvaluateNodes(item.graph, item.fetch, {{"Placeholder", x_t}});
+  EXPECT_EQ(1, tensors_expected.size());
+  GraphDef output;
+  TF_EXPECT_OK(ArithmeticOptimizer(RewriterConfig::AGGRESSIVE)
+                   .Optimize(nullptr, item, &output));
+
+  item.graph.Swap(&output);
+  TF_EXPECT_OK(ModelPruner().Optimize(nullptr, item, &output));
+
+  EXPECT_EQ(0, CountOpNodes(output, "Reshape"));
+  auto tensors = EvaluateNodes(output, item.fetch, {{"Placeholder", x_t}});
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
+}
+
 TEST_F(ArithmeticOptimizerTest, NotAssumeValidFeeds) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   Output inputs =
-- 
GitLab


From ca4bda919793cc2578e5c0f7440525261da16fdf Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 30 May 2018 22:03:16 -0700
Subject: [PATCH 094/610] [XLA] Redesign: delete the old service interface. -
 Computation - ComputeConstant - Execute - ExecuteAsync - ExecuteParallel -
 GetComputationStats - GetComputationShape - GetLocalShape - IsConstant -
 LoadComputationSnapshot - Op - SetReturnValue - SnapshotComputation

PiperOrigin-RevId: 198669035
---
 tensorflow/compiler/xla/client/client.h       |   2 -
 .../compiler/xla/client/xla_client/BUILD      |   1 -
 tensorflow/compiler/xla/rpc/grpc_service.cc   |  88 ---
 tensorflow/compiler/xla/rpc/grpc_service.h    |  47 --
 tensorflow/compiler/xla/rpc/grpc_stub.cc      |  93 ---
 tensorflow/compiler/xla/rpc/grpc_stub.h       |  39 -
 tensorflow/compiler/xla/rpc/xla_service.proto |  60 --
 .../xla/service/compile_only_service.cc       |  52 --
 .../xla/service/compile_only_service.h        |  33 -
 .../compiler/xla/service/local_service.cc     |  64 --
 .../compiler/xla/service/local_service.h      |  12 -
 tensorflow/compiler/xla/service/service.cc    | 704 ------------------
 tensorflow/compiler/xla/service/service.h     |  76 --
 tensorflow/compiler/xla/service_interface.h   |  41 -
 14 files changed, 1312 deletions(-)

diff --git a/tensorflow/compiler/xla/client/client.h b/tensorflow/compiler/xla/client/client.h
index cda8a71f71..68f0d0ac78 100644
--- a/tensorflow/compiler/xla/client/client.h
+++ b/tensorflow/compiler/xla/client/client.h
@@ -153,8 +153,6 @@ class Client {
   //
   // If output_layout is non-null, then the output of the computation will be
   // stored using that layout.
-  //
-  // TODO(b/74197823): This is a part of a NOT YET ready refactor.
   StatusOr<std::unique_ptr<Literal>> ComputeConstant(
       const XlaComputation& computation,
       const Layout* output_layout = nullptr) const;
diff --git a/tensorflow/compiler/xla/client/xla_client/BUILD b/tensorflow/compiler/xla/client/xla_client/BUILD
index 0d6e207971..507a2dc5f0 100644
--- a/tensorflow/compiler/xla/client/xla_client/BUILD
+++ b/tensorflow/compiler/xla/client/xla_client/BUILD
@@ -37,7 +37,6 @@ cc_library(
     ],
 )
 
-# TODO(b/74197823): Replace computation_builder with xla_builder.
 cc_library(
     name = "xla_builder",
     srcs = ["xla_builder.cc"],
diff --git a/tensorflow/compiler/xla/rpc/grpc_service.cc b/tensorflow/compiler/xla/rpc/grpc_service.cc
index 5f4dc6bd08..4e1435fa30 100644
--- a/tensorflow/compiler/xla/rpc/grpc_service.cc
+++ b/tensorflow/compiler/xla/rpc/grpc_service.cc
@@ -32,19 +32,6 @@ namespace xla {
   return tensorflow::ToGrpcStatus(s);
 }
 
-::grpc::Status GRPCService::Computation(::grpc::ServerContext* context,
-                                        const ComputationRequest* arg,
-                                        ComputationResponse* result) {
-  return DelegateRPC(
-      [this, arg, result]() { return service_->Computation(arg, result); });
-}
-
-::grpc::Status GRPCService::CreateOp(::grpc::ServerContext* context,
-                                     const OpRequest* arg, OpResponse* result) {
-  return DelegateRPC(
-      [this, arg, result]() { return service_->Op(arg, result); });
-}
-
 ::grpc::Status GRPCService::Unregister(::grpc::ServerContext* context,
                                        const UnregisterRequest* arg,
                                        UnregisterResponse* result) {
@@ -60,21 +47,6 @@ namespace xla {
   });
 }
 
-::grpc::Status GRPCService::SetReturnValue(::grpc::ServerContext* context,
-                                           const SetReturnValueRequest* arg,
-                                           SetReturnValueResponse* results) {
-  return DelegateRPC([this, arg, results]() {
-    return service_->SetReturnValue(arg, results);
-  });
-}
-
-::grpc::Status GRPCService::Execute(::grpc::ServerContext* context,
-                                    const ExecuteRequest* arg,
-                                    ExecuteResponse* result) {
-  return DelegateRPC(
-      [this, arg, result]() { return service_->Execute(arg, result); });
-}
-
 ::grpc::Status GRPCService::ExecuteGraph(::grpc::ServerContext* /*context*/,
                                          const ExecuteGraphRequest* arg,
                                          ExecuteResponse* result) {
@@ -82,13 +54,6 @@ namespace xla {
       [this, arg, result]() { return service_->ExecuteGraph(arg, result); });
 }
 
-::grpc::Status GRPCService::ExecuteAsync(::grpc::ServerContext* context,
-                                         const ExecuteAsyncRequest* arg,
-                                         ExecuteAsyncResponse* result) {
-  return DelegateRPC(
-      [this, arg, result]() { return service_->ExecuteAsync(arg, result); });
-}
-
 ::grpc::Status GRPCService::WaitForExecution(::grpc::ServerContext* context,
                                              const WaitForExecutionRequest* arg,
                                              WaitForExecutionResponse* result) {
@@ -136,20 +101,6 @@ namespace xla {
       [this, arg, result]() { return service_->ResetDevice(arg, result); });
 }
 
-::grpc::Status GRPCService::IsConstant(::grpc::ServerContext* context,
-                                       const IsConstantRequest* arg,
-                                       IsConstantResponse* result) {
-  return DelegateRPC(
-      [this, arg, result]() { return service_->IsConstant(arg, result); });
-}
-
-::grpc::Status GRPCService::ComputeConstant(::grpc::ServerContext* context,
-                                            const ComputeConstantRequest* arg,
-                                            ComputeConstantResponse* result) {
-  return DelegateRPC(
-      [this, arg, result]() { return service_->ComputeConstant(arg, result); });
-}
-
 ::grpc::Status GRPCService::GetShape(::grpc::ServerContext* context,
                                      const GetShapeRequest* arg,
                                      GetShapeResponse* result) {
@@ -157,43 +108,4 @@ namespace xla {
       [this, arg, result]() { return service_->GetShape(arg, result); });
 }
 
-::grpc::Status GRPCService::GetComputationShape(
-    ::grpc::ServerContext* context, const GetComputationShapeRequest* arg,
-    GetComputationShapeResponse* result) {
-  return DelegateRPC([this, arg, result]() {
-    return service_->GetComputationShape(arg, result);
-  });
-}
-
-::grpc::Status GRPCService::GetLocalShape(::grpc::ServerContext* context,
-                                          const GetLocalShapeRequest* arg,
-                                          GetLocalShapeResponse* result) {
-  return DelegateRPC(
-      [this, arg, result]() { return service_->GetLocalShape(arg, result); });
-}
-
-::grpc::Status GRPCService::GetComputationStats(
-    ::grpc::ServerContext* context, const ComputationStatsRequest* arg,
-    ComputationStatsResponse* result) {
-  return DelegateRPC([this, arg, result]() {
-    return service_->GetComputationStats(arg, result);
-  });
-}
-
-::grpc::Status GRPCService::SnapshotComputation(
-    ::grpc::ServerContext* context, const SnapshotComputationRequest* arg,
-    SnapshotComputationResponse* result) {
-  return DelegateRPC([this, arg, result]() {
-    return service_->SnapshotComputation(arg, result);
-  });
-}
-
-::grpc::Status GRPCService::LoadComputationSnapshot(
-    ::grpc::ServerContext* context, const LoadComputationSnapshotRequest* arg,
-    LoadComputationSnapshotResponse* result) {
-  return DelegateRPC([this, arg, result]() {
-    return service_->LoadComputationSnapshot(arg, result);
-  });
-}
-
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/rpc/grpc_service.h b/tensorflow/compiler/xla/rpc/grpc_service.h
index 50f02796f2..5cd573167a 100644
--- a/tensorflow/compiler/xla/rpc/grpc_service.h
+++ b/tensorflow/compiler/xla/rpc/grpc_service.h
@@ -31,13 +31,6 @@ class GRPCService : public grpc::XlaService::Service {
   static StatusOr<std::unique_ptr<GRPCService>> NewService(
       se::Platform* platform = nullptr);
 
-  ::grpc::Status Computation(::grpc::ServerContext* context,
-                             const ComputationRequest* arg,
-                             ComputationResponse* result) override;
-
-  ::grpc::Status CreateOp(::grpc::ServerContext* context, const OpRequest* arg,
-                          OpResponse* result) override;
-
   ::grpc::Status Unregister(::grpc::ServerContext* context,
                             const UnregisterRequest* arg,
                             UnregisterResponse* result) override;
@@ -46,22 +39,10 @@ class GRPCService : public grpc::XlaService::Service {
                                   const DeconstructTupleRequest* arg,
                                   DeconstructTupleResponse* result) override;
 
-  ::grpc::Status SetReturnValue(::grpc::ServerContext* context,
-                                const SetReturnValueRequest* arg,
-                                SetReturnValueResponse* results) override;
-
-  ::grpc::Status Execute(::grpc::ServerContext* context,
-                         const ExecuteRequest* arg,
-                         ExecuteResponse* result) override;
-
   ::grpc::Status ExecuteGraph(::grpc::ServerContext* context,
                               const ExecuteGraphRequest* arg,
                               ExecuteResponse* result) override;
 
-  ::grpc::Status ExecuteAsync(::grpc::ServerContext* context,
-                              const ExecuteAsyncRequest* arg,
-                              ExecuteAsyncResponse* result) override;
-
   ::grpc::Status WaitForExecution(::grpc::ServerContext* context,
                                   const WaitForExecutionRequest* arg,
                                   WaitForExecutionResponse* result) override;
@@ -86,38 +67,10 @@ class GRPCService : public grpc::XlaService::Service {
                              const ResetDeviceRequest* arg,
                              ResetDeviceResponse* result) override;
 
-  ::grpc::Status IsConstant(::grpc::ServerContext* context,
-                            const IsConstantRequest* arg,
-                            IsConstantResponse* result) override;
-
-  ::grpc::Status ComputeConstant(::grpc::ServerContext* context,
-                                 const ComputeConstantRequest* arg,
-                                 ComputeConstantResponse* result) override;
-
   ::grpc::Status GetShape(::grpc::ServerContext* context,
                           const GetShapeRequest* arg,
                           GetShapeResponse* result) override;
 
-  ::grpc::Status GetComputationShape(
-      ::grpc::ServerContext* context, const GetComputationShapeRequest* arg,
-      GetComputationShapeResponse* result) override;
-
-  ::grpc::Status GetLocalShape(::grpc::ServerContext* context,
-                               const GetLocalShapeRequest* arg,
-                               GetLocalShapeResponse* result) override;
-
-  ::grpc::Status GetComputationStats(::grpc::ServerContext* context,
-                                     const ComputationStatsRequest* arg,
-                                     ComputationStatsResponse* result) override;
-
-  ::grpc::Status SnapshotComputation(
-      ::grpc::ServerContext* context, const SnapshotComputationRequest* arg,
-      SnapshotComputationResponse* result) override;
-
-  ::grpc::Status LoadComputationSnapshot(
-      ::grpc::ServerContext* context, const LoadComputationSnapshotRequest* arg,
-      LoadComputationSnapshotResponse* result) override;
-
  private:
   std::unique_ptr<::xla::Service> service_;
 
diff --git a/tensorflow/compiler/xla/rpc/grpc_stub.cc b/tensorflow/compiler/xla/rpc/grpc_stub.cc
index 620ac6cec4..7b8ab158e1 100644
--- a/tensorflow/compiler/xla/rpc/grpc_stub.cc
+++ b/tensorflow/compiler/xla/rpc/grpc_stub.cc
@@ -62,21 +62,6 @@ Status GRPCStub::ResetDevice(const ResetDeviceRequest* request,
   });
 }
 
-Status GRPCStub::LoadComputationSnapshot(
-    const LoadComputationSnapshotRequest* request,
-    LoadComputationSnapshotResponse* response) {
-  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
-    return grpc_stub_->LoadComputationSnapshot(context, *request, response);
-  });
-}
-
-Status GRPCStub::Execute(const ExecuteRequest* request,
-                         ExecuteResponse* response) {
-  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
-    return grpc_stub_->Execute(context, *request, response);
-  });
-}
-
 Status GRPCStub::ExecuteGraph(const ExecuteGraphRequest* request,
                               ExecuteResponse* response) {
   return MakeRPC([this, request, response](::grpc::ClientContext* context) {
@@ -84,13 +69,6 @@ Status GRPCStub::ExecuteGraph(const ExecuteGraphRequest* request,
   });
 }
 
-Status GRPCStub::ExecuteParallel(const ExecuteParallelRequest* request,
-                                 ExecuteParallelResponse* response) {
-  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
-    return grpc_stub_->ExecuteParallel(context, *request, response);
-  });
-}
-
 Status GRPCStub::ExecuteGraphParallel(
     const ExecuteGraphParallelRequest* request,
     ExecuteParallelResponse* response) {
@@ -99,13 +77,6 @@ Status GRPCStub::ExecuteGraphParallel(
   });
 }
 
-Status GRPCStub::ExecuteAsync(const ExecuteAsyncRequest* request,
-                              ExecuteAsyncResponse* response) {
-  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
-    return grpc_stub_->ExecuteAsync(context, *request, response);
-  });
-}
-
 Status GRPCStub::WaitForExecution(const WaitForExecutionRequest* request,
                                   WaitForExecutionResponse* response) {
   return MakeRPC([this, request, response](::grpc::ClientContext* context) {
@@ -120,13 +91,6 @@ Status GRPCStub::DeconstructTuple(const DeconstructTupleRequest* request,
   });
 }
 
-Status GRPCStub::GetComputationStats(const ComputationStatsRequest* request,
-                                     ComputationStatsResponse* response) {
-  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
-    return grpc_stub_->GetComputationStats(context, *request, response);
-  });
-}
-
 Status GRPCStub::GetComputationGraphStats(
     const ComputationGraphStatsRequest* request,
     ComputationStatsResponse* response) {
@@ -135,13 +99,6 @@ Status GRPCStub::GetComputationGraphStats(
   });
 }
 
-Status GRPCStub::GetComputationShape(const GetComputationShapeRequest* request,
-                                     GetComputationShapeResponse* response) {
-  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
-    return grpc_stub_->GetComputationShape(context, *request, response);
-  });
-}
-
 Status GRPCStub::GetShape(const GetShapeRequest* request,
                           GetShapeResponse* response) {
   return MakeRPC([this, request, response](::grpc::ClientContext* context) {
@@ -163,48 +120,6 @@ Status GRPCStub::CreateChannelHandle(const CreateChannelHandleRequest* request,
   });
 }
 
-// Methods used by ComputationBuilder.
-Status GRPCStub::Computation(const ComputationRequest* request,
-                             ComputationResponse* response) {
-  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
-    return grpc_stub_->Computation(context, *request, response);
-  });
-}
-
-Status GRPCStub::Op(const OpRequest* request, OpResponse* response) {
-  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
-    return grpc_stub_->CreateOp(context, *request, response);
-  });
-}
-
-Status GRPCStub::GetLocalShape(const GetLocalShapeRequest* request,
-                               GetLocalShapeResponse* response) {
-  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
-    return grpc_stub_->GetLocalShape(context, *request, response);
-  });
-}
-
-Status GRPCStub::SetReturnValue(const SetReturnValueRequest* request,
-                                SetReturnValueResponse* responses) {
-  return MakeRPC([this, request, responses](::grpc::ClientContext* context) {
-    return grpc_stub_->SetReturnValue(context, *request, responses);
-  });
-}
-
-Status GRPCStub::IsConstant(const IsConstantRequest* request,
-                            IsConstantResponse* response) {
-  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
-    return grpc_stub_->IsConstant(context, *request, response);
-  });
-}
-
-Status GRPCStub::ComputeConstant(const ComputeConstantRequest* request,
-                                 ComputeConstantResponse* response) {
-  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
-    return grpc_stub_->ComputeConstant(context, *request, response);
-  });
-}
-
 Status GRPCStub::ComputeConstantGraph(
     const ComputeConstantGraphRequest* request,
     ComputeConstantResponse* response) {
@@ -213,14 +128,6 @@ Status GRPCStub::ComputeConstantGraph(
   });
 }
 
-// Methods used by Computation.
-Status GRPCStub::SnapshotComputation(const SnapshotComputationRequest* request,
-                                     SnapshotComputationResponse* response) {
-  return MakeRPC([this, request, response](::grpc::ClientContext* context) {
-    return grpc_stub_->SnapshotComputation(context, *request, response);
-  });
-}
-
 // Methods used by GlobalData.
 Status GRPCStub::Unregister(const UnregisterRequest* request,
                             UnregisterResponse* response) {
diff --git a/tensorflow/compiler/xla/rpc/grpc_stub.h b/tensorflow/compiler/xla/rpc/grpc_stub.h
index 5906d45769..8dfcb76138 100644
--- a/tensorflow/compiler/xla/rpc/grpc_stub.h
+++ b/tensorflow/compiler/xla/rpc/grpc_stub.h
@@ -43,39 +43,21 @@ class GRPCStub : public ServiceInterface {
   Status ResetDevice(const ResetDeviceRequest* arg,
                      ResetDeviceResponse* result) override;
 
-  Status LoadComputationSnapshot(
-      const LoadComputationSnapshotRequest* request,
-      LoadComputationSnapshotResponse* result) override;
-
-  Status Execute(const ExecuteRequest* arg, ExecuteResponse* result) override;
-
   Status ExecuteGraph(const ExecuteGraphRequest* request,
                       ExecuteResponse* response) override;
 
-  Status ExecuteParallel(const ExecuteParallelRequest* arg,
-                         ExecuteParallelResponse* result) override;
-
   Status ExecuteGraphParallel(const ExecuteGraphParallelRequest* request,
                               ExecuteParallelResponse* response) override;
 
-  Status ExecuteAsync(const ExecuteAsyncRequest* arg,
-                      ExecuteAsyncResponse* result) override;
-
   Status WaitForExecution(const WaitForExecutionRequest* arg,
                           WaitForExecutionResponse* result) override;
 
   Status DeconstructTuple(const DeconstructTupleRequest* arg,
                           DeconstructTupleResponse* result) override;
 
-  Status GetComputationStats(const ComputationStatsRequest* arg,
-                             ComputationStatsResponse* result) override;
-
   Status GetComputationGraphStats(const ComputationGraphStatsRequest* request,
                                   ComputationStatsResponse* response) override;
 
-  Status GetComputationShape(const GetComputationShapeRequest* arg,
-                             GetComputationShapeResponse* result) override;
-
   Status GetShape(const GetShapeRequest* arg,
                   GetShapeResponse* result) override;
 
@@ -85,30 +67,9 @@ class GRPCStub : public ServiceInterface {
   Status CreateChannelHandle(const CreateChannelHandleRequest* arg,
                              CreateChannelHandleResponse* result) override;
 
-  // Methods used by ComputationBuilder.
-  Status Computation(const ComputationRequest* arg,
-                     ComputationResponse* result) override;
-
-  Status Op(const OpRequest* arg, OpResponse* result) override;
-  Status GetLocalShape(const GetLocalShapeRequest* arg,
-                       GetLocalShapeResponse* result) override;
-
-  Status SetReturnValue(const SetReturnValueRequest* arg,
-                        SetReturnValueResponse* results) override;
-
-  Status IsConstant(const IsConstantRequest* arg,
-                    IsConstantResponse* result) override;
-
-  Status ComputeConstant(const ComputeConstantRequest* arg,
-                         ComputeConstantResponse* result) override;
-
   Status ComputeConstantGraph(const ComputeConstantGraphRequest* arg,
                               ComputeConstantResponse* result) override;
 
-  // Methods used by Computation.
-  Status SnapshotComputation(const SnapshotComputationRequest* ag,
-                             SnapshotComputationResponse* result) override;
-
   // Methods used by GlobalData.
   Status Unregister(const UnregisterRequest* arg,
                     UnregisterResponse* result) override;
diff --git a/tensorflow/compiler/xla/rpc/xla_service.proto b/tensorflow/compiler/xla/rpc/xla_service.proto
index c47164ee1b..92eb19ec0f 100644
--- a/tensorflow/compiler/xla/rpc/xla_service.proto
+++ b/tensorflow/compiler/xla/rpc/xla_service.proto
@@ -75,19 +75,7 @@ service XlaService {
   rpc GetShape(GetShapeRequest) returns (GetShapeResponse) {
   }
 
-  // Requests the program shape of the referenced computation.
-  rpc GetComputationShape(GetComputationShapeRequest)
-      returns (GetComputationShapeResponse) {
-  }
-
   // Requests the statistics of the given computation.
-  rpc GetComputationStats(ComputationStatsRequest)
-      returns (ComputationStatsResponse) {
-  }
-
-  // Requests the statistics of the given computation.
-  //
-  // TODO(b/74197823): This is a part of a NOT YET ready refactor.
   rpc GetComputationGraphStats(ComputationGraphStatsRequest)
       returns (ComputationStatsResponse) {
   }
@@ -121,15 +109,6 @@ service XlaService {
   rpc ResetDevice(ResetDeviceRequest) returns (ResetDeviceResponse) {
   }
 
-  // Tests if an expression is a compile-time constant.
-  rpc IsConstant(IsConstantRequest) returns (IsConstantResponse) {
-  }
-
-  // Computes the value of a constant expression.
-  rpc ComputeConstant(ComputeConstantRequest)
-      returns (ComputeConstantResponse) {
-  }
-
   // Computes the value of a constant expression. The request contains the
   // computation graph for the constant expression.
   rpc ComputeConstantGraph(ComputeConstantGraphRequest)
@@ -165,20 +144,6 @@ service XlaService {
   rpc SetReturnValue(SetReturnValueRequest) returns (SetReturnValueResponse) {
   }
 
-  // Computation creates a new computation with the given name.
-  // A unique ComputationHandle is returned.
-  rpc Computation(ComputationRequest) returns (ComputationResponse) {
-  }
-
-  // Adds a new op to a computation.
-  rpc CreateOp(OpRequest) returns (OpResponse) {
-  }
-
-  // Invokes the provided computation with the provided global data passed as
-  // immutable arguments. Returns global data output and execution timing.
-  rpc Execute(ExecuteRequest) returns (ExecuteResponse) {
-  }
-
   // Invokes the provided computation with the provided global data passed as
   // immutable arguments. The request contains the whole computation graph.
   // Returns global data output and execution timing.
@@ -188,38 +153,13 @@ service XlaService {
   // Invokes the provided list of computations in parallel with the provided
   // global data for each computation. Returns a list of global data output and
   // execution timing.
-  rpc ExecuteParallel(ExecuteParallelRequest)
-      returns (ExecuteParallelResponse) {
-  }
-
-  // Invokes the provided list of computations in parallel with the provided
-  // global data for each computation. Returns a list of global data output and
-  // execution timing.
-  //
-  // TODO(b/74197823): This is a part of a NOT YET ready refactor.
   rpc ExecuteGraphParallel(ExecuteGraphParallelRequest)
       returns (ExecuteParallelResponse) {
   }
 
-  // Invokes the provided computation with the provided global data passed as
-  // immutable arguments. Returns a handle to the execution.
-  rpc ExecuteAsync(ExecuteAsyncRequest) returns (ExecuteAsyncResponse) {
-  }
-
   // Waits until the given execution (aysnchronously launched) is complete, and
   // returns the global data output.
   rpc WaitForExecution(WaitForExecutionRequest)
       returns (WaitForExecutionResponse) {
   }
-
-  // Serializes a computation to proto form, so it can be loaded via
-  // LoadComputationSnapshot.
-  rpc SnapshotComputation(SnapshotComputationRequest)
-      returns (SnapshotComputationResponse) {
-  }
-
-  // Loads a computation from a captured snapshot.
-  rpc LoadComputationSnapshot(LoadComputationSnapshotRequest)
-      returns (LoadComputationSnapshotResponse) {
-  }
 }
diff --git a/tensorflow/compiler/xla/service/compile_only_service.cc b/tensorflow/compiler/xla/service/compile_only_service.cc
index d39fd7307a..c2e698a49f 100644
--- a/tensorflow/compiler/xla/service/compile_only_service.cc
+++ b/tensorflow/compiler/xla/service/compile_only_service.cc
@@ -104,56 +104,4 @@ CompileOnlyService::CompileAheadOfTime(
   return compiler_->CompileAheadOfTime(std::move(hlo_modules), options);
 }
 
-StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
-CompileOnlyService::CompileAheadOfTime(
-    const tensorflow::gtl::ArraySlice<AotComputationInstance> computations,
-    const AotCompilationOptions& options) {
-  std::vector<std::unique_ptr<HloModule>> hlo_modules;
-  for (const AotComputationInstance& instance : computations) {
-    TF_ASSIGN_OR_RETURN(UserComputation * user_computation,
-                        computation_tracker_.Resolve(instance.computation));
-    VersionedComputationHandle versioned_handle =
-        user_computation->GetVersionedHandle();
-
-    const DebugOptions& debug_options = options.debug_options();
-
-    // Dump computation proto state if flag is set.
-    const string& directory_path = debug_options.xla_dump_computations_to();
-    if (!directory_path.empty()) {
-      TF_ASSIGN_OR_RETURN(
-          std::unique_ptr<SessionModule> session_module,
-          computation_tracker_.SnapshotComputation(versioned_handle.handle));
-      string filename = tensorflow::strings::StrCat(
-          "computation_", versioned_handle.handle.handle(), "__",
-          session_module->entry().name(), "__version_",
-          versioned_handle.version);
-      const string& per_host_path = tensorflow::io::JoinPath(
-          directory_path, tensorflow::port::Hostname());
-
-      TF_RETURN_IF_ERROR(Executable::DumpToDirectory(per_host_path, filename,
-                                                     *session_module));
-    }
-
-    TF_ASSIGN_OR_RETURN(
-        std::shared_ptr<const ProgramShape> program_shape,
-        user_computation->ComputeProgramShape(versioned_handle.version));
-
-    ExecutionOptions execution_options;
-    *execution_options.mutable_debug_options() = debug_options;
-    TF_ASSIGN_OR_RETURN(
-        std::unique_ptr<HloModuleConfig> module_config,
-        CreateModuleConfig(*program_shape, instance.argument_layouts,
-                           &execution_options, user_computation));
-
-    TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModule> hlo_module,
-                        computation_tracker_.BuildHloModule(
-                            versioned_handle, *module_config,
-                            /*include_unreachable_instructions=*/true));
-    TF_RETURN_IF_ERROR(MaybeDumpHloModule(*hlo_module));
-    hlo_modules.push_back(std::move(hlo_module));
-  }
-
-  return compiler_->CompileAheadOfTime(std::move(hlo_modules), options);
-}
-
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/compile_only_service.h b/tensorflow/compiler/xla/service/compile_only_service.h
index 7f2ce0e897..e6a66c202d 100644
--- a/tensorflow/compiler/xla/service/compile_only_service.h
+++ b/tensorflow/compiler/xla/service/compile_only_service.h
@@ -38,24 +38,7 @@ class CompileOnlyService : public Service {
   static StatusOr<std::unique_ptr<CompileOnlyService>> NewService(
       const ServiceOptions& options);
 
-  // A description of a computation to compile using CompileAheadOfTime.
-  struct AotComputationInstance {
-    ComputationHandle computation;
-    std::vector<const Shape*> argument_layouts;
-    const Shape* result_layout = nullptr;
-  };
-
-  // Compiles a list of computations for ahead-of-time execution.  This is
-  // intended for use in static compilation.  See
-  // |CompileOnlyClient::CompileAheadOfTime| for additional details.
-  StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
-  CompileAheadOfTime(
-      const tensorflow::gtl::ArraySlice<AotComputationInstance> computations,
-      const AotCompilationOptions& Options);
-
   // A description of a xla computation to compile using CompileAheadOfTime.
-  //
-  // TODO(b/74197823): This is a part of a NOT YET ready refactor.
   struct AotXlaComputationInstance {
     HloModuleProto computation;
     std::vector<const Shape*> argument_layouts;
@@ -65,31 +48,15 @@ class CompileOnlyService : public Service {
   // Compiles a list of xla computations for ahead-of-time execution.  This is
   // intended for use in static compilation.  See
   // |CompileOnlyClient::CompileAheadOfTime| for additional details.
-  //
-  // TODO(b/74197823): This is a part of a NOT YET ready refactor.
   StatusOr<std::vector<std::unique_ptr<AotCompilationResult>>>
   CompileAheadOfTime(
       const tensorflow::gtl::ArraySlice<AotXlaComputationInstance> computations,
       const AotCompilationOptions& options);
 
-  // Override Service methods that require or imply the existence of an
-  // execute backend.  Note that this does not include TransferToClient, as
-  // computing constants produces global data that we may wish to transfer.
-  Status Execute(const ExecuteRequest* arg, ExecuteResponse* result) override {
-    return Unimplemented("CompileOnlyService does not support execution.");
-  }
-  Status ExecuteParallel(const ExecuteParallelRequest* arg,
-                         ExecuteParallelResponse* result) override {
-    return Unimplemented("CompileOnlyService does not support execution.");
-  }
   Status GetDeviceHandles(const GetDeviceHandlesRequest* arg,
                           GetDeviceHandlesResponse* result) override {
     return Unimplemented("CompileOnlyService does not support devices.");
   }
-  Status ExecuteAsync(const ExecuteAsyncRequest* arg,
-                      ExecuteAsyncResponse* result) override {
-    return Unimplemented("CompileOnlyService does not support execution.");
-  }
   Status WaitForExecution(const WaitForExecutionRequest* arg,
                           WaitForExecutionResponse* result) override {
     return Unimplemented("CompileOnlyService does not support execution.");
diff --git a/tensorflow/compiler/xla/service/local_service.cc b/tensorflow/compiler/xla/service/local_service.cc
index f54b52beae..968db7c76e 100644
--- a/tensorflow/compiler/xla/service/local_service.cc
+++ b/tensorflow/compiler/xla/service/local_service.cc
@@ -135,70 +135,6 @@ ExecutionOptions CreateExecutionOptions(
 
 }  // namespace
 
-StatusOr<std::unique_ptr<Executable>> LocalService::CompileExecutable(
-    const ComputationHandle& computation,
-    const tensorflow::gtl::ArraySlice<const Shape*> argument_layouts,
-    const ExecutableBuildOptions& build_options) {
-  TF_ASSIGN_OR_RETURN(UserComputation * user_computation,
-                      computation_tracker_.Resolve(computation));
-  VersionedComputationHandle versioned_handle =
-      user_computation->GetVersionedHandle();
-
-  TF_ASSIGN_OR_RETURN(
-      std::shared_ptr<const ProgramShape> program_shape,
-      user_computation->ComputeProgramShape(versioned_handle.version));
-
-  // Validate incoming layouts.
-  if (argument_layouts.size() != program_shape->parameters_size()) {
-    return InvalidArgument(
-        "Invalid number of arguments for computation: expected %d, got %zu.",
-        program_shape->parameters_size(), argument_layouts.size());
-  }
-  for (int i = 0; i < argument_layouts.size(); ++i) {
-    const Shape& argument_shape = *argument_layouts[i];
-    TF_RETURN_IF_ERROR(ShapeUtil::ValidateShape(argument_shape));
-    if (!ShapeUtil::Compatible(argument_shape, program_shape->parameters(i))) {
-      tensorflow::gtl::optional<const OpMetadata*> metadata =
-          user_computation->ParameterMetadata(i);
-      auto metadata_string = [&metadata]() -> string {
-        if (!metadata.has_value()) {
-          return "";
-        }
-        CHECK(metadata.value() != nullptr);
-        const OpMetadata& m = *metadata.value();
-        if (!m.source_file().empty()) {
-          return tensorflow::strings::Printf(
-              " (%s:%d)", m.source_file().c_str(), m.source_line());
-        }
-        return "";
-      };
-      return InvalidArgument(
-          "Invalid argument shape for argument %d%s, expected %s, got %s.", i,
-          metadata_string().c_str(),
-          ShapeUtil::HumanString(program_shape->parameters(i)).c_str(),
-          ShapeUtil::HumanString(argument_shape).c_str());
-    }
-  }
-  if (build_options.result_layout() != nullptr) {
-    TF_RETURN_IF_ERROR(ValidateResultShapeWithLayout(
-        *build_options.result_layout(), program_shape->result()));
-  }
-
-  ExecutionOptions execution_options =
-      CreateExecutionOptions(build_options, program_shape.get());
-  TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModuleConfig> module_config,
-                      CreateModuleConfig(*program_shape, argument_layouts,
-                                         &execution_options, user_computation));
-
-  TF_ASSIGN_OR_RETURN(
-      se::StreamExecutor * executor,
-      execute_backend_->stream_executor(build_options.device_ordinal()));
-
-  return BuildExecutable(versioned_handle, std::move(module_config),
-                         execute_backend_.get(), executor,
-                         build_options.device_allocator());
-}
-
 StatusOr<std::unique_ptr<Executable>> LocalService::CompileExecutable(
     const XlaComputation& computation,
     const tensorflow::gtl::ArraySlice<const Shape*> argument_layouts,
diff --git a/tensorflow/compiler/xla/service/local_service.h b/tensorflow/compiler/xla/service/local_service.h
index b55f119b3e..39d6734c3f 100644
--- a/tensorflow/compiler/xla/service/local_service.h
+++ b/tensorflow/compiler/xla/service/local_service.h
@@ -41,23 +41,11 @@ class LocalService : public Service {
   static StatusOr<std::unique_ptr<LocalService>> NewService(
       const ServiceOptions& options);
 
-  // Builds an Executable with the given argument layouts and options. If
-  // result_layout is non-null, then the executable is compiled to produce a
-  // result of the given layout.  If device_allocator is non-null, then the
-  // compiler may use it to allocate temp space on the device.  The compiler is
-  // responsible for freeing any memory it allocates this way.
-  StatusOr<std::unique_ptr<Executable>> CompileExecutable(
-      const ComputationHandle& computation,
-      const tensorflow::gtl::ArraySlice<const Shape*> argument_layouts,
-      const ExecutableBuildOptions& options);
-
   // Builds an Executable with the given XlaComputation, argument layouts and
   // options. If result_layout is non-null, then the executable is compiled to
   // produce a result of the given layout.  If device_allocator is non-null,
   // then the compiler may use it to allocate temp space on the device.  The
   // compiler is responsible for freeing any memory it allocates this way.
-  //
-  // TODO(b/74197823): This is a part of a NOT YET ready refactor.
   StatusOr<std::unique_ptr<Executable>> CompileExecutable(
       const XlaComputation& computation,
       const tensorflow::gtl::ArraySlice<const Shape*> argument_layouts,
diff --git a/tensorflow/compiler/xla/service/service.cc b/tensorflow/compiler/xla/service/service.cc
index 5a813dcadc..79c098accb 100644
--- a/tensorflow/compiler/xla/service/service.cc
+++ b/tensorflow/compiler/xla/service/service.cc
@@ -195,20 +195,6 @@ Service::Service(const ServiceOptions& options,
   }
 }
 
-Status Service::Computation(const ComputationRequest* arg,
-                            ComputationResponse* result) {
-  if (arg->name().empty()) {
-    return InvalidArgument("computation request needs a name");
-  }
-
-  *result->mutable_computation() =
-      computation_tracker_.NewComputation(arg->name());
-  VLOG(1) << Printf("Created new computation %s on service %p, name %s",
-                    result->computation().ShortDebugString().c_str(), this,
-                    arg->name().c_str());
-  return Status::OK();
-}
-
 Status Service::CreateChannelHandle(const CreateChannelHandleRequest* arg,
                                     CreateChannelHandleResponse* result) {
   *result->mutable_channel() = channel_tracker_.NewChannel();
@@ -806,13 +792,6 @@ StatusOr<GlobalDataHandle> Service::ExecuteAndRegisterResult(
                                                        result_tag);
 }
 
-Status Service::SetReturnValue(const SetReturnValueRequest* arg,
-                               SetReturnValueResponse* results) {
-  TF_ASSIGN_OR_RETURN(UserComputation * computation,
-                      computation_tracker_.Resolve(arg->computation()));
-  return computation->SetReturnValue(arg->operand());
-}
-
 StatusOr<std::vector<se::StreamExecutor*>> Service::GetExecutors(
     const ExecutionOptions& execution_options, int64 requests_size,
     int64 request_index) const {
@@ -854,117 +833,6 @@ StatusOr<std::vector<std::vector<const ShapedBuffer*>>> Service::GetArguments(
   return replicated_arguments;
 }
 
-Status Service::ExecuteParallel(const ExecuteParallelRequest* arg,
-                                ExecuteParallelResponse* result) {
-  VLOG(1) << "running execute-parallel request: " << arg->ShortDebugString();
-
-  std::vector<std::vector<std::vector<const ShapedBuffer*>>> all_arguments;
-  std::vector<std::vector<se::StreamExecutor*>> all_executors;
-  std::vector<VersionedComputationHandle> versioned_handles;
-  std::vector<std::unique_ptr<HloModuleConfig>> module_configs;
-  std::vector<string> computation_names;
-  std::vector<DeviceHandle> device_handles;
-
-  int num_requested_devices =
-      std::accumulate(arg->requests().begin(), arg->requests().end(), 0,
-                      [](int a, const ExecuteRequest& r) -> int {
-                        return a + r.execution_options().device_handles_size();
-                      });
-  if (num_requested_devices * options_.number_of_replicas() >
-      execute_backend_->device_count()) {
-    return FailedPrecondition(
-        "there are not enough stream executors to execute %d computations",
-        num_requested_devices);
-  }
-
-  for (int64 i = 0; i < arg->requests_size(); ++i) {
-    // Get the stream executor for the i'th computation. This stream executor
-    // is one of the executors to run the replicated computation.
-    const ExecutionOptions& execution_options =
-        arg->requests(i).execution_options();
-
-    // Get the executors.
-    TF_ASSIGN_OR_RETURN(auto executors, GetExecutors(execution_options,
-                                                     arg->requests_size(), i));
-
-    // Resolve the UserComputation object associated with the requested
-    // computation and compute the program shape.
-    const ExecuteRequest& request = arg->requests(i);
-    TF_ASSIGN_OR_RETURN(UserComputation * user_computation,
-                        computation_tracker_.Resolve(request.computation()));
-    VersionedComputationHandle versioned_handle =
-        user_computation->GetVersionedHandle();
-    if (user_computation->request_count(versioned_handle.version) == 0) {
-      return InvalidArgument("computations may not be empty");
-    }
-
-    TF_ASSIGN_OR_RETURN(
-        std::shared_ptr<const ProgramShape> program_shape,
-        user_computation->ComputeProgramShape(versioned_handle.version));
-
-    // Get the replicated arguments.
-    TF_ASSIGN_OR_RETURN(auto replicated_arguments,
-                        GetArguments(execution_options, request.arguments()));
-
-    // Create an HloModuleConfig object for the computation, given the shape of
-    // the program and the argument allocations. Here, we care only about the
-    // shapes of the arguments, so, it is sufficient to use the arguments of
-    // replica 0.
-    TF_ASSIGN_OR_RETURN(
-        std::unique_ptr<HloModuleConfig> module_config,
-        CreateModuleConfig(*program_shape, replicated_arguments.front(),
-                           request.execution_options(), user_computation));
-    VLOG(3) << "ExecuteParallel created HloModuleConfig computation layout: "
-            << module_config->host_entry_computation_layout().ToString();
-
-    // Adds to the vectors to build and execute the computations after the loop.
-    all_arguments.push_back(replicated_arguments);
-    all_arguments.insert(all_arguments.end(), executors.size() - 1, {{}});
-    versioned_handles.push_back(versioned_handle);
-    module_configs.push_back(std::move(module_config));
-    computation_names.insert(computation_names.end(), executors.size(),
-                             user_computation->name());
-    all_executors.push_back(executors);
-    device_handles.insert(device_handles.end(),
-                          execution_options.device_handles().begin(),
-                          execution_options.device_handles().end());
-  }
-
-  // Build the user computations into HloModules and compile to generate the
-  // executables.
-  //
-  // TODO(jlebar): There's currently no way to pass a device allocator to
-  // ExecuteParallel, so we have to pass a null device_allocator below.
-  TF_ASSIGN_OR_RETURN(
-      std::vector<std::unique_ptr<Executable>> executables,
-      BuildExecutables(versioned_handles, std::move(module_configs),
-                       execute_backend_.get(), all_executors,
-                       /*device_allocator=*/nullptr));
-  std::vector<Executable*> executable_ptrs;
-  executable_ptrs.reserve(executables.size());
-  for (const auto& executable : executables) {
-    executable_ptrs.push_back(executable.get());
-  }
-
-  // Execute the generated executables in parallel and return the device
-  // handles for each computation's output.
-  ExecutionProfile profile;
-  TF_ASSIGN_OR_RETURN(
-      std::vector<GlobalDataHandle> outputs,
-      ExecuteParallelAndRegisterResult(executable_ptrs, all_arguments,
-                                       execute_backend_.get(), device_handles,
-                                       computation_names, &profile));
-  for (const GlobalDataHandle& output : outputs) {
-    ExecuteResponse response;
-    *response.mutable_output() = output;
-    *response.mutable_profile() = profile;
-    *result->add_responses() = response;
-  }
-
-  VLOG(1) << "successfully completed 'execute-parallel' request";
-  return Status::OK();
-}
-
 Status Service::ExecuteGraphParallel(const ExecuteGraphParallelRequest* arg,
                                      ExecuteParallelResponse* result) {
   VLOG(1) << "running execute-graph-parallel request";
@@ -1090,15 +958,6 @@ Status Service::GetDeviceHandles(const GetDeviceHandlesRequest* arg,
   return Status::OK();
 }
 
-Status Service::ExecuteOneToN(const ExecuteRequest* arg,
-                              ExecuteResponse* result) {
-  ExecuteParallelRequest parallel_arg;
-  *parallel_arg.add_requests() = *arg;
-  ExecuteParallelResponse parallel_result;
-  TF_RETURN_IF_ERROR(ExecuteParallel(&parallel_arg, &parallel_result));
-  return PickParallelResponse(parallel_result, result);
-}
-
 Status Service::ExecuteOneToN(const ExecuteGraphRequest* arg,
                               ExecuteResponse* result) {
   ExecuteGraphParallelRequest parallel_arg;
@@ -1131,80 +990,6 @@ Status Service::PickParallelResponse(
   return Status::OK();
 }
 
-Status Service::Execute(const ExecuteRequest* arg, ExecuteResponse* result) {
-  VLOG(1) << "running execute request: " << arg->ShortDebugString();
-
-  TF_ASSIGN_OR_RETURN(UserComputation * user_computation,
-                      computation_tracker_.Resolve(arg->computation()));
-
-  VersionedComputationHandle versioned_handle =
-      user_computation->GetVersionedHandle();
-
-  if (user_computation->request_count(versioned_handle.version) == 0) {
-    return InvalidArgument("computations may not be empty");
-  }
-
-  // If we received multiple device handles, we must partition the module.
-  if (arg->execution_options().device_handles_size() > 1) {
-    return ExecuteOneToN(arg, result);
-  }
-
-  TF_ASSIGN_OR_RETURN(
-      std::shared_ptr<const ProgramShape> program_shape,
-      user_computation->ComputeProgramShape(versioned_handle.version));
-
-  TF_ASSIGN_OR_RETURN(auto replicas, Replicas(*execute_backend_,
-                                              SingleComputationDeviceHandle()));
-  TF_ASSIGN_OR_RETURN(
-      std::vector<std::vector<const ShapedBuffer*>> replicated_arguments,
-      ResolveAndValidateArguments(arg->arguments(), replicas));
-
-  // Since we care only about the shapes of the arguments, it is sufficient to
-  // use the arguments of replica 0.
-  TF_ASSIGN_OR_RETURN(
-      std::unique_ptr<HloModuleConfig> module_config,
-      CreateModuleConfig(*program_shape, replicated_arguments.front(),
-                         arg->execution_options(), user_computation));
-
-  VLOG(3) << "Execute created HloModuleConfig computation layout: "
-          << module_config->host_entry_computation_layout().ToString();
-
-  TF_ASSIGN_OR_RETURN(
-      std::shared_ptr<Executable> executable,
-      BuildAndCacheExecutable(versioned_handle, std::move(module_config),
-                              execute_backend_.get(),
-                              execute_backend_->default_stream_executor(),
-                              result->mutable_profile()));
-
-  if (executable->dumping()) {
-    executable->session_module()->set_execution_platform(
-        execute_backend_->platform()->Name());
-    TF_RETURN_IF_ERROR(RecordArguments(
-        replicated_arguments.front(),
-        execute_backend_->default_stream_executor(),
-        execute_backend_->transfer_manager(), executable->session_module()));
-  }
-
-  TF_ASSIGN_OR_RETURN(
-      *result->mutable_output(),
-      ExecuteAndRegisterResult(
-          executable.get(), replicated_arguments, execute_backend_.get(),
-          "result of " + user_computation->name(), result->mutable_profile()));
-
-  if (executable->dumping()) {
-    TF_ASSIGN_OR_RETURN(
-        const ShapedBuffer* result_buffer,
-        allocation_tracker_.ResolveForReplica(result->output(), 0));
-    TF_RETURN_IF_ERROR(RecordResult(
-        *result_buffer, execute_backend_->default_stream_executor(),
-        execute_backend_->transfer_manager(), executable->session_module()));
-    TF_RETURN_IF_ERROR(executable->DumpSessionModule());
-  }
-
-  VLOG(1) << "successfully completed 'execute' request";
-  return Status::OK();
-}
-
 StatusOr<std::unique_ptr<Executable>> Service::BuildExecutable(
     const HloModuleProto& module_proto,
     std::unique_ptr<HloModuleConfig> module_config, Backend* backend,
@@ -1310,86 +1095,6 @@ Status Service::ExecuteGraph(const ExecuteGraphRequest* arg,
   return Status::OK();
 }
 
-Status Service::ExecuteAsync(const ExecuteAsyncRequest* arg,
-                             ExecuteAsyncResponse* result) {
-  VLOG(1) << "running execute-async request: " << arg->ShortDebugString();
-
-  TF_ASSIGN_OR_RETURN(UserComputation * user_computation,
-                      computation_tracker_.Resolve(arg->computation()));
-
-  VersionedComputationHandle versioned_handle =
-      user_computation->GetVersionedHandle();
-  if (user_computation->request_count(versioned_handle.version) == 0) {
-    return InvalidArgument("computations may not be empty");
-  }
-
-  TF_ASSIGN_OR_RETURN(
-      std::shared_ptr<const ProgramShape> program_shape,
-      user_computation->ComputeProgramShape(versioned_handle.version));
-
-  TF_ASSIGN_OR_RETURN(auto replicas, Replicas(*execute_backend_,
-                                              SingleComputationDeviceHandle()));
-  TF_RET_CHECK(!replicas.empty());
-  TF_ASSIGN_OR_RETURN(
-      std::vector<std::vector<const ShapedBuffer*>> replicated_arguments,
-      ResolveAndValidateArguments(arg->arguments(), replicas));
-
-  TF_ASSIGN_OR_RETURN(
-      std::unique_ptr<HloModuleConfig> module_config,
-      CreateModuleConfig(*program_shape, replicated_arguments.front(),
-                         arg->execution_options(), user_computation));
-
-  VLOG(3) << "ExecuteAsync created HloModuleConfig computation layout: "
-          << module_config->host_entry_computation_layout().ToString();
-
-  ExecutionProfile profile;
-
-  TF_ASSIGN_OR_RETURN(
-      std::shared_ptr<Executable> executable,
-      BuildAndCacheExecutable(
-          versioned_handle, std::move(module_config), execute_backend_.get(),
-          execute_backend_->default_stream_executor(), &profile));
-
-  // Set up streams.
-  std::vector<Pool<se::Stream>::SmartPtr> streams;
-  for (se::StreamExecutor* executor : replicas) {
-    TF_ASSIGN_OR_RETURN(Pool<se::Stream>::SmartPtr stream,
-                        execute_backend_->BorrowStream(executor));
-    streams.push_back(std::move(stream));
-  }
-
-  std::vector<ScopedShapedBuffer> result_buffers;
-  for (size_t i = 0; i < streams.size(); ++i) {
-    const auto& stream = streams[i];
-    ExecutableRunOptions options;
-    options.set_stream(stream.get());
-    options.set_allocator(execute_backend_->memory_allocator());
-    options.set_intra_op_thread_pool(
-        execute_backend_->eigen_intra_op_thread_pool_device());
-
-    ServiceExecutableRunOptions service_options(
-        options, execute_backend_->StreamBorrower());
-
-    TF_ASSIGN_OR_RETURN(ScopedShapedBuffer this_result_buffer,
-                        executable->ExecuteAsyncOnStream(
-                            &service_options, replicated_arguments[i]));
-
-    result_buffers.emplace_back(std::move(this_result_buffer));
-  }
-
-  TF_ASSIGN_OR_RETURN(
-      GlobalDataHandle output,
-      allocation_tracker_.RegisterReplicatedBuffers(
-          std::move(result_buffers), "result of " + user_computation->name()));
-
-  *result->mutable_execution() = execution_tracker_.Register(
-      execute_backend_.get(), std::move(streams), profile, output);
-  streams.clear();
-
-  VLOG(1) << "successfully completed 'execute-async' request";
-  return Status::OK();
-}
-
 Status Service::WaitForExecution(const WaitForExecutionRequest* arg,
                                  WaitForExecutionResponse* result) {
   TF_ASSIGN_OR_RETURN(const auto execution,
@@ -1556,117 +1261,6 @@ Status Service::ResetDevice(const ResetDeviceRequest* arg,
   return execute_backend_->ResetDevices();
 }
 
-Status Service::IsConstant(const IsConstantRequest* arg,
-                           IsConstantResponse* result) {
-  TF_ASSIGN_OR_RETURN(UserComputation * user_computation,
-                      computation_tracker_.Resolve(arg->computation()));
-
-  VersionedComputationHandle versioned_handle =
-      user_computation->GetVersionedHandleAtOperation(arg->operand());
-
-  if (user_computation->request_count(versioned_handle.version) == 0) {
-    return InvalidArgument("computations may not be empty");
-  }
-
-  TF_ASSIGN_OR_RETURN(
-      bool is_constant,
-      user_computation->IsConstant(arg->operand(), arg->num_parameters()));
-
-  result->set_is_constant(is_constant);
-  return Status::OK();
-}
-
-Status Service::ComputeConstant(const ComputeConstantRequest* arg,
-                                ComputeConstantResponse* result) {
-  TF_ASSIGN_OR_RETURN(UserComputation * user_computation,
-                      computation_tracker_.Resolve(arg->computation()));
-
-  VersionedComputationHandle versioned_handle =
-      user_computation->GetVersionedHandleAtOperation(arg->operand());
-
-  if (user_computation->request_count(versioned_handle.version) == 0) {
-    return InvalidArgument("computations may not be empty");
-  }
-
-  TF_ASSIGN_OR_RETURN(
-      bool is_constant,
-      user_computation->IsConstant(arg->operand(), arg->parameters_size()));
-  if (!is_constant) {
-    StatusOr<const OperationRequest*> op_request_status =
-        user_computation->LookUpRequestForErrorReporting(arg->operand());
-    string op_request_string = "<unknown operation>";
-    if (op_request_status.ok()) {
-      op_request_string = op_request_status.ValueOrDie()->ShortDebugString();
-    }
-    return InvalidArgument(
-        "Operand to ComputeConstant depends on a parameter.\n\n"
-        "  op requested for constant evaluation: %s\n\n"
-        "This is an internal error that typically happens when the XLA user "
-        "(e.g. TensorFlow) is attempting to determine a value that must be a "
-        "compile-time constant (e.g. an array dimension) but it is not capable "
-        "of being evaluated at XLA compile time.\n\n"
-        "Please file a usability bug with the framework being used (e.g. "
-        "TensorFlow).",
-        op_request_string.c_str());
-  }
-
-  // We can't use ComputeProgramShape because it checks that all parameter
-  // instructions are present and contiguous. Instead construct ProgramShape
-  // directly.
-  ProgramShape program_shape;
-  TF_ASSIGN_OR_RETURN(*program_shape.mutable_result(),
-                      user_computation->GetShape(arg->operand()));
-
-  TF_DCHECK_OK(ShapeUtil::ValidateShape(program_shape.result()));
-
-  ExecutionOptions execution_options = xla::CreateDefaultExecutionOptions();
-  execution_options.mutable_debug_options()->set_xla_enable_fast_math(false);
-  execution_options.mutable_debug_options()
-      ->set_xla_eliminate_hlo_implicit_broadcast(true);
-  *execution_options.mutable_shape_with_output_layout() =
-      program_shape.result();
-
-  Shape shape_with_output_layout(program_shape.result());
-  if (arg->has_output_layout()) {
-    TF_RETURN_IF_ERROR(LayoutUtil::ValidateLayoutForShape(
-        arg->output_layout(), execution_options.shape_with_output_layout()));
-    *execution_options.mutable_shape_with_output_layout()->mutable_layout() =
-        arg->output_layout();
-  }
-
-  TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModuleConfig> module_config,
-                      CreateModuleConfig(program_shape, {}, execution_options,
-                                         user_computation));
-
-  // Exclude dead parameter instructions for the purpose of computing constants.
-  TF_ASSIGN_OR_RETURN(
-      std::unique_ptr<HloModule> module,
-      computation_tracker_.BuildHloModule(versioned_handle, *module_config,
-                                          /*include_unreachable_instructions=*/
-                                          false));
-
-  std::vector<std::unique_ptr<Literal>> parameters(arg->parameters_size());
-  for (int64 i = 0; i < arg->parameters_size(); ++i) {
-    TF_ASSIGN_OR_RETURN(parameters[i],
-                        Literal::CreateFromProto(arg->parameters(i)));
-  }
-  HloEvaluator evaluator;
-  TF_ASSIGN_OR_RETURN(
-      auto result_literal,
-      evaluator.Evaluate<std::unique_ptr<Literal>>(*module, parameters));
-
-  // Since the shape_with_output_layout option in ExecutionOption is
-  // non-effective to the Evaluator results, explicit relayout here.
-  //
-  // TODO(b/77824332): Make HloEvaluator take care of the re-layout.
-  if (arg->has_output_layout()) {
-    result_literal = result_literal->Relayout(arg->output_layout());
-  }
-  *result->mutable_literal() = result_literal->ToProto();
-
-  return Status::OK();
-}
-
 Status Service::ComputeConstantGraph(const ComputeConstantGraphRequest* arg,
                                      ComputeConstantResponse* result) {
   if (!arg->has_computation()) {
@@ -1716,60 +1310,6 @@ Status Service::GetShape(const GetShapeRequest* arg, GetShapeResponse* result) {
   return Status::OK();
 }
 
-Status Service::GetComputationShape(const GetComputationShapeRequest* arg,
-                                    GetComputationShapeResponse* result) {
-  TF_ASSIGN_OR_RETURN(UserComputation * computation,
-                      computation_tracker_.Resolve(arg->computation()));
-
-  VersionedComputationHandle versioned_handle =
-      computation->GetVersionedHandle();
-
-  TF_ASSIGN_OR_RETURN(auto program_shape, computation->ComputeProgramShape(
-                                              versioned_handle.version));
-  *result->mutable_program_shape() = *program_shape;
-  return Status::OK();
-}
-
-Status Service::GetLocalShape(const GetLocalShapeRequest* arg,
-                              GetLocalShapeResponse* result) {
-  TF_ASSIGN_OR_RETURN(UserComputation * computation,
-                      computation_tracker_.Resolve(arg->computation()));
-
-  TF_ASSIGN_OR_RETURN(*result->mutable_shape(),
-                      computation->GetShape(arg->operand()));
-  return Status::OK();
-}
-
-Status Service::GetComputationStats(const ComputationStatsRequest* arg,
-                                    ComputationStatsResponse* result) {
-  TF_ASSIGN_OR_RETURN(UserComputation * user_computation,
-                      computation_tracker_.Resolve(arg->computation()));
-
-  VersionedComputationHandle versioned_handle =
-      user_computation->GetVersionedHandle();
-
-  HloModuleConfig config;
-  config.set_debug_options(arg->debug_options());
-  TF_ASSIGN_OR_RETURN(
-      std::unique_ptr<HloModule> module,
-      computation_tracker_.BuildHloModule(versioned_handle, config));
-
-  hlo_graph_dumper::MaybeDumpHloModule(*module,
-                                       "computation statistics subject");
-
-  // Run HLO analysis to get the computation statistics.
-  HloCostAnalysis analysis(
-      execute_backend_->compiler()->ShapeSizeBytesFunction());
-
-  TF_RETURN_IF_ERROR(module->entry_computation()->Accept(&analysis));
-
-  ComputationStats stats;
-  stats.set_flop_count(analysis.flop_count());
-  stats.set_transcendental_count(analysis.transcendental_count());
-  *result->mutable_stats() = stats;
-  return Status::OK();
-}
-
 Status Service::GetComputationGraphStats(
     const ComputationGraphStatsRequest* arg, ComputationStatsResponse* result) {
   if (!arg->has_computation()) {
@@ -1812,250 +1352,6 @@ Status Service::AddInstruction(
   return Status::OK();
 }
 
-Status Service::Op(const OpRequest* arg, OpResponse* result) {
-  TF_ASSIGN_OR_RETURN(UserComputation * computation,
-                      computation_tracker_.Resolve(arg->computation()));
-  StatusOr<ComputationDataHandle> handle_status;
-
-  switch (arg->op_case()) {
-    case OpRequest::kBatchNormTrainingRequest:
-      handle_status = computation->AddBatchNormTrainingInstruction(
-          arg->batch_norm_training_request());
-      break;
-    case OpRequest::kBatchNormInferenceRequest:
-      handle_status = computation->AddBatchNormInferenceInstruction(
-          arg->batch_norm_inference_request());
-      break;
-    case OpRequest::kBatchNormGradRequest:
-      handle_status = computation->AddBatchNormGradInstruction(
-          arg->batch_norm_grad_request());
-      break;
-    case OpRequest::kBinaryOpRequest:
-      handle_status =
-          computation->AddBinaryInstruction(arg->binary_op_request());
-      break;
-    case OpRequest::kBroadcastRequest:
-      handle_status =
-          computation->AddBroadcastInstruction(arg->broadcast_request());
-      break;
-    case OpRequest::kCallRequest: {
-      TF_ASSIGN_OR_RETURN(
-          UserComputation * to_apply,
-          computation_tracker_.Resolve(arg->call_request().to_apply()));
-      handle_status =
-          computation->AddCallInstruction(arg->call_request(), *to_apply);
-      break;
-    }
-    case OpRequest::kConcatenateRequest:
-      handle_status =
-          computation->AddConcatenateInstruction(arg->concatenate_request());
-      break;
-    case OpRequest::kConditionalRequest: {
-      TF_ASSIGN_OR_RETURN(UserComputation * true_computation,
-                          computation_tracker_.Resolve(
-                              arg->conditional_request().true_computation()));
-      TF_ASSIGN_OR_RETURN(UserComputation * false_computation,
-                          computation_tracker_.Resolve(
-                              arg->conditional_request().false_computation()));
-      handle_status = computation->AddConditionalInstruction(
-          arg->conditional_request(), *true_computation, *false_computation);
-      break;
-    }
-    case OpRequest::kConstantRequest:
-      handle_status =
-          computation->AddConstantInstruction(arg->constant_request());
-      break;
-    case OpRequest::kConvertRequest:
-      handle_status =
-          computation->AddConvertInstruction(arg->convert_request());
-      break;
-    case OpRequest::kBitcastConvertRequest:
-      handle_status = computation->AddBitcastConvertInstruction(
-          arg->bitcast_convert_request());
-      break;
-    case OpRequest::kConvolveRequest:
-      handle_status =
-          computation->AddConvolveInstruction(arg->convolve_request());
-      break;
-    case OpRequest::kCrossReplicaSumRequest:
-      handle_status = computation->AddCrossReplicaSumInstruction(
-          arg->cross_replica_sum_request());
-      break;
-    case OpRequest::kCustomCallRequest:
-      handle_status =
-          computation->AddCustomCallInstruction(arg->custom_call_request());
-      break;
-    case OpRequest::kDotRequest:
-      handle_status = computation->AddDotInstruction(arg->dot_request());
-      break;
-    case OpRequest::kDynamicSliceRequest:
-      handle_status =
-          computation->AddDynamicSliceInstruction(arg->dynamic_slice_request());
-      break;
-    case OpRequest::kDynamicUpdateSliceRequest:
-      handle_status = computation->AddDynamicUpdateSliceInstruction(
-          arg->dynamic_update_slice_request());
-      break;
-    case OpRequest::kFftRequest:
-      handle_status = computation->AddFftInstruction(arg->fft_request());
-      break;
-    case OpRequest::kGatherRequest:
-      handle_status = computation->AddGatherInstruction(arg->gather_request());
-      break;
-    case OpRequest::kGetTupleElementRequest:
-      handle_status = computation->AddGetTupleElementInstruction(
-          arg->get_tuple_element_request());
-      break;
-    case OpRequest::kInfeedRequest:
-      handle_status = computation->AddInfeedInstruction(arg->infeed_request());
-      break;
-    case OpRequest::kOutfeedRequest:
-      handle_status =
-          computation->AddOutfeedInstruction(arg->outfeed_request());
-      break;
-    case OpRequest::kHostComputeRequest:
-      handle_status =
-          computation->AddHostComputeInstruction(arg->host_compute_request());
-      break;
-    case OpRequest::kMapRequest: {
-      TF_ASSIGN_OR_RETURN(
-          UserComputation * to_apply,
-          computation_tracker_.Resolve(arg->map_request().to_apply()));
-      handle_status =
-          computation->AddMapInstruction(arg->map_request(), *to_apply);
-      break;
-    }
-    case OpRequest::kPadRequest:
-      handle_status = computation->AddPadInstruction(arg->pad_request());
-      break;
-    case OpRequest::kParameterRequest:
-      handle_status =
-          computation->AddParameterInstruction(arg->parameter_request());
-      break;
-    case OpRequest::kReduceRequest: {
-      TF_ASSIGN_OR_RETURN(
-          UserComputation * to_apply,
-          computation_tracker_.Resolve(arg->reduce_request().to_apply()));
-      handle_status =
-          computation->AddReduceInstruction(arg->reduce_request(), *to_apply);
-      break;
-    }
-    case OpRequest::kReducePrecisionRequest: {
-      handle_status = computation->AddReducePrecisionInstruction(
-          arg->reduce_precision_request());
-      break;
-    }
-    case OpRequest::kReduceWindowRequest: {
-      TF_ASSIGN_OR_RETURN(UserComputation * to_apply,
-                          computation_tracker_.Resolve(
-                              arg->reduce_window_request().to_apply()));
-      handle_status = computation->AddReduceWindowInstruction(
-          arg->reduce_window_request(), *to_apply);
-      break;
-    }
-    case OpRequest::kReshapeRequest:
-      handle_status =
-          computation->AddReshapeInstruction(arg->reshape_request());
-      break;
-    case OpRequest::kReverseRequest:
-      handle_status =
-          computation->AddReverseInstruction(arg->reverse_request());
-      break;
-    case OpRequest::kRngRequest:
-      handle_status = computation->AddRngInstruction(arg->rng_request());
-      break;
-    case OpRequest::kSelectAndScatterRequest: {
-      TF_ASSIGN_OR_RETURN(UserComputation * select,
-                          computation_tracker_.Resolve(
-                              arg->select_and_scatter_request().select()));
-      TF_ASSIGN_OR_RETURN(UserComputation * scatter,
-                          computation_tracker_.Resolve(
-                              arg->select_and_scatter_request().scatter()));
-      handle_status = computation->AddSelectAndScatterInstruction(
-          arg->select_and_scatter_request(), *select, *scatter);
-      break;
-    }
-    case OpRequest::kSliceRequest:
-      handle_status = computation->AddSliceInstruction(arg->slice_request());
-      break;
-    case OpRequest::kTernaryOpRequest:
-      handle_status =
-          computation->AddTernaryInstruction(arg->ternary_op_request());
-      break;
-    case OpRequest::kTraceRequest:
-      return computation->AddTraceInstruction(arg->trace_request());
-    case OpRequest::kTransposeRequest:
-      handle_status =
-          computation->AddTransposeInstruction(arg->transpose_request());
-      break;
-    case OpRequest::kUnaryOpRequest:
-      handle_status = computation->AddUnaryInstruction(arg->unary_op_request());
-      break;
-    case OpRequest::kVariadicOpRequest:
-      handle_status =
-          computation->AddVariadicInstruction(arg->variadic_op_request());
-      break;
-    case OpRequest::kWhileRequest: {
-      TF_ASSIGN_OR_RETURN(
-          UserComputation * condition,
-          computation_tracker_.Resolve(arg->while_request().condition()));
-      TF_ASSIGN_OR_RETURN(
-          UserComputation * body,
-          computation_tracker_.Resolve(arg->while_request().body()));
-      handle_status = computation->AddWhileInstruction(arg->while_request(),
-                                                       *condition, *body);
-      break;
-    }
-    case OpRequest::kSendRequest: {
-      TF_RETURN_IF_ERROR(
-          channel_tracker_.RegisterSend(arg->send_request().channel_handle()));
-      // Send does not return a value, but we need a handle to be able to
-      // set OpMetadata and OpSharding (device assignment).
-      handle_status = computation->AddSendInstruction(arg->send_request());
-      break;
-    }
-    case OpRequest::kRecvRequest: {
-      TF_RETURN_IF_ERROR(
-          channel_tracker_.RegisterRecv(arg->recv_request().channel_handle()));
-      handle_status = computation->AddRecvInstruction(arg->recv_request());
-      break;
-    }
-    case OpRequest::OP_NOT_SET:
-      return InvalidArgument("XLA service received OpRequest with OP_NOT_SET");
-    default:
-      return InvalidArgument("Unsupported operation in XLA service");
-  }
-  TF_ASSIGN_OR_RETURN(*result->mutable_output(), handle_status);
-
-  // We set the debug metadata here, because we slice off part of the OpRequest
-  // proto in the above switch statement.
-  TF_ASSIGN_OR_RETURN(ComputationDataHandle handle, handle_status);
-  TF_RETURN_IF_ERROR(computation->SetOpMetadata(handle, arg->metadata()));
-  if (arg->has_sharding()) {
-    TF_RETURN_IF_ERROR(computation->SetOpSharding(handle, arg->sharding()));
-  }
-  return Status::OK();
-}
-
-Status Service::SnapshotComputation(const SnapshotComputationRequest* arg,
-                                    SnapshotComputationResponse* result) {
-  TF_ASSIGN_OR_RETURN(
-      std::unique_ptr<SessionModule> module,
-      computation_tracker_.SnapshotComputation(arg->computation()));
-
-  result->set_allocated_module(module.release());
-
-  return Status::OK();
-}
-
-Status Service::LoadComputationSnapshot(
-    const LoadComputationSnapshotRequest* arg,
-    LoadComputationSnapshotResponse* result) {
-  TF_ASSIGN_OR_RETURN(*result->mutable_computation(),
-                      computation_tracker_.LoadSessionModule(arg->module()));
-  return Status::OK();
-}
-
 DeviceHandle Service::SingleComputationDeviceHandle() const {
   DeviceHandle device_handle;
   device_handle.set_handle(0);
diff --git a/tensorflow/compiler/xla/service/service.h b/tensorflow/compiler/xla/service/service.h
index 81fbd41957..b3c0eac9da 100644
--- a/tensorflow/compiler/xla/service/service.h
+++ b/tensorflow/compiler/xla/service/service.h
@@ -83,11 +83,6 @@ class Service : public ServiceInterface {
   static StatusOr<std::unique_ptr<Service>> NewService(
       const ServiceOptions& options);
 
-  // Creates a new computation with the given name.
-  // A unique ComputationHandle is returned.
-  Status Computation(const ComputationRequest* arg,
-                     ComputationResponse* result) override;
-
   // Unregisters a previously-allocated global handle.
   //
   // If the handle given is not currently allocated, a NOT_FOUND status is
@@ -100,35 +95,15 @@ class Service : public ServiceInterface {
   Status DeconstructTuple(const DeconstructTupleRequest* arg,
                           DeconstructTupleResponse* result) override;
 
-  // Modifies the provided computation so that subsequent executions
-  // will compute the provided ComputationDataHandle, rather than the
-  // last expression enqueued on that Computation.
-  Status SetReturnValue(const SetReturnValueRequest* arg,
-                        SetReturnValueResponse* results) override;
-
-  // Executes a computation with the provided global data passed as
-  // immutable arguments. Returns global data output and execution timing.
-  Status Execute(const ExecuteRequest* arg, ExecuteResponse* result) override;
-
   // Executes a computation with the provided global data passed as
   // immutable arguments. The request contains the whole computation graph.
   // Returns global data output and execution timing.
-  //
-  // TODO(b/74197823): This is a part of a NOT YET ready refactor.
   Status ExecuteGraph(const ExecuteGraphRequest* arg,
                       ExecuteResponse* result) override;
 
   // Executes one or more computations in parallel with the provided global data
   // passed as immutable arguments. Returns global data output for each
   // computation.
-  Status ExecuteParallel(const ExecuteParallelRequest* arg,
-                         ExecuteParallelResponse* result) override;
-
-  // Executes one or more computations in parallel with the provided global data
-  // passed as immutable arguments. Returns global data output for each
-  // computation.
-  //
-  // TODO(b/74197823): This is a part of a NOT YET ready refactor.
   Status ExecuteGraphParallel(const ExecuteGraphParallelRequest* arg,
                               ExecuteParallelResponse* result) override;
 
@@ -143,16 +118,6 @@ class Service : public ServiceInterface {
   Status GetDeviceHandles(const GetDeviceHandlesRequest* arg,
                           GetDeviceHandlesResponse* result) override;
 
-  // Asynchronously executes a computation with provided arguments. Invokes
-  // the provided computation with the provided global data passed as
-  // immutable arguments. Returns a handle to the execution.
-  //
-  // (Note: The corresponding function in xla::Client was removed as part of
-  // b/64116060, in an attempt to simplify our API.  We're keeping this around
-  // for now in case we want to expose this to clients in a different way.)
-  Status ExecuteAsync(const ExecuteAsyncRequest* arg,
-                      ExecuteAsyncResponse* result) override;
-
   // Waits until the specified execution is complete and returns the result.
   // Calling this API multiple times with the same execution handle returns the
   // method with an error since the execution handle is destroyed after the
@@ -190,13 +155,6 @@ class Service : public ServiceInterface {
   Status ResetDevice(const ResetDeviceRequest* arg,
                      ResetDeviceResponse* result) override;
 
-  // Tests if an expression is a compile-time constant.
-  Status IsConstant(const IsConstantRequest* arg,
-                    IsConstantResponse* result) override;
-
-  // Computes the value of a constant expression.
-  Status ComputeConstant(const ComputeConstantRequest* arg,
-                         ComputeConstantResponse* result) override;
   Status ComputeConstantGraph(const ComputeConstantGraphRequest* arg,
                               ComputeConstantResponse* result) override;
 
@@ -205,43 +163,10 @@ class Service : public ServiceInterface {
   Status GetShape(const GetShapeRequest* arg,
                   GetShapeResponse* result) override;
 
-  // Returns the program shape of the computation associated with the given
-  // handle.
-  Status GetComputationShape(const GetComputationShapeRequest* arg,
-                             GetComputationShapeResponse* result) override;
-
-  /////
-  // Computation-oriented methods.
-
-  // Enqueues an Op on the computation.
-  Status Op(const OpRequest* arg, OpResponse* result) override;
-
-  // Retrieves the inferred shape for a value within a computation.
-  Status GetLocalShape(const GetLocalShapeRequest* arg,
-                       GetLocalShapeResponse* result) override;
-
   // Retrieves the statistics of a computation.
-  Status GetComputationStats(const ComputationStatsRequest* arg,
-                             ComputationStatsResponse* result) override;
-
-  // Retrieves the statistics of a computation.
-  //
-  // TODO(b/74197823): This is a part of a NOT YET ready refactor.
   Status GetComputationGraphStats(const ComputationGraphStatsRequest* arg,
                                   ComputationStatsResponse* result) override;
 
-  // Snapshots the current state of a computation handle into a serializable
-  // protocol buffer form, so it can be loaded via
-  // LoadComputationSnapshot.
-  Status SnapshotComputation(const SnapshotComputationRequest* arg,
-                             SnapshotComputationResponse* result) override;
-
-  // Loads a computation from a serialized protocol buffer created via
-  // SnapshotComputation.
-  Status LoadComputationSnapshot(
-      const LoadComputationSnapshotRequest* arg,
-      LoadComputationSnapshotResponse* result) override;
-
   // Creates a unique channel handle that can be used for Send/Recv
   // instructions.
   Status CreateChannelHandle(const CreateChannelHandleRequest* arg,
@@ -382,7 +307,6 @@ class Service : public ServiceInterface {
   // Executes a single computation which has more than one target device.
   // The N devices are expected to all return an empty tuple, but one, which
   // will be the result of this computation.
-  Status ExecuteOneToN(const ExecuteRequest* arg, ExecuteResponse* result);
   Status ExecuteOneToN(const ExecuteGraphRequest* arg, ExecuteResponse* result);
 
   // Convenience function which checks whether the given shape_with_layout
diff --git a/tensorflow/compiler/xla/service_interface.h b/tensorflow/compiler/xla/service_interface.h
index 141347a792..14c35e7b84 100644
--- a/tensorflow/compiler/xla/service_interface.h
+++ b/tensorflow/compiler/xla/service_interface.h
@@ -47,41 +47,22 @@ class ServiceInterface {
   virtual Status ResetDevice(const ResetDeviceRequest* arg,
                              ResetDeviceResponse* result) = 0;
 
-  virtual Status LoadComputationSnapshot(
-      const LoadComputationSnapshotRequest* request,
-      LoadComputationSnapshotResponse* result) = 0;
-
-  virtual Status Execute(const ExecuteRequest* arg,
-                         ExecuteResponse* result) = 0;
-
   virtual Status ExecuteGraph(const ExecuteGraphRequest* arg,
                               ExecuteResponse* result) = 0;
 
-  virtual Status ExecuteParallel(const ExecuteParallelRequest* arg,
-                                 ExecuteParallelResponse* result) = 0;
-
   virtual Status ExecuteGraphParallel(const ExecuteGraphParallelRequest* arg,
                                       ExecuteParallelResponse* result) = 0;
 
-  virtual Status ExecuteAsync(const ExecuteAsyncRequest* arg,
-                              ExecuteAsyncResponse* result) = 0;
-
   virtual Status WaitForExecution(const WaitForExecutionRequest* arg,
                                   WaitForExecutionResponse* result) = 0;
 
   virtual Status DeconstructTuple(const DeconstructTupleRequest* arg,
                                   DeconstructTupleResponse* result) = 0;
 
-  virtual Status GetComputationStats(const ComputationStatsRequest* arg,
-                                     ComputationStatsResponse* result) = 0;
-
   virtual Status GetComputationGraphStats(
       const ComputationGraphStatsRequest* arg,
       ComputationStatsResponse* result) = 0;
 
-  virtual Status GetComputationShape(const GetComputationShapeRequest* arg,
-                                     GetComputationShapeResponse* result) = 0;
-
   virtual Status GetShape(const GetShapeRequest* arg,
                           GetShapeResponse* result) = 0;
 
@@ -91,31 +72,9 @@ class ServiceInterface {
   virtual Status GetDeviceHandles(const GetDeviceHandlesRequest* arg,
                                   GetDeviceHandlesResponse* result) = 0;
 
-  // Methods used by ComputationBuilder.
-  virtual Status Computation(const ComputationRequest* arg,
-                             ComputationResponse* result) = 0;
-
-  virtual Status Op(const OpRequest* arg, OpResponse* result) = 0;
-
-  virtual Status GetLocalShape(const GetLocalShapeRequest* arg,
-                               GetLocalShapeResponse* result) = 0;
-
-  virtual Status SetReturnValue(const SetReturnValueRequest* arg,
-                                SetReturnValueResponse* results) = 0;
-
-  virtual Status IsConstant(const IsConstantRequest* arg,
-                            IsConstantResponse* result) = 0;
-
-  virtual Status ComputeConstant(const ComputeConstantRequest* arg,
-                                 ComputeConstantResponse* result) = 0;
-
   virtual Status ComputeConstantGraph(const ComputeConstantGraphRequest* arg,
                                       ComputeConstantResponse* result) = 0;
 
-  // Methods used by Computation.
-  virtual Status SnapshotComputation(const SnapshotComputationRequest* ag,
-                                     SnapshotComputationResponse* result) = 0;
-
   // Methods used by GlobalData.
   virtual Status Unregister(const UnregisterRequest* arg,
                             UnregisterResponse* result) = 0;
-- 
GitLab


From 7e2e57410eb40c0512dc573955fd256a6c787741 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 31 May 2018 06:05:04 -0700
Subject: [PATCH 095/610] implementation of sparse_to_dense

PiperOrigin-RevId: 198710452
---
 tensorflow/contrib/lite/build_def.bzl         |   1 +
 tensorflow/contrib/lite/builtin_op_data.h     |   4 +
 tensorflow/contrib/lite/builtin_ops.h         |   1 +
 .../lite/g3doc/tf_ops_compatibility.md        |  15 +
 tensorflow/contrib/lite/kernels/BUILD         |  14 +
 .../internal/reference/reference_ops.h        |  36 +++
 tensorflow/contrib/lite/kernels/register.cc   |   2 +
 .../contrib/lite/kernels/sparse_to_dense.cc   | 275 ++++++++++++++++++
 .../lite/kernels/sparse_to_dense_test.cc      | 155 ++++++++++
 tensorflow/contrib/lite/model.cc              |  10 +
 tensorflow/contrib/lite/nnapi_delegate.cc     |   1 +
 tensorflow/contrib/lite/schema/schema.fbs     |   6 +
 .../contrib/lite/schema/schema_generated.h    | 141 ++++++++-
 .../contrib/lite/testing/generate_examples.py |  77 ++++-
 .../contrib/lite/toco/export_tensorflow.cc    |  19 ++
 .../propagate_array_data_types.cc             |  10 +
 .../propagate_fixed_sizes.cc                  |  32 ++
 .../contrib/lite/toco/import_tensorflow.cc    |  20 ++
 tensorflow/contrib/lite/toco/model.h          |  14 +
 .../contrib/lite/toco/tflite/operator.cc      |  23 ++
 .../contrib/lite/toco/tflite/operator_test.cc |   9 +
 tensorflow/contrib/lite/toco/tooling_util.cc  |   1 +
 22 files changed, 859 insertions(+), 7 deletions(-)
 create mode 100644 tensorflow/contrib/lite/kernels/sparse_to_dense.cc
 create mode 100644 tensorflow/contrib/lite/kernels/sparse_to_dense_test.cc

diff --git a/tensorflow/contrib/lite/build_def.bzl b/tensorflow/contrib/lite/build_def.bzl
index c8820ab29b..b9e40cc50c 100644
--- a/tensorflow/contrib/lite/build_def.bzl
+++ b/tensorflow/contrib/lite/build_def.bzl
@@ -239,6 +239,7 @@ def generated_test_models():
         "softmax",
         "space_to_batch_nd",
         "space_to_depth",
+        "sparse_to_dense",
         "split",
         "squeeze",
         "strided_slice",
diff --git a/tensorflow/contrib/lite/builtin_op_data.h b/tensorflow/contrib/lite/builtin_op_data.h
index 8660c653ae..52ab9ee640 100644
--- a/tensorflow/contrib/lite/builtin_op_data.h
+++ b/tensorflow/contrib/lite/builtin_op_data.h
@@ -236,6 +236,10 @@ typedef struct {
   int stride_height;
 } TfLiteTransposeConvParams;
 
+typedef struct {
+  bool validate_indices;
+} TfLiteSparseToDenseParams;
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
diff --git a/tensorflow/contrib/lite/builtin_ops.h b/tensorflow/contrib/lite/builtin_ops.h
index 24a9b0f6b8..c797e3589a 100644
--- a/tensorflow/contrib/lite/builtin_ops.h
+++ b/tensorflow/contrib/lite/builtin_ops.h
@@ -93,6 +93,7 @@ typedef enum {
   kTfLiteBuiltinSlice = 65,
   kTfLiteBuiltinSin = 66,
   kTfLiteBuiltinTransposeConv = 67,
+  kTfLiteBuiltinSparseToDense = 68,
 } TfLiteBuiltinOperator;
 
 #ifdef __cplusplus
diff --git a/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md b/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md
index 244919bc87..27e7d25bf1 100644
--- a/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md
+++ b/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md
@@ -595,6 +595,21 @@ Outputs {
 }
 ```
 
+**SPARSE_TO_DENSE**
+
+```
+Inputs {
+  0: sparse_indices: 0D or 1D or 2D tensor
+  1: output_shape: 1D tensor
+  2: sparse_values: 0D or 1D tensor
+  3: default_value: 0D tensor
+  4: validate_indices: a boolean value
+}
+Outputs {
+  0: Dense Tensor of shape output_shape. Has the same type as sparse_values.
+}
+```
+
 **SPLIT**
 
 ```
diff --git a/tensorflow/contrib/lite/kernels/BUILD b/tensorflow/contrib/lite/kernels/BUILD
index b7291dd379..0af659b5ca 100644
--- a/tensorflow/contrib/lite/kernels/BUILD
+++ b/tensorflow/contrib/lite/kernels/BUILD
@@ -170,6 +170,7 @@ cc_library(
         "slice.cc",
         "space_to_batch_nd.cc",
         "space_to_depth.cc",
+        "sparse_to_dense.cc",
         "split.cc",
         "squeeze.cc",
         "strided_slice.cc",
@@ -934,6 +935,19 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "sparse_to_dense_test",
+    size = "small",
+    srcs = ["sparse_to_dense_test.cc"],
+    tags = ["tflite_not_portable_ios"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
 filegroup(
     name = "all_files",
     srcs = glob(
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
index 62d6fe0bb3..c43c5f938e 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
@@ -4000,6 +4000,42 @@ inline void RankOneSelect(const D* input_condition_data,
   }
 }
 
+// For ease of implementation, `indices` is always a vector of size-4 vectors.
+template <typename T, typename I>
+inline void SparseToDense(const std::vector<std::vector<I>>& indices,
+                          const T* values, T default_value, T* output_data,
+                          const Dims<4>& output_dims, bool value_is_scalar) {
+  const int value_count = indices.size();
+
+  // First fill output_data with the default value.
+  const int num_elements = FlatSize(output_dims);
+  for (int i = 0; i < num_elements; ++i) {
+    output_data[i] = default_value;
+  }
+
+  // Handle the scalar-value case separately to avoid checking the boolean
+  // condition on every loop iteration.
+  if (value_is_scalar) {
+    for (int i = 0; i < value_count; ++i) {
+      const std::vector<I>& index = indices[i];
+      TFLITE_DCHECK_EQ(index.size(), 4);
+      const T value = *values;  // Always use the first value.
+      output_data[Offset(output_dims, index[3], index[2], index[1], index[0])] =
+          value;
+    }
+    return;
+  }
+
+  // Otherwise pair values[i] with indices[i] to fill in the sparse values.
+  for (int i = 0; i < value_count; ++i) {
+    const std::vector<I>& index = indices[i];
+    TFLITE_DCHECK_EQ(index.size(), 4);
+    const T value = values[i];
+    output_data[Offset(output_dims, index[3], index[2], index[1], index[0])] =
+        value;
+  }
+}
+
 }  // namespace reference_ops
 }  // namespace tflite
 
diff --git a/tensorflow/contrib/lite/kernels/register.cc b/tensorflow/contrib/lite/kernels/register.cc
index 21cc185e9f..4eea9921b2 100644
--- a/tensorflow/contrib/lite/kernels/register.cc
+++ b/tensorflow/contrib/lite/kernels/register.cc
@@ -90,6 +90,7 @@ TfLiteRegistration* Register_SELECT();
 TfLiteRegistration* Register_SLICE();
 TfLiteRegistration* Register_SIN();
 TfLiteRegistration* Register_TRANSPOSE_CONV();
+TfLiteRegistration* Register_SPARSE_TO_DENSE();
 
 BuiltinOpResolver::BuiltinOpResolver() {
   AddBuiltin(BuiltinOperator_RELU, Register_RELU());
@@ -161,6 +162,7 @@ BuiltinOpResolver::BuiltinOpResolver() {
   AddBuiltin(BuiltinOperator_SLICE, Register_SLICE());
   AddBuiltin(BuiltinOperator_SIN, Register_SIN());
   AddBuiltin(BuiltinOperator_TRANSPOSE_CONV, Register_TRANSPOSE_CONV());
+  AddBuiltin(BuiltinOperator_SPARSE_TO_DENSE, Register_SPARSE_TO_DENSE());
 
   // TODO(andrewharp, ahentz): Move these somewhere more appropriate so that
   // custom ops aren't always included by default.
diff --git a/tensorflow/contrib/lite/kernels/sparse_to_dense.cc b/tensorflow/contrib/lite/kernels/sparse_to_dense.cc
new file mode 100644
index 0000000000..404c32ad9c
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/sparse_to_dense.cc
@@ -0,0 +1,275 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <unistd.h>
+#include <cassert>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <iostream>
+#include <limits>
+
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+#include "tensorflow/contrib/lite/kernels/padding.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace sparse_to_dense {
+
+constexpr int kIndicesTensor = 0;
+constexpr int kOutputShapeTensor = 1;
+constexpr int kValueInputTensor = 2;
+constexpr int kDefaultValueTensor = 3;
+constexpr int kOutputTensor = 0;
+
+constexpr int kMaxDimensions = 4;
+
+template <typename T>
+TfLiteStatus Resize(TfLiteContext* context, const TfLiteTensor* output_shape,
+                    TfLiteTensor* output) {
+  const int output_dimensions = NumElements(output_shape);
+  TfLiteIntArray* output_shape_array = TfLiteIntArrayCreate(output_dimensions);
+  for (int i = 0; i < output_dimensions; ++i) {
+    output_shape_array->data[i] = GetTensorData<T>(output_shape)[i];
+  }
+
+  return context->ResizeTensor(context, output, output_shape_array);
+}
+
+TfLiteStatus CheckDimensionsMatch(TfLiteContext* context,
+                                  const TfLiteTensor* indices,
+                                  const TfLiteTensor* output_shape,
+                                  const TfLiteTensor* values) {
+  switch (NumDimensions(indices)) {
+    case 0:
+    case 1: {
+      if (NumDimensions(values) == 0) {
+        TF_LITE_ENSURE_EQ(context, NumElements(indices), NumElements(values));
+      }
+      TF_LITE_ENSURE_EQ(context, NumElements(output_shape), 1);
+      break;
+    }
+    case 2: {
+      TF_LITE_ENSURE_EQ(context, SizeOfDimension(indices, 1),
+                        NumElements(output_shape));
+      if (NumDimensions(values) == 0)
+        TF_LITE_ENSURE_EQ(context, SizeOfDimension(indices, 0),
+                          NumElements(values));
+      break;
+    }
+    default:
+      context->ReportError(
+          context, "Wrong indices dimensions %d, should be less than 3.",
+          NumDimensions(indices));
+      return kTfLiteError;
+  }
+  return kTfLiteOk;
+}
+
+// Convert indices into a vector of 4-d vectors.
+// TODO(renjieliu): Revisit here to improve the performance, since multiple
+// allocations of std::vectors will be quite slow on phones.
+template <typename T>
+TfLiteStatus GetIndicesVector(TfLiteContext* context,
+                              const TfLiteTensor* indices,
+                              const int num_indices,
+                              std::vector<std::vector<T>>* indices_vector) {
+  // Note: because TfLite reverses the dimensions, pad zeros up front.
+  switch (NumDimensions(indices)) {
+    case 0:
+    case 1: {
+      const auto indices_data = GetTensorData<T>(indices);
+      for (int i = 0; i < num_indices; ++i) {
+        std::vector<T> index({0, 0, 0, indices_data[i]});
+        indices_vector->push_back(index);
+      }
+      break;
+    }
+    case 2: {
+      const int true_dimensions = SizeOfDimension(indices, 1);
+      TF_LITE_ENSURE(context, true_dimensions <= kMaxDimensions);
+      for (int i = 0; i < num_indices; ++i) {
+        std::vector<T> index;
+        index.reserve(kMaxDimensions);
+        // Pad the index with kMaxDimensions - true_dimensions leading zeros
+        // to satisfy the need for a 4-dimension index.
+        for (int j = 0; j < kMaxDimensions - true_dimensions; ++j) {
+          index.push_back(0);
+        }
+        for (int j = 0; j < true_dimensions; ++j) {
+          index.push_back(GetTensorData<T>(indices)[i * true_dimensions + j]);
+        }
+
+        indices_vector->push_back(index);
+      }
+      break;
+    }
+    default:
+      context->ReportError(context,
+                           "Indices dimensions problem, got %d dimensions",
+                           NumDimensions(indices));
+      return kTfLiteError;
+  }
+  return kTfLiteOk;
+}
+
+TfLiteStatus ResizeOutputShape(TfLiteContext* context,
+                               const TfLiteTensor* output_shape,
+                               TfLiteTensor* output) {
+  if (output_shape->type == kTfLiteInt32) {
+    return Resize<int32_t>(context, output_shape, output);
+  } else if (output_shape->type == kTfLiteInt64) {
+    return Resize<int64_t>(context, output_shape, output);
+  } else {
+    context->ReportError(context, "Dense shape type %d not supported.",
+                         output_shape->type);
+    return kTfLiteError;
+  }
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 4);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  const TfLiteTensor* indices = GetInput(context, node, kIndicesTensor);
+  const TfLiteTensor* output_shape =
+      GetInput(context, node, kOutputShapeTensor);
+  const TfLiteTensor* values = GetInput(context, node, kValueInputTensor);
+  const TfLiteTensor* default_value =
+      GetInput(context, node, kDefaultValueTensor);
+
+  // TODO(renjieliu): Handle validate_indices.
+
+  // Indices can be 0-D, 1-D or 2-D.
+  TF_LITE_ASSERT(NumDimensions(indices) >= 0);
+  TF_LITE_ENSURE(context, NumDimensions(indices) < 3);
+  TF_LITE_ASSERT(NumDimensions(output_shape) >= 0);
+  TF_LITE_ENSURE_EQ(context, NumDimensions(output_shape), 1);
+  // Values can be 0-D or 1-D.
+  TF_LITE_ASSERT(NumDimensions(values) >= 0);
+  TF_LITE_ENSURE(context, NumDimensions(values) < 2);
+
+  TF_LITE_ENSURE_EQ(context, NumElements(default_value), 1);
+
+  TF_LITE_ENSURE(
+      context, indices->type == kTfLiteInt32 || indices->type == kTfLiteInt64);
+  TF_LITE_ENSURE(context, output_shape->type == kTfLiteInt32 ||
+                              output_shape->type == kTfLiteInt64);
+  TF_LITE_ENSURE_EQ(context, values->type, default_value->type);
+
+  // Ensure dimensions match.
+  TF_LITE_ENSURE_OK(
+      context, CheckDimensionsMatch(context, indices, output_shape, values));
+
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+  TF_LITE_ENSURE_EQ(context, NumDimensions(output_shape), 1);
+
+  if (!IsConstantTensor(output_shape)) {
+    SetTensorToDynamic(output);
+    return kTfLiteOk;
+  }
+  return ResizeOutputShape(context, output_shape, output);
+}
+
+template <typename T, typename I>
+TfLiteStatus SparseToDenseImpl(TfLiteContext* context, TfLiteNode* node) {
+  const TfLiteTensor* indices = GetInput(context, node, kIndicesTensor);
+  const TfLiteTensor* output_shape =
+      GetInput(context, node, kOutputShapeTensor);
+  const TfLiteTensor* values = GetInput(context, node, kValueInputTensor);
+  const TfLiteTensor* default_value =
+      GetInput(context, node, kDefaultValueTensor);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  if (IsDynamicTensor(output)) {
+    TF_LITE_ENSURE_OK(context,
+                      ResizeOutputShape(context, output_shape, output));
+  }
+
+  const int num_indices = SizeOfDimension(indices, 0);
+  const bool value_is_scalar = NumDimensions(values) == 0;
+  std::vector<std::vector<I>> indices_vector;
+  indices_vector.reserve(num_indices);
+  TF_LITE_ENSURE_OK(context, GetIndicesVector<I>(context, indices, num_indices,
+                                                 &indices_vector));
+  reference_ops::SparseToDense(indices_vector, GetTensorData<T>(values),
+                               *GetTensorData<T>(default_value),
+                               GetTensorData<T>(output), GetTensorDims(output),
+                               value_is_scalar);
+  return kTfLiteOk;
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  const TfLiteTensor* indices = GetInput(context, node, kIndicesTensor);
+  const TfLiteTensor* values = GetInput(context, node, kValueInputTensor);
+
+  // Currently only supports float32 and int32.
+  switch (values->type) {
+    case kTfLiteFloat32: {
+      switch (indices->type) {
+        case kTfLiteInt32: {
+          return SparseToDenseImpl<float, int32_t>(context, node);
+        }
+        case kTfLiteInt64: {
+          return SparseToDenseImpl<float, int64_t>(context, node);
+        }
+        default:
+          context->ReportError(
+              context, "Type %d is currently not supported by sparse to dense.",
+              indices->type);
+          return kTfLiteError;
+      }
+      break;
+    }
+    case kTfLiteInt32: {
+      switch (indices->type) {
+        case kTfLiteInt32: {
+          return SparseToDenseImpl<int32_t, int32_t>(context, node);
+        }
+        case kTfLiteInt64: {
+          return SparseToDenseImpl<int32_t, int64_t>(context, node);
+        }
+        default:
+          context->ReportError(
+              context, "Type %d is currently not supported by sparse to dense.",
+              indices->type);
+          return kTfLiteError;
+      }
+      break;
+    }
+    default:
+      context->ReportError(
+          context, "Type %d is currently not supported by sparse to dense.",
+          values->type);
+      return kTfLiteError;
+  }
+}
+
+}  // namespace sparse_to_dense
+
+TfLiteRegistration* Register_SPARSE_TO_DENSE() {
+  static TfLiteRegistration r = {nullptr, nullptr, sparse_to_dense::Prepare,
+                                 sparse_to_dense::Eval};
+  return &r;
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/sparse_to_dense_test.cc b/tensorflow/contrib/lite/kernels/sparse_to_dense_test.cc
new file mode 100644
index 0000000000..a51ec17afc
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/sparse_to_dense_test.cc
@@ -0,0 +1,155 @@
+
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <cstdarg>
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+template <typename T>
+class SparseToDenseOpModel : public SingleOpModel {
+ public:
+  SparseToDenseOpModel(std::initializer_list<int> indices_shape,
+                       std::initializer_list<int> output_shape_shape,
+                       std::initializer_list<int> values_shape, T default_value,
+                       TensorType tensor_index_type,
+                       TensorType tensor_input_type) {
+    indices_ = AddInput(tensor_index_type);
+    output_shape_ = AddInput(TensorType_INT32);
+    values_ = AddInput(tensor_input_type);
+    default_value_ = AddInput(tensor_input_type);
+    output_ = AddOutput(tensor_input_type);
+
+    SetBuiltinOp(BuiltinOperator_SPARSE_TO_DENSE,
+                 BuiltinOptions_SparseToDenseOptions,
+                 CreateSparseToDenseOptions(builder_, false).Union());
+    BuildInterpreter({indices_shape, output_shape_shape, values_shape, {1}});
+
+    PopulateTensor<T>(default_value_, {default_value});
+  }
+
+  int indices() { return indices_; }
+  int output_shape() { return output_shape_; }
+  int values() { return values_; }
+
+  std::vector<T> GetOutput() { return ExtractVector<T>(output_); }
+  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+
+ private:
+  int indices_;
+  int output_shape_;
+  int values_;
+  int default_value_;
+  int output_;
+};
+
+TEST(SparseToDenseOpModelTest, ZeroDimensionTest) {
+  SparseToDenseOpModel<float> m({1}, {1}, {1}, 0, TensorType_INT32,
+                                TensorType_FLOAT32);
+  m.PopulateTensor<int32_t>(m.indices(), {3});
+  m.PopulateTensor<int32_t>(m.output_shape(), {5});
+  m.PopulateTensor<float>(m.values(), {7});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({0, 0, 0, 7, 0}));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({5}));
+}
+
+TEST(SparseToDenseOpModelTest, OneDimensionTest) {
+  SparseToDenseOpModel<float> m({3}, {1}, {3}, 0, TensorType_INT32,
+                                TensorType_FLOAT32);
+  m.PopulateTensor<int32_t>(m.indices(), {1, 3, 5});
+  m.PopulateTensor<int32_t>(m.output_shape(), {7});
+  m.PopulateTensor<float>(m.values(), {2, 4, 6});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({0, 2, 0, 4, 0, 6, 0}));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({7}));
+}
+
+TEST(SparseToDenseOpModelTest, TwoDimensionsTest) {
+  SparseToDenseOpModel<float> m({3, 3}, {3}, {3}, 0, TensorType_INT32,
+                                TensorType_FLOAT32);
+  m.PopulateTensor<int32_t>(m.indices(), {0, 0, 0, 1, 2, 1, 2, 0, 1});
+  m.PopulateTensor<int32_t>(m.output_shape(), {3, 3, 3});
+  m.PopulateTensor<float>(m.values(), {2, 4, 6});
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray({2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                                0, 0, 4, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0}));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 3, 3}));
+}
+
+TEST(SparseToDenseOpModelTest, DefaultValueTest) {
+  SparseToDenseOpModel<float> m({3, 3}, {3}, {3}, -1, TensorType_INT32,
+                                TensorType_FLOAT32);
+  m.PopulateTensor<int32_t>(m.indices(), {0, 0, 0, 1, 2, 1, 2, 0, 1});
+  m.PopulateTensor<int32_t>(m.output_shape(), {3, 3, 3});
+  m.PopulateTensor<float>(m.values(), {2, 4, 6});
+  m.Invoke();
+
+  EXPECT_THAT(
+      m.GetOutput(),
+      ElementsAreArray({2,  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+                        -1, -1, 4,  -1, -1, 6,  -1, -1, -1, -1, -1, -1, -1}));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 3, 3}));
+}
+
+TEST(SparseToDenseOpModelTest, IntegerValueTest) {
+  SparseToDenseOpModel<int32_t> m({3, 3}, {3}, {3}, -1, TensorType_INT32,
+                                  TensorType_INT32);
+  m.PopulateTensor<int32_t>(m.indices(), {0, 0, 0, 1, 2, 1, 2, 0, 1});
+  m.PopulateTensor<int32_t>(m.output_shape(), {3, 3, 3});
+  m.PopulateTensor<int32_t>(m.values(), {2, 4, 6});
+  m.Invoke();
+
+  EXPECT_THAT(
+      m.GetOutput(),
+      ElementsAreArray({2,  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+                        -1, -1, 4,  -1, -1, 6,  -1, -1, -1, -1, -1, -1, -1}));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 3, 3}));
+}
+
+TEST(SparseToDenseOpModelTest, Int64IndexTest) {
+  SparseToDenseOpModel<float> m({3, 3}, {3}, {3}, -1, TensorType_INT64,
+                                TensorType_FLOAT32);
+  m.PopulateTensor<int64_t>(m.indices(), {0, 0, 0, 1, 2, 1, 2, 0, 1});
+  m.PopulateTensor<int32_t>(m.output_shape(), {3, 3, 3});
+  m.PopulateTensor<float>(m.values(), {2, 4, 6});
+  m.Invoke();
+
+  EXPECT_THAT(
+      m.GetOutput(),
+      ElementsAreArray({2,  -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+                        -1, -1, 4,  -1, -1, 6,  -1, -1, -1, -1, -1, -1, -1}));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({3, 3, 3}));
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/model.cc b/tensorflow/contrib/lite/model.cc
index 80fcb28bc7..6ac41a94bd 100644
--- a/tensorflow/contrib/lite/model.cc
+++ b/tensorflow/contrib/lite/model.cc
@@ -699,6 +699,16 @@ TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type,
       *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
+    case BuiltinOperator_SPARSE_TO_DENSE: {
+      TfLiteSparseToDenseParams* params =
+          MallocPOD<TfLiteSparseToDenseParams>();
+      if (auto* sparse_to_dense_params =
+              op->builtin_options_as_SparseToDenseOptions()) {
+        params->validate_indices = sparse_to_dense_params->validate_indices();
+      }
+      *builtin_data = reinterpret_cast<void*>(params);
+      break;
+    }
     case BuiltinOperator_DELEGATE: {
       // TODO(ycling): Revisit when supporting saving delegated models.
       error_reporter->Report("DELEGATE op shouldn't exist in model.");
diff --git a/tensorflow/contrib/lite/nnapi_delegate.cc b/tensorflow/contrib/lite/nnapi_delegate.cc
index eed57d412b..fad08bbfe6 100644
--- a/tensorflow/contrib/lite/nnapi_delegate.cc
+++ b/tensorflow/contrib/lite/nnapi_delegate.cc
@@ -491,6 +491,7 @@ void AddOpsAndParams(tflite::Interpreter* interpreter,
       case tflite::BuiltinOperator_SLICE:
       case tflite::BuiltinOperator_SIN:
       case tflite::BuiltinOperator_TRANSPOSE_CONV:
+      case tflite::BuiltinOperator_SPARSE_TO_DENSE:
         FATAL("Op code %d is currently not delegated to NNAPI", builtin);
         nn_op_type = -1;  // set to invalid
         break;
diff --git a/tensorflow/contrib/lite/schema/schema.fbs b/tensorflow/contrib/lite/schema/schema.fbs
index 8bdeb035f5..522eac25b3 100644
--- a/tensorflow/contrib/lite/schema/schema.fbs
+++ b/tensorflow/contrib/lite/schema/schema.fbs
@@ -145,6 +145,7 @@ enum BuiltinOperator : byte {
   SLICE = 65,
   SIN = 66,
   TRANSPOSE_CONV = 67,
+  SPARSE_TO_DENSE = 68,
 }
 
 // Options for the builtin operators.
@@ -198,6 +199,7 @@ union BuiltinOptions {
   SelectOptions,
   SliceOptions,
   TransposeConvOptions,
+  SparseToDenseOptions,
 }
 
 enum Padding : byte { SAME, VALID }
@@ -450,6 +452,10 @@ table TransposeConvOptions {
   stride_h:int;
 }
 
+table SparseToDenseOptions {
+  validate_indices:bool;
+}
+
 // An OperatorCode can be an enum value (BuiltinOperator) if the operator is a
 // builtin, or a string if the operator is custom.
 table OperatorCode {
diff --git a/tensorflow/contrib/lite/schema/schema_generated.h b/tensorflow/contrib/lite/schema/schema_generated.h
index 35c34f53a6..746dd26796 100755
--- a/tensorflow/contrib/lite/schema/schema_generated.h
+++ b/tensorflow/contrib/lite/schema/schema_generated.h
@@ -178,6 +178,9 @@ struct SliceOptionsT;
 struct TransposeConvOptions;
 struct TransposeConvOptionsT;
 
+struct SparseToDenseOptions;
+struct SparseToDenseOptionsT;
+
 struct OperatorCode;
 struct OperatorCodeT;
 
@@ -305,11 +308,12 @@ enum BuiltinOperator {
   BuiltinOperator_SLICE = 65,
   BuiltinOperator_SIN = 66,
   BuiltinOperator_TRANSPOSE_CONV = 67,
+  BuiltinOperator_SPARSE_TO_DENSE = 68,
   BuiltinOperator_MIN = BuiltinOperator_ADD,
-  BuiltinOperator_MAX = BuiltinOperator_TRANSPOSE_CONV
+  BuiltinOperator_MAX = BuiltinOperator_SPARSE_TO_DENSE
 };
 
-inline BuiltinOperator (&EnumValuesBuiltinOperator())[67] {
+inline BuiltinOperator (&EnumValuesBuiltinOperator())[68] {
   static BuiltinOperator values[] = {
     BuiltinOperator_ADD,
     BuiltinOperator_AVERAGE_POOL_2D,
@@ -377,7 +381,8 @@ inline BuiltinOperator (&EnumValuesBuiltinOperator())[67] {
     BuiltinOperator_SELECT,
     BuiltinOperator_SLICE,
     BuiltinOperator_SIN,
-    BuiltinOperator_TRANSPOSE_CONV
+    BuiltinOperator_TRANSPOSE_CONV,
+    BuiltinOperator_SPARSE_TO_DENSE
   };
   return values;
 }
@@ -452,6 +457,7 @@ inline const char **EnumNamesBuiltinOperator() {
     "SLICE",
     "SIN",
     "TRANSPOSE_CONV",
+    "SPARSE_TO_DENSE",
     nullptr
   };
   return names;
@@ -513,11 +519,12 @@ enum BuiltinOptions {
   BuiltinOptions_SelectOptions = 47,
   BuiltinOptions_SliceOptions = 48,
   BuiltinOptions_TransposeConvOptions = 49,
+  BuiltinOptions_SparseToDenseOptions = 50,
   BuiltinOptions_MIN = BuiltinOptions_NONE,
-  BuiltinOptions_MAX = BuiltinOptions_TransposeConvOptions
+  BuiltinOptions_MAX = BuiltinOptions_SparseToDenseOptions
 };
 
-inline BuiltinOptions (&EnumValuesBuiltinOptions())[50] {
+inline BuiltinOptions (&EnumValuesBuiltinOptions())[51] {
   static BuiltinOptions values[] = {
     BuiltinOptions_NONE,
     BuiltinOptions_Conv2DOptions,
@@ -568,7 +575,8 @@ inline BuiltinOptions (&EnumValuesBuiltinOptions())[50] {
     BuiltinOptions_LessEqualOptions,
     BuiltinOptions_SelectOptions,
     BuiltinOptions_SliceOptions,
-    BuiltinOptions_TransposeConvOptions
+    BuiltinOptions_TransposeConvOptions,
+    BuiltinOptions_SparseToDenseOptions
   };
   return values;
 }
@@ -625,6 +633,7 @@ inline const char **EnumNamesBuiltinOptions() {
     "SelectOptions",
     "SliceOptions",
     "TransposeConvOptions",
+    "SparseToDenseOptions",
     nullptr
   };
   return names;
@@ -835,6 +844,10 @@ template<> struct BuiltinOptionsTraits<TransposeConvOptions> {
   static const BuiltinOptions enum_value = BuiltinOptions_TransposeConvOptions;
 };
 
+template<> struct BuiltinOptionsTraits<SparseToDenseOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_SparseToDenseOptions;
+};
+
 struct BuiltinOptionsUnion {
   BuiltinOptions type;
   void *value;
@@ -1258,6 +1271,14 @@ struct BuiltinOptionsUnion {
     return type == BuiltinOptions_TransposeConvOptions ?
       reinterpret_cast<const TransposeConvOptionsT *>(value) : nullptr;
   }
+  SparseToDenseOptionsT *AsSparseToDenseOptions() {
+    return type == BuiltinOptions_SparseToDenseOptions ?
+      reinterpret_cast<SparseToDenseOptionsT *>(value) : nullptr;
+  }
+  const SparseToDenseOptionsT *AsSparseToDenseOptions() const {
+    return type == BuiltinOptions_SparseToDenseOptions ?
+      reinterpret_cast<const SparseToDenseOptionsT *>(value) : nullptr;
+  }
 };
 
 bool VerifyBuiltinOptions(flatbuffers::Verifier &verifier, const void *obj, BuiltinOptions type);
@@ -4543,6 +4564,60 @@ inline flatbuffers::Offset<TransposeConvOptions> CreateTransposeConvOptions(
 
 flatbuffers::Offset<TransposeConvOptions> CreateTransposeConvOptions(flatbuffers::FlatBufferBuilder &_fbb, const TransposeConvOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
+struct SparseToDenseOptionsT : public flatbuffers::NativeTable {
+  typedef SparseToDenseOptions TableType;
+  bool validate_indices;
+  SparseToDenseOptionsT()
+      : validate_indices(false) {
+  }
+};
+
+struct SparseToDenseOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef SparseToDenseOptionsT NativeTableType;
+  enum {
+    VT_VALIDATE_INDICES = 4
+  };
+  bool validate_indices() const {
+    return GetField<uint8_t>(VT_VALIDATE_INDICES, 0) != 0;
+  }
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           VerifyField<uint8_t>(verifier, VT_VALIDATE_INDICES) &&
+           verifier.EndTable();
+  }
+  SparseToDenseOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(SparseToDenseOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<SparseToDenseOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const SparseToDenseOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct SparseToDenseOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  void add_validate_indices(bool validate_indices) {
+    fbb_.AddElement<uint8_t>(SparseToDenseOptions::VT_VALIDATE_INDICES, static_cast<uint8_t>(validate_indices), 0);
+  }
+  explicit SparseToDenseOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+        : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  SparseToDenseOptionsBuilder &operator=(const SparseToDenseOptionsBuilder &);
+  flatbuffers::Offset<SparseToDenseOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<SparseToDenseOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<SparseToDenseOptions> CreateSparseToDenseOptions(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    bool validate_indices = false) {
+  SparseToDenseOptionsBuilder builder_(_fbb);
+  builder_.add_validate_indices(validate_indices);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<SparseToDenseOptions> CreateSparseToDenseOptions(flatbuffers::FlatBufferBuilder &_fbb, const SparseToDenseOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
 struct OperatorCodeT : public flatbuffers::NativeTable {
   typedef OperatorCode TableType;
   BuiltinOperator builtin_code;
@@ -4821,6 +4896,9 @@ struct Operator FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
   const TransposeConvOptions *builtin_options_as_TransposeConvOptions() const {
     return builtin_options_type() == BuiltinOptions_TransposeConvOptions ? static_cast<const TransposeConvOptions *>(builtin_options()) : nullptr;
   }
+  const SparseToDenseOptions *builtin_options_as_SparseToDenseOptions() const {
+    return builtin_options_type() == BuiltinOptions_SparseToDenseOptions ? static_cast<const SparseToDenseOptions *>(builtin_options()) : nullptr;
+  }
   const flatbuffers::Vector<uint8_t> *custom_options() const {
     return GetPointer<const flatbuffers::Vector<uint8_t> *>(VT_CUSTOM_OPTIONS);
   }
@@ -5043,6 +5121,10 @@ template<> inline const TransposeConvOptions *Operator::builtin_options_as<Trans
   return builtin_options_as_TransposeConvOptions();
 }
 
+template<> inline const SparseToDenseOptions *Operator::builtin_options_as<SparseToDenseOptions>() const {
+  return builtin_options_as_SparseToDenseOptions();
+}
+
 struct OperatorBuilder {
   flatbuffers::FlatBufferBuilder &fbb_;
   flatbuffers::uoffset_t start_;
@@ -6862,6 +6944,32 @@ inline flatbuffers::Offset<TransposeConvOptions> CreateTransposeConvOptions(flat
       _stride_h);
 }
 
+inline SparseToDenseOptionsT *SparseToDenseOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new SparseToDenseOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void SparseToDenseOptions::UnPackTo(SparseToDenseOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+  { auto _e = validate_indices(); _o->validate_indices = _e; };
+}
+
+inline flatbuffers::Offset<SparseToDenseOptions> SparseToDenseOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const SparseToDenseOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateSparseToDenseOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<SparseToDenseOptions> CreateSparseToDenseOptions(flatbuffers::FlatBufferBuilder &_fbb, const SparseToDenseOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const SparseToDenseOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  auto _validate_indices = _o->validate_indices;
+  return tflite::CreateSparseToDenseOptions(
+      _fbb,
+      _validate_indices);
+}
+
 inline OperatorCodeT *OperatorCode::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
   auto _o = new OperatorCodeT();
   UnPackTo(_o, _resolver);
@@ -7244,6 +7352,10 @@ inline bool VerifyBuiltinOptions(flatbuffers::Verifier &verifier, const void *ob
       auto ptr = reinterpret_cast<const TransposeConvOptions *>(obj);
       return verifier.VerifyTable(ptr);
     }
+    case BuiltinOptions_SparseToDenseOptions: {
+      auto ptr = reinterpret_cast<const SparseToDenseOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
     default: return false;
   }
 }
@@ -7458,6 +7570,10 @@ inline void *BuiltinOptionsUnion::UnPack(const void *obj, BuiltinOptions type, c
       auto ptr = reinterpret_cast<const TransposeConvOptions *>(obj);
       return ptr->UnPack(resolver);
     }
+    case BuiltinOptions_SparseToDenseOptions: {
+      auto ptr = reinterpret_cast<const SparseToDenseOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
     default: return nullptr;
   }
 }
@@ -7660,6 +7776,10 @@ inline flatbuffers::Offset<void> BuiltinOptionsUnion::Pack(flatbuffers::FlatBuff
       auto ptr = reinterpret_cast<const TransposeConvOptionsT *>(value);
       return CreateTransposeConvOptions(_fbb, ptr, _rehasher).Union();
     }
+    case BuiltinOptions_SparseToDenseOptions: {
+      auto ptr = reinterpret_cast<const SparseToDenseOptionsT *>(value);
+      return CreateSparseToDenseOptions(_fbb, ptr, _rehasher).Union();
+    }
     default: return 0;
   }
 }
@@ -7862,6 +7982,10 @@ inline BuiltinOptionsUnion::BuiltinOptionsUnion(const BuiltinOptionsUnion &u) FL
       value = new TransposeConvOptionsT(*reinterpret_cast<TransposeConvOptionsT *>(u.value));
       break;
     }
+    case BuiltinOptions_SparseToDenseOptions: {
+      value = new SparseToDenseOptionsT(*reinterpret_cast<SparseToDenseOptionsT *>(u.value));
+      break;
+    }
     default:
       break;
   }
@@ -8114,6 +8238,11 @@ inline void BuiltinOptionsUnion::Reset() {
       delete ptr;
       break;
     }
+    case BuiltinOptions_SparseToDenseOptions: {
+      auto ptr = reinterpret_cast<SparseToDenseOptionsT *>(value);
+      delete ptr;
+      break;
+    }
     default: break;
   }
   value = nullptr;
diff --git a/tensorflow/contrib/lite/testing/generate_examples.py b/tensorflow/contrib/lite/testing/generate_examples.py
index 13fafebd1d..ae66bd858b 100644
--- a/tensorflow/contrib/lite/testing/generate_examples.py
+++ b/tensorflow/contrib/lite/testing/generate_examples.py
@@ -146,8 +146,9 @@ def toco_options(data_types,
        " --inference_type=%s" % inference_type +
        " --input_format=TENSORFLOW_GRAPHDEF" + " --output_format=TFLITE" +
        " --input_arrays=%s" % ",".join(input_arrays) +
-       " --input_shapes=%s" % shape_str +
        " --output_arrays=%s" % ",".join(output_arrays))
+  if shape_str:
+    s += (" --input_shapes=%s" % shape_str)
   if extra_toco_options.drop_control_dependency:
     s += " --drop_control_dependency"
   if extra_toco_options.allow_custom_ops:
@@ -238,6 +239,19 @@ def create_tensor_data(dtype, shape, min_value=-100, max_value=100):
   return value.astype(dtype)
 
 
+def create_scalar_data(dtype, min_value=-100, max_value=100):
+  """Build a random scalar tensor value between min_value and max_value."""
+
+  if dtype in _TF_TYPE_INFO:
+    dtype = _TF_TYPE_INFO[dtype][0]
+
+  if dtype in (tf.float32, tf.float16):
+    value = (max_value - min_value) * np.random.random() + min_value
+  elif dtype in (tf.int32, tf.uint8, tf.int64):
+    value = np.random.randint(min_value, max_value + 1)
+  return np.array(value, dtype=dtype)
+
+
 def freeze_graph(session, outputs):
   """Freeze the current graph.
 
@@ -2485,6 +2499,67 @@ def make_transpose_conv_tests(zip_path):
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
 
+def make_sparse_to_dense_tests(zip_path):
+  """Make a set of tests to do sparse to dense."""
+
+  test_parameters = [{
+      "value_dtype": [tf.float32, tf.int32],
+      "index_dtype": [tf.int32, tf.int64],
+      "value_count": [1, 3, 6, 8],
+      "dense_shape": [[15], [3, 10], [4, 4, 4, 4], [7, 10, 9]],
+      "default_value": [0, -1],
+      "value_is_scalar": [True, False],
+  }]
+
+  # Return a single value for 1-D dense shape, but a tuple for other shapes.
+  def generate_index(dense_shape):
+    if len(dense_shape) == 1:
+      return np.random.randint(dense_shape[0])
+    else:
+      index = []
+      for shape in dense_shape:
+        index.append(np.random.randint(shape))
+      return tuple(index)
+
+  def build_graph(parameters):
+    """Build the sparse_to_dense op testing graph."""
+    dense_shape = parameters["dense_shape"]
+
+    # Special handling for the value_is_scalar case: the value is only fed as
+    # a scalar when value_count is 1.
+    if parameters["value_is_scalar"] and parameters["value_count"] == 1:
+      value = tf.placeholder(
+          name="value", dtype=parameters["value_dtype"], shape=())
+    else:
+      value = tf.placeholder(
+          name="value",
+          dtype=parameters["value_dtype"],
+          shape=[parameters["value_count"]])
+    indices = set()
+    while len(indices) < parameters["value_count"]:
+      indices.add(generate_index(dense_shape))
+    indices = tf.constant(tuple(indices), dtype=parameters["index_dtype"])
+    # TODO(renjieliu): Add test for validate_indices case.
+    out = tf.sparse_to_dense(
+        indices,
+        dense_shape,
+        value,
+        parameters["default_value"],
+        validate_indices=False)
+
+    return [value], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    if parameters["value_is_scalar"] and parameters["value_count"] == 1:
+      input_value = create_scalar_data(parameters["value_dtype"])
+    else:
+      input_value = create_tensor_data(parameters["value_dtype"],
+                                       [parameters["value_count"]])
+    return [input_value], sess.run(
+        outputs, feed_dict=dict(zip(inputs, [input_value])))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
 # Toco binary path provided by the generate rule.
 bin_path = None
 
diff --git a/tensorflow/contrib/lite/toco/export_tensorflow.cc b/tensorflow/contrib/lite/toco/export_tensorflow.cc
index f5157149af..99f0c81a1b 100644
--- a/tensorflow/contrib/lite/toco/export_tensorflow.cc
+++ b/tensorflow/contrib/lite/toco/export_tensorflow.cc
@@ -1728,6 +1728,25 @@ void ConvertComparisonOperator(const Model& model, const Operator& src_op,
   (*comparison_op->mutable_attr())["T"].set_type(data_type);
 }
 
+// Exports src_op as a TensorFlow node whose op type is op_name.
+void ConvertSparseToDenseOperator(const Model& model,
+                                  const SparseToDenseOperator& src_op,
+                                  const char* op_name,
+                                  GraphDef* tensorflow_graph) {
+  auto* sparse_to_dense_op = tensorflow_graph->add_node();
+  sparse_to_dense_op->set_op(op_name);
+  sparse_to_dense_op->set_name(src_op.outputs[0]);
+  CHECK_EQ(src_op.inputs.size(), 4);
+  for (int i = 0; i < 4; ++i) {
+    *sparse_to_dense_op->add_input() = src_op.inputs[i];
+  }
+  auto& attr = *sparse_to_dense_op->mutable_attr();
+  attr["T"].set_type(GetTensorFlowDataType(model, src_op.inputs[3]));
+  attr["Tindices"].set_type(GetTensorFlowDataType(model, src_op.inputs[0]));
+  // validate_indices is its own attr; it must not overwrite Tindices.
+  attr["validate_indices"].set_b(src_op.validate_indices);
+}
+
 void ConvertOperator(const Model& model, const Operator& src_op,
                      GraphDef* tensorflow_graph) {
   if (src_op.fused_activation_function != FusedActivationFunctionType::kNone) {
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/propagate_array_data_types.cc b/tensorflow/contrib/lite/toco/graph_transformations/propagate_array_data_types.cc
index 6342cf3e8a..64096fb069 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/propagate_array_data_types.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/propagate_array_data_types.cc
@@ -163,6 +163,16 @@ bool PropagateArrayDataTypes::Run(Model* model, std::size_t op_index) {
       SetDataTypeForAllOutputs(model, op, data_type_x);
       break;
     }
+    case OperatorType::kSparseToDense: {
+      // SparseToDense produces outputs with the same type as its 3rd input
+      CHECK_EQ(op->inputs.size(), 4);
+      const ArrayDataType data_type = model->GetArray(op->inputs[2]).data_type;
+      const ArrayDataType data_type_default =
+          model->GetArray(op->inputs[3]).data_type;
+      CHECK(data_type == data_type_default);
+      SetDataTypeForAllOutputs(model, op, data_type);
+      break;
+    }
     default: {
       // These operators produce outputs with the same type as their 1st input
       CHECK_GT(op->inputs.size(), 0);
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
index 9d1d27f3ef..adb241da32 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
@@ -1477,6 +1477,34 @@ void ProcessArgMaxOperator(Model* model, ArgMaxOperator* op) {
   *output_array.mutable_shape()->mutable_dims() = output_dims;
 }
 
+// Sets the output shape from the constant output_shape input (inputs[1]).
+void ProcessSparseToDenseOperator(Model* model, SparseToDenseOperator* op) {
+  CHECK_EQ(op->inputs.size(), 4);
+  const Array& output_shape_array = model->GetArray(op->inputs[1]);
+  // Yield until output_shape is a constant: GetBuffer() below needs its data.
+  if (!output_shape_array.has_shape() || !output_shape_array.buffer) return;
+  CHECK_EQ(output_shape_array.shape().dimensions_count(), 1);
+
+  // Output should not go over four dimensions.
+  CHECK_LE(output_shape_array.shape().dims(0), 4);
+
+  const string& output_name = op->outputs[0];
+  Array& output_array = model->GetArray(output_name);
+  if (output_array.has_shape()) return;
+  CHECK(output_shape_array.data_type == ArrayDataType::kInt32 ||
+        output_shape_array.data_type == ArrayDataType::kInt64);
+  if (output_shape_array.data_type == ArrayDataType::kInt32) {
+    *output_array.mutable_shape()->mutable_dims() =
+        output_shape_array.GetBuffer<ArrayDataType::kInt32>().data;
+  } else {
+    const std::vector<int64>& output_shape_data =
+        output_shape_array.GetBuffer<ArrayDataType::kInt64>().data;
+    std::copy(
+        output_shape_data.begin(), output_shape_data.end(),
+        std::back_inserter(*output_array.mutable_shape()->mutable_dims()));
+  }
+}
+
 }  // namespace
 
 bool PropagateFixedSizes::Run(Model* model, std::size_t op_index) {
@@ -1700,6 +1728,10 @@ bool PropagateFixedSizes::Run(Model* model, std::size_t op_index) {
       CHECK_EQ(op->inputs.size(), 1);
       ProcessOpWithShapeInput(model, op);
       break;
+    case OperatorType::kSparseToDense:
+      ProcessSparseToDenseOperator(model,
+                                   static_cast<SparseToDenseOperator*>(op));
+      break;
     default:
       // Unimplemented, another graph transformation should drop it.
       LOG(FATAL) << "Unhandled operator type " << OperatorTypeName(op->type);
diff --git a/tensorflow/contrib/lite/toco/import_tensorflow.cc b/tensorflow/contrib/lite/toco/import_tensorflow.cc
index 27e9d1af88..94ec7c24d4 100644
--- a/tensorflow/contrib/lite/toco/import_tensorflow.cc
+++ b/tensorflow/contrib/lite/toco/import_tensorflow.cc
@@ -2133,6 +2133,24 @@ void ConvertDynamicStitchOperator(const NodeDef& node,
   model->operators.emplace_back(op.release());
 }
 
+void ConvertSparseToDenseOperator(const NodeDef& node,
+                                  const TensorFlowImportFlags& tf_import_flags,
+                                  Model* model) {
+  CHECK_EQ(node.op(), "SparseToDense");
+  CheckInputsCount(node, tf_import_flags, 4);
+
+  auto* op = new SparseToDenseOperator;
+  for (const string& input : node.input()) {
+    op->inputs.push_back(input);
+  }
+  op->outputs.push_back(node.name());
+
+  op->validate_indices = HasAttr(node, "validate_indices")
+                             ? GetBoolAttr(node, "validate_indices")
+                             : true;
+  model->operators.emplace_back(op);
+}
+
 }  // namespace
 
 namespace internal {
@@ -2314,6 +2332,8 @@ Status ImportTensorFlowNode(const tensorflow::NodeDef& node,
     ConvertSinOperator(node, tf_import_flags, model);
   } else if (node.op() == "Select") {
     ConvertSelectOperator(node, tf_import_flags, model);
+  } else if (node.op() == "SparseToDense") {
+    ConvertSparseToDenseOperator(node, tf_import_flags, model);
   } else {
     ConvertUnsupportedOperator(node, tf_import_flags, model);
   }
diff --git a/tensorflow/contrib/lite/toco/model.h b/tensorflow/contrib/lite/toco/model.h
index d878ac54e4..9062c03c73 100644
--- a/tensorflow/contrib/lite/toco/model.h
+++ b/tensorflow/contrib/lite/toco/model.h
@@ -135,6 +135,7 @@ enum class OperatorType {
   // special nodes in the graph to shuffle axes.
   kReorderAxes,
   kSelect,
+  kSparseToDense,
 };
 
 // Helper to deal with TensorFlow arrays using a different ordering of
@@ -1598,6 +1599,19 @@ struct DynamicStitchOperator : Operator {
   int num_partitions;
 };
 
+// SparseToDense operator:
+//
+// Inputs[0]: required: sparse_indices.
+// Inputs[1]: required: output_shape.
+// Inputs[2]: required: sparse_values.
+// Inputs[3]: required: default_value.
+//
+// TensorFlow equivalent: SparseToDense.
+struct SparseToDenseOperator : Operator {
+  SparseToDenseOperator() : Operator(OperatorType::kSparseToDense) {}
+  bool validate_indices;
+};
+
 // Alloc's are used for transient arrays only. An Alloc specifies which interval
 // of the "transient_data" workspace buffer passed to inference functions, is to
 // be used for the transient array at hand. The 'start' and 'end' values are
diff --git a/tensorflow/contrib/lite/toco/tflite/operator.cc b/tensorflow/contrib/lite/toco/tflite/operator.cc
index 6922e5055a..8f0f2e24db 100644
--- a/tensorflow/contrib/lite/toco/tflite/operator.cc
+++ b/tensorflow/contrib/lite/toco/tflite/operator.cc
@@ -794,6 +794,27 @@ class TransposeConv
   int GetVersion(const Operator& op) const override { return 1; }
 };
 
+class SparseToDense
+    : public BuiltinOperator<SparseToDenseOperator,
+                             ::tflite::SparseToDenseOptions,
+                             ::tflite::BuiltinOptions_SparseToDenseOptions> {
+ public:
+  using BuiltinOperator::BuiltinOperator;
+
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    return ::tflite::CreateSparseToDenseOptions(*builder, op.validate_indices);
+  }
+
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {
+    op->validate_indices = options.validate_indices();
+  }
+
+  int GetVersion(const Operator& op) const override { return 1; }
+};
+
 class TensorFlowUnsupported : public BaseOperator {
  public:
   using BaseOperator::BaseOperator;
@@ -978,6 +999,8 @@ std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList() {
       new ArgMax(::tflite::BuiltinOperator_ARG_MAX, OperatorType::kArgMax));
   ops.emplace_back(new TransposeConv(::tflite::BuiltinOperator_TRANSPOSE_CONV,
                                      OperatorType::kTransposeConv));
+  ops.emplace_back(new SparseToDense(::tflite::BuiltinOperator_SPARSE_TO_DENSE,
+                                     OperatorType::kSparseToDense));
 
   // Custom Operators.
   ops.emplace_back(
diff --git a/tensorflow/contrib/lite/toco/tflite/operator_test.cc b/tensorflow/contrib/lite/toco/tflite/operator_test.cc
index fe594c6da9..d63c99a5f9 100644
--- a/tensorflow/contrib/lite/toco/tflite/operator_test.cc
+++ b/tensorflow/contrib/lite/toco/tflite/operator_test.cc
@@ -420,6 +420,15 @@ TEST_F(OperatorTest, BuiltinTransposeConv) {
   EXPECT_EQ(op.padding.type, output_toco_op->padding.type);
 }
 
+TEST_F(OperatorTest, BuiltinSparseToDense) {
+  SparseToDenseOperator op;
+  op.validate_indices = false;
+  std::unique_ptr<toco::SparseToDenseOperator> output_toco_op =
+      SerializeAndDeserialize(
+          GetOperator("SPARSE_TO_DENSE", OperatorType::kSparseToDense), op);
+  EXPECT_EQ(op.validate_indices, output_toco_op->validate_indices);
+}
+
 TEST_F(OperatorTest, TensorFlowUnsupported) {
   TensorFlowUnsupportedOperator op;
   op.tensorflow_op = "MyCustomUnsupportedOp";
diff --git a/tensorflow/contrib/lite/toco/tooling_util.cc b/tensorflow/contrib/lite/toco/tooling_util.cc
index 1e6314f2dc..fe7bed885d 100644
--- a/tensorflow/contrib/lite/toco/tooling_util.cc
+++ b/tensorflow/contrib/lite/toco/tooling_util.cc
@@ -393,6 +393,7 @@ const char* OperatorTypeName(OperatorType type) {
     HANDLE_OPERATORTYPENAME_CASE(DynamicPartition)
     HANDLE_OPERATORTYPENAME_CASE(DynamicStitch)
     HANDLE_OPERATORTYPENAME_CASE(Select)
+    HANDLE_OPERATORTYPENAME_CASE(SparseToDense)
     default:
       LOG(FATAL) << "Unhandled op type";
 #undef HANDLE_OPERATORTYPENAME_CASE
-- 
GitLab


From 582f2e61c7219bfbbec21ce087bee9fde26bce7c Mon Sep 17 00:00:00 2001
From: Jiri Simsa <jsimsa@google.com>
Date: Thu, 31 May 2018 06:57:55 -0700
Subject: [PATCH 096/610] [tf.data] Scaling down the `batch_dataset_op_test`.

PiperOrigin-RevId: 198715407
---
 tensorflow/contrib/data/python/kernel_tests/BUILD         | 2 +-
 .../data/python/kernel_tests/batch_dataset_op_test.py     | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/tensorflow/contrib/data/python/kernel_tests/BUILD b/tensorflow/contrib/data/python/kernel_tests/BUILD
index 285c77dea9..c483a43769 100644
--- a/tensorflow/contrib/data/python/kernel_tests/BUILD
+++ b/tensorflow/contrib/data/python/kernel_tests/BUILD
@@ -8,7 +8,7 @@ load("//tensorflow:tensorflow.bzl", "cuda_py_test", "py_test", "tf_py_test")
 
 py_test(
     name = "batch_dataset_op_test",
-    size = "large",
+    size = "medium",
     srcs = ["batch_dataset_op_test.py"],
     srcs_version = "PY2AND3",
     tags = [
diff --git a/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py
index e309d611e1..b5fbc45ad3 100644
--- a/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/batch_dataset_op_test.py
@@ -553,14 +553,14 @@ class BatchDatasetTest(test.TestCase):
         sess.run(next_element)
 
   def testMapAndBatchParallelGetNext(self):
-    iterator = (dataset_ops.Dataset.range(500000)
+    iterator = (dataset_ops.Dataset.range(50000)
                 .apply(batching.map_and_batch(lambda x: x, batch_size=100))
                 .make_one_shot_iterator())
     elements = []
     for _ in range(100):
       elements.append(iterator.get_next())
     with self.test_session() as sess:
-      for i in range(50):
+      for i in range(5):
         got = sess.run(elements)
         got.sort(key=lambda x: x[0])
         expected = []
@@ -572,7 +572,7 @@ class BatchDatasetTest(test.TestCase):
 
   def testMapAndBatchParallelGetNextDropRemainder(self):
     iterator = (
-        dataset_ops.Dataset.range(499999).apply(
+        dataset_ops.Dataset.range(49999).apply(
             batching.map_and_batch(
                 lambda x: x, batch_size=100, drop_remainder=True))
         .make_one_shot_iterator())
@@ -580,7 +580,7 @@ class BatchDatasetTest(test.TestCase):
     for _ in range(100):
       elements.append(iterator.get_next())
     with self.test_session() as sess:
-      for i in range(49):
+      for i in range(4):
         got = sess.run(elements)
         got.sort(key=lambda x: x[0])
         expected = []
-- 
GitLab


From a951093889128db4acf4ed80a286ebb2de813241 Mon Sep 17 00:00:00 2001
From: Skye Wanderman-Milne <skyewm@google.com>
Date: Thu, 31 May 2018 07:56:56 -0700
Subject: [PATCH 097/610] Make GraphConstructor create nodes in the same order
 as the GraphDef.

While technically the order of the created nodes doesn't matter, this
makes viewing and debugging graphs more sensible.

Fixes #19594.

PiperOrigin-RevId: 198721173
---
 .../jit/encapsulate_subgraphs_pass_test.cc    |  8 ++---
 .../contrib/tensorrt/segment/segment_test.cc  |  4 +--
 .../core/common_runtime/function_test.cc      |  2 +-
 tensorflow/core/graph/algorithm_test.cc       |  4 +--
 tensorflow/core/graph/graph_constructor.cc    | 15 +++++----
 tensorflow/core/graph/graph_partition_test.cc | 16 +++++-----
 tensorflow/core/graph/optimizer_cse_test.cc   | 32 +++++++++----------
 7 files changed, 41 insertions(+), 40 deletions(-)

diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc b/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc
index 5ec24d39a2..eef113a354 100644
--- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc
+++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass_test.cc
@@ -1050,7 +1050,7 @@ TEST(EncapsulateSubgraphsTest, OneFunctionTwoOutside) {
                          .WithAttr("_outside", "O1"));
     Node* recv2 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O2",
                              {DT_FLOAT, DT_FLOAT}, shape2.opts());
-    Node* h = Binary(ops::NodeOut(recv2, 0), e,
+    Node* h = Binary(ops::NodeOut(recv2, 1), e,
                      shape2.opts()
                          .WithName("H")
                          .WithAttr("_encapsulate", "F1")
@@ -1075,7 +1075,7 @@ TEST(EncapsulateSubgraphsTest, OneFunctionTwoOutside) {
            {"outside_compilation_O1_host_compute"}},
           {{"outside_compilation_O2_host_compute"},
            "XlaHostCompute",
-           {"D:o:0", "F:o:0"},
+           {"F:o:0", "D:o:0"},
            {{"Tinputs", gtl::ArraySlice<DataType>({DT_FLOAT, DT_FLOAT})},
             {"Toutputs", gtl::ArraySlice<DataType>({DT_FLOAT})},
             {"ancestors",
@@ -1123,13 +1123,13 @@ TEST(EncapsulateSubgraphsTest, OneFunctionTwoOutside) {
 
     Node* recv2 = RecvAtHost(ops::NodeOut(key_constant, 0), "F1", "O2",
                              {DT_FLOAT, DT_FLOAT}, b2.opts());
-    Node* g = Binary(e, ops::NodeOut(recv2, 1),
+    Node* g = Binary(e, ops::NodeOut(recv2, 0),
                      b2.opts()
                          .WithName("G")
                          .WithControlInputs({recv2, e})
                          .WithAttr("_encapsulate", "F1")
                          .WithAttr("_outside", "O2"));
-    Node* h = Binary(ops::NodeOut(recv2, 0), e,
+    Node* h = Binary(ops::NodeOut(recv2, 1), e,
                      b2.opts()
                          .WithName("H")
                          .WithAttr("_encapsulate", "F1")
diff --git a/tensorflow/contrib/tensorrt/segment/segment_test.cc b/tensorflow/contrib/tensorrt/segment/segment_test.cc
index 2de3923b06..f5b2d258d7 100644
--- a/tensorflow/contrib/tensorrt/segment/segment_test.cc
+++ b/tensorflow/contrib/tensorrt/segment/segment_test.cc
@@ -275,13 +275,13 @@ TEST_F(SegmentTest, Multiple) {
   // Expect two subgraphs
   EXPECT_EQ(segments.size(), 2);
 
-  std::vector<string> expected0{"add0", "add1", "add2", "add3"};
+  std::vector<string> expected0{"add6", "add8"};
   for (const auto& ex : expected0) {
     EXPECT_TRUE(segments[0].first.find(ex) != segments[0].first.end())
         << "Missing expected node " << ex;
   }
 
-  std::vector<string> expected1{"add6", "add8"};
+  std::vector<string> expected1{"add0", "add1", "add2", "add3"};
   for (const auto& ex : expected1) {
     EXPECT_TRUE(segments[1].first.find(ex) != segments[1].first.end())
         << "Missing expected node " << ex;
diff --git a/tensorflow/core/common_runtime/function_test.cc b/tensorflow/core/common_runtime/function_test.cc
index 61b2f0e60f..f4f5198396 100644
--- a/tensorflow/core/common_runtime/function_test.cc
+++ b/tensorflow/core/common_runtime/function_test.cc
@@ -845,7 +845,7 @@ TEST_F(FunctionLibraryRuntimeTest, ManySwapsNodeDef) {
   ASSERT_TRUE(g != nullptr);
   OptimizeGraph(flr0_, &g);
   const char* e0 = R"P(
-(n3:float, n2:float) -> (n3:float) {
+(n2:float, n3:float) -> (n2:float) {
 }
 )P";
   EXPECT_EQ(e0, DebugString(g.get()));
diff --git a/tensorflow/core/graph/algorithm_test.cc b/tensorflow/core/graph/algorithm_test.cc
index 99ced0c0f5..f67d5a2fd2 100644
--- a/tensorflow/core/graph/algorithm_test.cc
+++ b/tensorflow/core/graph/algorithm_test.cc
@@ -144,8 +144,8 @@ TEST(AlgorithmTest, ReversePostOrderStable) {
     std::vector<Node*> order;
 
     // Test reverse post order generates expected ordering.
-    GetReversePostOrder(g, &order, /*stable_comparator=*/NodeComparatorID());
-    EXPECT_TRUE(ExpectBefore({{"t3", "t2"}}, order, &error));
+    GetReversePostOrder(g, &order, /*stable_comparator=*/NodeComparatorName());
+    EXPECT_TRUE(ExpectBefore({{"t2", "t3"}}, order, &error));
   }
 }
 }  // namespace
diff --git a/tensorflow/core/graph/graph_constructor.cc b/tensorflow/core/graph/graph_constructor.cc
index 2fd32c0bd4..0967492d92 100644
--- a/tensorflow/core/graph/graph_constructor.cc
+++ b/tensorflow/core/graph/graph_constructor.cc
@@ -278,8 +278,9 @@ class GraphConstructor {
   // name, the value is the new unique name.
   std::unordered_map<string, string> uniquified_names_;
 
-  // Index of NodeDefs in node_defs_ with all inputs already converted.
-  std::vector<int> ready_;
+  // Index of NodeDefs in node_defs_ with all inputs already converted. We use a
+  // (sorted) set so nodes are created in the order defined in the GraphDef.
+  std::set<int> ready_;
 
   // Mapping between index within node_defs_ and the number of inputs that
   // still need to be converted.
@@ -520,7 +521,7 @@ Status GraphConstructor::InitFromEdges() {
       }
     }
     if (pending_count == 0) {
-      ready_.push_back(n);
+      ready_.insert(n);
     }
     pending_count_.push_back(pending_count);
   }
@@ -884,12 +885,12 @@ namespace {
 
 void UpdatePendingCountAndReady(
     const std::vector<gtl::InlinedVector<int, 4>>& outputs, int o,
-    std::vector<int>* pending_count, std::vector<int>* ready) {
+    std::vector<int>* pending_count, std::set<int>* ready) {
   for (size_t i = 0; i < outputs[o].size(); ++i) {
     const int output = outputs[o][i];
     (*pending_count)[output]--;
     if ((*pending_count)[output] == 0) {
-      ready->push_back(output);
+      ready->insert(output);
     }
   }
 }
@@ -913,8 +914,8 @@ Status GraphConstructor::Convert() {
   // inputs, pending_counts_ with the number of inputs for each node and
   // outputs_ with the outputs of each node).
   while (!ready_.empty()) {
-    int o = ready_.back();
-    ready_.pop_back();
+    int o = *ready_.begin();
+    ready_.erase(ready_.begin());
     ++processed;
     inputs.clear();
     bool has_data_back_edge = false;
diff --git a/tensorflow/core/graph/graph_partition_test.cc b/tensorflow/core/graph/graph_partition_test.cc
index 83b24cafe2..f44ed47a6e 100644
--- a/tensorflow/core/graph/graph_partition_test.cc
+++ b/tensorflow/core/graph/graph_partition_test.cc
@@ -329,11 +329,11 @@ TEST_F(GraphPartitionTest, CrossDeviceControl_MultiUse) {
   string b = "/job:a/replica:0/task:0/cpu:1";
   a1 = FloatInput(scope_a_.WithOpName("A1"));
   auto c = Const(scope_a_.WithOpName("A1/_0").WithControlDependencies(a1), {});
-  _Send(scope_a_.WithOpName("A1/_1"), c, "edge_1_A1", a, 82, b);
+  _Send(scope_a_.WithOpName("A1/_1"), c, "edge_3_A1", a, 82, b);
   ExpectMatchA();
 
   auto recv =
-      _Recv(scope_b_.WithOpName("A1/_2"), DT_FLOAT, "edge_1_A1", a, 82, b);
+      _Recv(scope_b_.WithOpName("A1/_2"), DT_FLOAT, "edge_3_A1", a, 82, b);
   auto id = Identity(scope_b_.WithOpName("A1/_3"), recv);
   b1 = FloatInput(scope_b_.WithOpName("B1"));
   Combine(scope_b_.WithOpName("B2").WithControlDependencies(id), b1, b1);
@@ -353,18 +353,18 @@ TEST_F(GraphPartitionTest, CrossDevice_DataControl) {
   string a = "/job:a/replica:0/task:0/cpu:0";
   string b = "/job:a/replica:0/task:0/cpu:1";
   a1 = FloatInput(scope_a_.WithOpName("A1"));
-  auto c = Const(scope_a_.WithOpName("A1/_0").WithControlDependencies(a1), {});
+  _Send(scope_a_.WithOpName("A1/_0"), a1, "edge_1_A1", a, 82, b);
+  auto c = Const(scope_a_.WithOpName("A1/_2").WithControlDependencies(a1), {});
   // NOTE: Send 0 A1/_1 -> A1/_2 is not necessarily needed. We could
   // use A1/_0 -> A1/_4 as the control as a minor optimization.
-  _Send(scope_a_.WithOpName("A1/_1"), c, "edge_1_A1", a, 82, b);
-  _Send(scope_a_.WithOpName("A1/_4"), a1, "edge_2_A1", a, 82, b);
+  _Send(scope_a_.WithOpName("A1/_3"), c, "edge_3_A1", a, 82, b);
   ExpectMatchA();
 
   auto recv1 =
-      _Recv(scope_b_.WithOpName("A1/_2"), DT_FLOAT, "edge_1_A1", a, 82, b);
-  auto id1 = Identity(scope_b_.WithOpName("A1/_3"), recv1);
+      _Recv(scope_b_.WithOpName("A1/_4"), DT_FLOAT, "edge_3_A1", a, 82, b);
+  auto id1 = Identity(scope_b_.WithOpName("A1/_5"), recv1);
   auto recv2 =
-      _Recv(scope_b_.WithOpName("A1/_5"), DT_FLOAT, "edge_2_A1", a, 82, b);
+      _Recv(scope_b_.WithOpName("A1/_1"), DT_FLOAT, "edge_1_A1", a, 82, b);
   b1 = FloatInput(scope_b_.WithOpName("B1"));
   Combine(scope_b_.WithOpName("B2"), recv2, b1);
   FloatInput(scope_b_.WithOpName("B3").WithControlDependencies(id1));
diff --git a/tensorflow/core/graph/optimizer_cse_test.cc b/tensorflow/core/graph/optimizer_cse_test.cc
index 21a63662cf..c1f93ce05a 100644
--- a/tensorflow/core/graph/optimizer_cse_test.cc
+++ b/tensorflow/core/graph/optimizer_cse_test.cc
@@ -115,8 +115,8 @@ TEST_F(OptimizerCSETest, Simple) {
       "node { name: 'D' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
       " input: ['A', 'B'] }");
   EXPECT_EQ(DoCSE(),
-            "A(Input);B(Input);D(Mul)|"
-            "A->D;B->D:1");
+            "A(Input);B(Input);C(Mul)|"
+            "A->C;B->C:1");
 }
 
 TEST_F(OptimizerCSETest, Simple_ThreeEquivalent) {
@@ -130,8 +130,8 @@ TEST_F(OptimizerCSETest, Simple_ThreeEquivalent) {
       "node { name: 'E' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
       " input: ['A', 'B'] }");
   EXPECT_EQ(DoCSE(),
-            "A(Input);B(Input);E(Mul)|"
-            "A->E;B->E:1");
+            "A(Input);B(Input);C(Mul)|"
+            "A->C;B->C:1");
 }
 
 TEST_F(OptimizerCSETest, Simple_WithFixups) {
@@ -145,8 +145,8 @@ TEST_F(OptimizerCSETest, Simple_WithFixups) {
       "node { name: 'E' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
       " input: ['C', 'D'] }");
   EXPECT_EQ(DoCSE(),
-            "A(Input);B(Input);D(Mul);E(Mul)|"
-            "A->D;B->D:1;D->E;D->E:1");
+            "A(Input);B(Input);C(Mul);E(Mul)|"
+            "A->C;B->C:1;C->E;C->E:1");
 }
 
 TEST_F(OptimizerCSETest, Simple_Commutative) {
@@ -158,8 +158,8 @@ TEST_F(OptimizerCSETest, Simple_Commutative) {
       "node { name: 'D' op: 'Mul' attr { key: 'T' value { type: DT_FLOAT } }"
       " input: ['B', 'A'] }");
   EXPECT_EQ(DoCSE(),
-            "A(Input);B(Input);D(Mul)|"
-            "A->D:1;B->D");
+            "A(Input);B(Input);C(Mul)|"
+            "A->C;B->C:1");
 }
 
 static bool IsNotMultiply(const Node* n) { return n->type_string() != "Mul"; }
@@ -210,8 +210,8 @@ TEST_F(OptimizerCSETest, Simple_SameOps_SameAttrs1) {
       " input: ['A', 'B'] attr { key: 'shape'"
       "    value { shape: { dim: { size: 37 name: 'SAME_NAME' } } } } }");
   EXPECT_EQ(DoCSE(),
-            "A(Input);B(Input);D(Mul)|"
-            "A->D;B->D:1");
+            "A(Input);B(Input);C(Mul)|"
+            "A->C;B->C:1");
 }
 
 TEST_F(OptimizerCSETest, Simple_SameOps_SameAttrs2) {
@@ -229,8 +229,8 @@ TEST_F(OptimizerCSETest, Simple_SameOps_SameAttrs2) {
       "    attr { key: 't' value { type: DT_INT32 } }"
       "    attr { key: 'a' value { i: 3 } } }");
   EXPECT_EQ(DoCSE(),
-            "A(Input);B(Input);D(Mul)|"
-            "A->D;B->D:1");
+            "A(Input);B(Input);C(Mul)|"
+            "A->C;B->C:1");
 }
 
 TEST_F(OptimizerCSETest, SameConstants) {
@@ -249,8 +249,8 @@ TEST_F(OptimizerCSETest, SameConstants) {
       "node { name: 'D' op: 'Mul' attr { key: 'T' value { type: DT_INT32 } }"
       " input: ['A', 'B'] }");
   EXPECT_EQ(DoCSE(),
-            "B(Const);D(Mul)|"
-            "B->D;B->D:1");
+            "A(Const);D(Mul)|"
+            "A->D;A->D:1");
 }
 
 TEST_F(OptimizerCSETest, DifferentConstants) {
@@ -338,8 +338,8 @@ TEST_F(OptimizerCSETest, Constant_Dedup) {
             "n/_0(Const);n/_1(Const);n/_2(Const);n/_3(Const);"
             "n/_4(Const);n/_5(Const);n/_6(Const);n/_7(Const)|");
   // In theory, there are 2^4 possible correct output of CSE.  In this
-  // test, it happens to eliminate the first 4 nodes.
-  EXPECT_EQ(DoCSE(), "n/_4(Const);n/_5(Const);n/_6(Const);n/_7(Const)|");
+  // test, it happens to eliminate the last 4 nodes.
+  EXPECT_EQ(DoCSE(), "n/_0(Const);n/_1(Const);n/_2(Const);n/_3(Const)|");
 }
 
 static void BM_CSE(int iters, int op_nodes) {
-- 
GitLab


From a452ef960840accab8d0d0afa72bd77ebdb0c83c Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 31 May 2018 08:33:36 -0700
Subject: [PATCH 098/610] Standardize shifts in multiplication util functions.

PiperOrigin-RevId: 198725578
---
 .../contrib/lite/kernels/internal/common.h    |   6 +-
 .../internal/optimized/optimized_ops.h        |  68 ++++++----
 .../internal/reference/reference_ops.h        | 120 +++++++++---------
 3 files changed, 108 insertions(+), 86 deletions(-)

diff --git a/tensorflow/contrib/lite/kernels/internal/common.h b/tensorflow/contrib/lite/kernels/internal/common.h
index ede95dfee0..b86ca49c11 100644
--- a/tensorflow/contrib/lite/kernels/internal/common.h
+++ b/tensorflow/contrib/lite/kernels/internal/common.h
@@ -87,12 +87,12 @@ float ActivationFunction(float x) {
                                       output_activation_max);
 }
 
-inline int32 MultiplyByQuantizedMultiplierSmallerThanOne(
-    int32 x, int32 quantized_multiplier, int right_shift) {
+inline int32 MultiplyByQuantizedMultiplierSmallerThanOneExp(
+    int32 x, int32 quantized_multiplier, int left_shift) {
   using gemmlowp::RoundingDivideByPOT;
   using gemmlowp::SaturatingRoundingDoublingHighMul;
   return RoundingDivideByPOT(
-      SaturatingRoundingDoublingHighMul(x, quantized_multiplier), right_shift);
+      SaturatingRoundingDoublingHighMul(x, quantized_multiplier), -left_shift);
 }
 
 inline int32 MultiplyByQuantizedMultiplierGreaterThanOne(
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
index d48178d608..f7011b28fd 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
@@ -51,6 +51,13 @@ using reference_ops::LessEqual;
 using reference_ops::RankOneSelect;
 using reference_ops::Select;
 
+// TODO(b/80247582) Remove this constant.
+// This will be phased out as the shifts are revised with more thought. Use of a
+// constant enables us to track progress on this work.
+//
+// Used mainly to convert from old-style shifts (right) to new-style (left).
+static constexpr int kReverseShift = -1;
+
 // Make a local VectorMap typedef allowing to map a float array
 // as a Eigen vector expression. The std::conditional here is to
 // construct the suitable Eigen type for the constness of the
@@ -2417,8 +2424,8 @@ inline void L2Normalization(const uint8* input_data, const Dims<4>& input_dims,
 
     for (int c = 0; c < depth; c++) {
       int32 diff = *input_data - input_zero_point;
-      int32 rescaled_diff = MultiplyByQuantizedMultiplierSmallerThanOne(
-          128 * diff, inv_l2norm_multiplier, inv_l2norm_shift);
+      int32 rescaled_diff = MultiplyByQuantizedMultiplierSmallerThanOneExp(
+          128 * diff, inv_l2norm_multiplier, kReverseShift * inv_l2norm_shift);
       int32 unclamped_output_val = 128 + rescaled_diff;
       int32 output_val = std::min(255, std::max(0, unclamped_output_val));
       *output_data = static_cast<uint8>(output_val);
@@ -2560,14 +2567,19 @@ inline void AddElementwise(int size, int left_shift, const uint8* input1_data,
     const int32 input2_val = input2_offset + input2_data[i];
     const int32 shifted_input1_val = input1_val * (1 << left_shift);
     const int32 shifted_input2_val = input2_val * (1 << left_shift);
-    const int32 scaled_input1_val = MultiplyByQuantizedMultiplierSmallerThanOne(
-        shifted_input1_val, input1_multiplier, input1_shift);
-    const int32 scaled_input2_val = MultiplyByQuantizedMultiplierSmallerThanOne(
-        shifted_input2_val, input2_multiplier, input2_shift);
+    const int32 scaled_input1_val =
+        MultiplyByQuantizedMultiplierSmallerThanOneExp(
+            shifted_input1_val, input1_multiplier,
+            kReverseShift * input1_shift);
+    const int32 scaled_input2_val =
+        MultiplyByQuantizedMultiplierSmallerThanOneExp(
+            shifted_input2_val, input2_multiplier,
+            kReverseShift * input2_shift);
     const int32 raw_sum = scaled_input1_val + scaled_input2_val;
-    const int32 raw_output = MultiplyByQuantizedMultiplierSmallerThanOne(
-                                 raw_sum, output_multiplier, output_shift) +
-                             output_offset;
+    const int32 raw_output =
+        MultiplyByQuantizedMultiplierSmallerThanOneExp(
+            raw_sum, output_multiplier, kReverseShift * output_shift) +
+        output_offset;
     const int32 clamped_output = std::min(
         output_activation_max, std::max(output_activation_min, raw_output));
     output_data[i] = static_cast<uint8>(clamped_output);
@@ -2786,15 +2798,17 @@ inline void BroadcastAdd(int left_shift, const uint8* input1_data,
           const int32 shifted_input1_val = input1_val * (1 << left_shift);
           const int32 shifted_input2_val = input2_val * (1 << left_shift);
           const int32 scaled_input1_val =
-              MultiplyByQuantizedMultiplierSmallerThanOne(
-                  shifted_input1_val, input1_multiplier, input1_shift);
+              MultiplyByQuantizedMultiplierSmallerThanOneExp(
+                  shifted_input1_val, input1_multiplier,
+                  kReverseShift * input1_shift);
           const int32 scaled_input2_val =
-              MultiplyByQuantizedMultiplierSmallerThanOne(
-                  shifted_input2_val, input2_multiplier, input2_shift);
+              MultiplyByQuantizedMultiplierSmallerThanOneExp(
+                  shifted_input2_val, input2_multiplier,
+                  kReverseShift * input2_shift);
           const int32 raw_sum = scaled_input1_val + scaled_input2_val;
           const int32 raw_output =
-              MultiplyByQuantizedMultiplierSmallerThanOne(
-                  raw_sum, output_multiplier, output_shift) +
+              MultiplyByQuantizedMultiplierSmallerThanOneExp(
+                  raw_sum, output_multiplier, kReverseShift * output_shift) +
               output_offset;
           const int32 clamped_output =
               std::min(output_activation_max,
@@ -3135,9 +3149,9 @@ inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims,
           const int32 input2_val =
               input2_offset + input2_data[SubscriptToIndex(desc2, c, x, y, b)];
           const int32 unclamped_result =
-              output_offset +
-              MultiplyByQuantizedMultiplierSmallerThanOne(
-                  input1_val * input2_val, output_multiplier, output_shift);
+              output_offset + MultiplyByQuantizedMultiplierSmallerThanOneExp(
+                                  input1_val * input2_val, output_multiplier,
+                                  kReverseShift * output_shift);
           const int32 clamped_output =
               std::min(output_activation_max,
                        std::max(output_activation_min, unclamped_result));
@@ -3319,15 +3333,17 @@ inline void BroadcastSub(int left_shift, const uint8* input1_data,
           const int32 shifted_input1_val = input1_val * (1 << left_shift);
           const int32 shifted_input2_val = input2_val * (1 << left_shift);
           const int32 scaled_input1_val =
-              MultiplyByQuantizedMultiplierSmallerThanOne(
-                  shifted_input1_val, input1_multiplier, input1_shift);
+              MultiplyByQuantizedMultiplierSmallerThanOneExp(
+                  shifted_input1_val, input1_multiplier,
+                  kReverseShift * input1_shift);
           const int32 scaled_input2_val =
-              MultiplyByQuantizedMultiplierSmallerThanOne(
-                  shifted_input2_val, input2_multiplier, input2_shift);
+              MultiplyByQuantizedMultiplierSmallerThanOneExp(
+                  shifted_input2_val, input2_multiplier,
+                  kReverseShift * input2_shift);
           const int32 raw_sub = scaled_input1_val - scaled_input2_val;
           const int32 raw_output =
-              MultiplyByQuantizedMultiplierSmallerThanOne(
-                  raw_sub, output_multiplier, output_shift) +
+              MultiplyByQuantizedMultiplierSmallerThanOneExp(
+                  raw_sub, output_multiplier, kReverseShift * output_shift) +
               output_offset;
           const int32 clamped_output =
               std::min(output_activation_max,
@@ -4782,9 +4798,9 @@ inline void LogSoftmax(const uint8* input_data, const Dims<4>& input_dims,
         fixed_log_sum_of_exps + std::numeric_limits<int32>::lowest();
     const int adjusted_diff_min =
         std::max(diff_min - 1,  // Note use of > below instead of >= above.
-                 MultiplyByQuantizedMultiplierSmallerThanOne(
+                 MultiplyByQuantizedMultiplierSmallerThanOneExp(
                      rescaled_diff_min, reverse_scaling_divisor,
-                     reverse_scaling_right_shift));
+                     kReverseShift * reverse_scaling_right_shift));
 
     for (int c = 0; c < depth; ++c) {
       int32 input_diff = static_cast<int32>(block_input_data[c]) - max_in_row;
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
index c43c5f938e..ef055929a9 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
@@ -98,20 +98,12 @@ gemmlowp::FixedPoint<tRawType, tIntegerBits> SaturatingSub(
 
 namespace reference_ops {
 
-inline int32 MultiplyByQuantizedMultiplierSmallerThanOne(
-    int32 x, int32 quantized_multiplier, int right_shift) {
-  using gemmlowp::RoundingDivideByPOT;
-  using gemmlowp::SaturatingRoundingDoublingHighMul;
-  return RoundingDivideByPOT(
-      SaturatingRoundingDoublingHighMul(x, quantized_multiplier), right_shift);
-}
-
-inline int32 MultiplyByQuantizedMultiplierGreaterThanOne(
-    int32 x, int32 quantized_multiplier, int left_shift) {
-  using gemmlowp::SaturatingRoundingDoublingHighMul;
-  return SaturatingRoundingDoublingHighMul(x * (1 << left_shift),
-                                           quantized_multiplier);
-}
+// TODO(b/80247582) Remove this constant.
+// This will be phased out as the shifts are revised with more thought. Use of a
+// constant enables us to track progress on this work.
+//
+// Used mainly to convert from old-style shifts (right) to new-style (left).
+static constexpr int kReverseShift = -1;
 
 template <typename T>
 int CountLeadingZeros(T integer_input) {
@@ -422,8 +414,8 @@ inline void Conv(const uint8* input_data, const Dims<4>& input_dims,
           if (bias_data) {
             acc += bias_data[Offset(bias_dims, out_channel, 0, 0, 0)];
           }
-          acc = MultiplyByQuantizedMultiplierSmallerThanOne(
-              acc, output_multiplier, output_shift);
+          acc = MultiplyByQuantizedMultiplierSmallerThanOneExp(
+              acc, output_multiplier, kReverseShift * output_shift);
           acc += output_offset;
           acc = std::max(acc, output_activation_min);
           acc = std::min(acc, output_activation_max);
@@ -646,8 +638,8 @@ inline void FullyConnected(const uint8* input_data, const Dims<4>& input_dims,
       if (bias_data) {
         acc += bias_data[Offset(bias_dims, out_c, 0, 0, 0)];
       }
-      acc = MultiplyByQuantizedMultiplierSmallerThanOne(acc, output_multiplier,
-                                                        output_shift);
+      acc = MultiplyByQuantizedMultiplierSmallerThanOneExp(
+          acc, output_multiplier, kReverseShift * output_shift);
       acc += output_offset;
       acc = std::max(acc, output_activation_min);
       acc = std::min(acc, output_activation_max);
@@ -1041,8 +1033,8 @@ inline void L2Normalization(const uint8* input_data, const Dims<4>& input_dims,
     for (int c = 0; c < depth; c++) {
       int32 diff =
           input_data[Offset(input_dims, c, i, 0, 0)] - input_zero_point;
-      int32 rescaled_diff = MultiplyByQuantizedMultiplierSmallerThanOne(
-          128 * diff, inv_l2norm_multiplier, inv_l2norm_shift);
+      int32 rescaled_diff = MultiplyByQuantizedMultiplierSmallerThanOneExp(
+          128 * diff, inv_l2norm_multiplier, kReverseShift * inv_l2norm_shift);
       int32 unclamped_output_val = 128 + rescaled_diff;
       int32 output_val = std::min(255, std::max(0, unclamped_output_val));
       output_data[Offset(output_dims, c, i, 0, 0)] =
@@ -1113,15 +1105,17 @@ inline void Add(int left_shift, const uint8* input1_data,
           const int32 shifted_input1_val = input1_val * (1 << left_shift);
           const int32 shifted_input2_val = input2_val * (1 << left_shift);
           const int32 scaled_input1_val =
-              MultiplyByQuantizedMultiplierSmallerThanOne(
-                  shifted_input1_val, input1_multiplier, input1_shift);
+              MultiplyByQuantizedMultiplierSmallerThanOneExp(
+                  shifted_input1_val, input1_multiplier,
+                  kReverseShift * input1_shift);
           const int32 scaled_input2_val =
-              MultiplyByQuantizedMultiplierSmallerThanOne(
-                  shifted_input2_val, input2_multiplier, input2_shift);
+              MultiplyByQuantizedMultiplierSmallerThanOneExp(
+                  shifted_input2_val, input2_multiplier,
+                  kReverseShift * input2_shift);
           const int32 raw_sum = scaled_input1_val + scaled_input2_val;
           const int32 raw_output =
-              MultiplyByQuantizedMultiplierSmallerThanOne(
-                  raw_sum, output_multiplier, output_shift) +
+              MultiplyByQuantizedMultiplierSmallerThanOneExp(
+                  raw_sum, output_multiplier, kReverseShift * output_shift) +
               output_offset;
           const int32 clamped_output =
               std::min(output_activation_max,
@@ -1267,15 +1261,17 @@ inline void BroadcastAdd(int left_shift, const uint8* input1_data,
           const int32 shifted_input1_val = input1_val * (1 << left_shift);
           const int32 shifted_input2_val = input2_val * (1 << left_shift);
           const int32 scaled_input1_val =
-              MultiplyByQuantizedMultiplierSmallerThanOne(
-                  shifted_input1_val, input1_multiplier, input1_shift);
+              MultiplyByQuantizedMultiplierSmallerThanOneExp(
+                  shifted_input1_val, input1_multiplier,
+                  kReverseShift * input1_shift);
           const int32 scaled_input2_val =
-              MultiplyByQuantizedMultiplierSmallerThanOne(
-                  shifted_input2_val, input2_multiplier, input2_shift);
+              MultiplyByQuantizedMultiplierSmallerThanOneExp(
+                  shifted_input2_val, input2_multiplier,
+                  kReverseShift * input2_shift);
           const int32 raw_sum = scaled_input1_val + scaled_input2_val;
           const int32 raw_output =
-              MultiplyByQuantizedMultiplierSmallerThanOne(
-                  raw_sum, output_multiplier, output_shift) +
+              MultiplyByQuantizedMultiplierSmallerThanOneExp(
+                  raw_sum, output_multiplier, kReverseShift * output_shift) +
               output_offset;
           const int32 clamped_output =
               std::min(output_activation_max,
@@ -1320,15 +1316,17 @@ inline void BroadcastAddFivefold(
             const int32 shifted_input1_val = input1_val * (1 << left_shift);
             const int32 shifted_input2_val = input2_val * (1 << left_shift);
             const int32 scaled_input1_val =
-                MultiplyByQuantizedMultiplierSmallerThanOne(
-                    shifted_input1_val, input1_multiplier, input1_shift);
+                MultiplyByQuantizedMultiplierSmallerThanOneExp(
+                    shifted_input1_val, input1_multiplier,
+                    kReverseShift * input1_shift);
             const int32 scaled_input2_val =
-                MultiplyByQuantizedMultiplierSmallerThanOne(
-                    shifted_input2_val, input2_multiplier, input2_shift);
+                MultiplyByQuantizedMultiplierSmallerThanOneExp(
+                    shifted_input2_val, input2_multiplier,
+                    kReverseShift * input2_shift);
             const int32 raw_sum = scaled_input1_val + scaled_input2_val;
             const int32 raw_output =
-                MultiplyByQuantizedMultiplierSmallerThanOne(
-                    raw_sum, output_multiplier, output_shift) +
+                MultiplyByQuantizedMultiplierSmallerThanOneExp(
+                    raw_sum, output_multiplier, kReverseShift * output_shift) +
                 output_offset;
             const int32 clamped_output =
                 std::min(output_activation_max,
@@ -1508,9 +1506,9 @@ inline void BroadcastMul(const uint8* input1_data, const Dims<4>& input1_dims,
           const int32 input2_val =
               input2_offset + input2_data[SubscriptToIndex(desc2, c, x, y, b)];
           const int32 unclamped_result =
-              output_offset +
-              MultiplyByQuantizedMultiplierSmallerThanOne(
-                  input1_val * input2_val, output_multiplier, output_shift);
+              output_offset + MultiplyByQuantizedMultiplierSmallerThanOneExp(
+                                  input1_val * input2_val, output_multiplier,
+                                  kReverseShift * output_shift);
           const int32 clamped_output =
               std::min(output_activation_max,
                        std::max(output_activation_min, unclamped_result));
@@ -1724,15 +1722,17 @@ inline void BroadcastSub(int left_shift, const uint8* input1_data,
           const int32 shifted_input1_val = input1_val * (1 << left_shift);
           const int32 shifted_input2_val = input2_val * (1 << left_shift);
           const int32 scaled_input1_val =
-              MultiplyByQuantizedMultiplierSmallerThanOne(
-                  shifted_input1_val, input1_multiplier, input1_shift);
+              MultiplyByQuantizedMultiplierSmallerThanOneExp(
+                  shifted_input1_val, input1_multiplier,
+                  kReverseShift * input1_shift);
           const int32 scaled_input2_val =
-              MultiplyByQuantizedMultiplierSmallerThanOne(
-                  shifted_input2_val, input2_multiplier, input2_shift);
+              MultiplyByQuantizedMultiplierSmallerThanOneExp(
+                  shifted_input2_val, input2_multiplier,
+                  kReverseShift * input2_shift);
           const int32 raw_sub = scaled_input1_val - scaled_input2_val;
           const int32 raw_output =
-              MultiplyByQuantizedMultiplierSmallerThanOne(
-                  raw_sub, output_multiplier, output_shift) +
+              MultiplyByQuantizedMultiplierSmallerThanOneExp(
+                  raw_sub, output_multiplier, kReverseShift * output_shift) +
               output_offset;
           const int32 clamped_output =
               std::min(output_activation_max,
@@ -2944,9 +2944,9 @@ inline void LogSoftmax(const uint8* input_data, const Dims<4>& input_dims,
         fixed_log_sum_of_exps + std::numeric_limits<int32>::lowest();
     const int adjusted_diff_min =
         std::max(diff_min - 1,  // Note use of > below instead of >= above.
-                 MultiplyByQuantizedMultiplierSmallerThanOne(
+                 MultiplyByQuantizedMultiplierSmallerThanOneExp(
                      rescaled_diff_min, reverse_scaling_divisor,
-                     reverse_scaling_right_shift));
+                     kReverseShift * reverse_scaling_right_shift));
 
     for (int c = 0; c < depth; ++c) {
       int32 input_diff =
@@ -3850,10 +3850,14 @@ inline void Comparison(int left_shift, const T* input1_data,
     const int32 input2_val = input2_offset + input2_data[i];
     const int32 shifted_input1_val = input1_val * (1 << left_shift);
     const int32 shifted_input2_val = input2_val * (1 << left_shift);
-    const int32 scaled_input1_val = MultiplyByQuantizedMultiplierSmallerThanOne(
-        shifted_input1_val, input1_multiplier, input1_shift);
-    const int32 scaled_input2_val = MultiplyByQuantizedMultiplierSmallerThanOne(
-        shifted_input2_val, input2_multiplier, input2_shift);
+    const int32 scaled_input1_val =
+        MultiplyByQuantizedMultiplierSmallerThanOneExp(
+            shifted_input1_val, input1_multiplier,
+            kReverseShift * input1_shift);
+    const int32 scaled_input2_val =
+        MultiplyByQuantizedMultiplierSmallerThanOneExp(
+            shifted_input2_val, input2_multiplier,
+            kReverseShift * input2_shift);
     output_data[i] = F(scaled_input1_val, scaled_input2_val);
   }
 }
@@ -3902,11 +3906,13 @@ inline void BroadcastComparison(int left_shift, const T* input1_data,
           const int32 shifted_input1_val = input1_val * (1 << left_shift);
           const int32 shifted_input2_val = input2_val * (1 << left_shift);
           const int32 scaled_input1_val =
-              MultiplyByQuantizedMultiplierSmallerThanOne(
-                  shifted_input1_val, input1_multiplier, input1_shift);
+              MultiplyByQuantizedMultiplierSmallerThanOneExp(
+                  shifted_input1_val, input1_multiplier,
+                  kReverseShift * input1_shift);
           const int32 scaled_input2_val =
-              MultiplyByQuantizedMultiplierSmallerThanOne(
-                  shifted_input2_val, input2_multiplier, input2_shift);
+              MultiplyByQuantizedMultiplierSmallerThanOneExp(
+                  shifted_input2_val, input2_multiplier,
+                  kReverseShift * input2_shift);
           output_data[Offset(output_dims, c, x, y, b)] =
               F(scaled_input1_val, scaled_input2_val);
         }
-- 
GitLab


From f6a8cf82134a305f6d27368b2f51819b11195ada Mon Sep 17 00:00:00 2001
From: Dan Moldovan <mdan@google.com>
Date: Thu, 31 May 2018 08:53:36 -0700
Subject: [PATCH 099/610] Cleanup: update continue_statements.py to use the
 base transformer facilities for tracking local state and reindenting node
 blocks. Rearrange the error handling in base transformer to avoid chained
 exceptions.

PiperOrigin-RevId: 198727946
---
 .../autograph/converters/break_statements.py  |  16 +-
 .../converters/continue_statements.py         | 174 ++++++++++--------
 .../contrib/autograph/pyct/transformer.py     | 148 ++++++++++++---
 .../autograph/pyct/transformer_test.py        |  42 ++++-
 4 files changed, 261 insertions(+), 119 deletions(-)

diff --git a/tensorflow/contrib/autograph/converters/break_statements.py b/tensorflow/contrib/autograph/converters/break_statements.py
index 5b7508c9a5..775d92c1d9 100644
--- a/tensorflow/contrib/autograph/converters/break_statements.py
+++ b/tensorflow/contrib/autograph/converters/break_statements.py
@@ -32,14 +32,6 @@ CONTROL_VAR_NAME = 'control_var_name'
 class BreakStatementTransformer(transformer.Base):
   """Canonicalizes break statements into additional conditionals."""
 
-  def _track_body(self, nodes, break_var):
-    self.enter_local_scope()
-    self.set_local(CONTROL_VAR_NAME, break_var)
-    nodes = self.visit_block(nodes)
-    break_used = self.get_local(BREAK_USED, False)
-    self.exit_local_scope()
-    return nodes, break_used
-
   def visit_Break(self, node):
     self.set_local(BREAK_USED, True)
     var_name = self.get_local(CONTROL_VAR_NAME)
@@ -65,6 +57,14 @@ class BreakStatementTransformer(transformer.Base):
         block=block)
     return node
 
+  def _track_body(self, nodes, break_var):
+    self.enter_local_scope()
+    self.set_local(CONTROL_VAR_NAME, break_var)
+    nodes = self.visit_block(nodes)
+    break_used = self.get_local(BREAK_USED, False)
+    self.exit_local_scope()
+    return nodes, break_used
+
   def visit_While(self, node):
     scope = anno.getanno(node, NodeAnno.BODY_SCOPE)
     break_var = self.context.namer.new_symbol('break_', scope.referenced)
diff --git a/tensorflow/contrib/autograph/converters/continue_statements.py b/tensorflow/contrib/autograph/converters/continue_statements.py
index 4299a8a9d5..0417817a77 100644
--- a/tensorflow/contrib/autograph/converters/continue_statements.py
+++ b/tensorflow/contrib/autograph/converters/continue_statements.py
@@ -24,103 +24,115 @@ from tensorflow.contrib.autograph.pyct import transformer
 from tensorflow.contrib.autograph.pyct.static_analysis.annos import NodeAnno
 
 
-class ContinueCanonicalizationTransformer(transformer.Base):
-  """Canonicalizes continue statements into additional conditionals."""
+# Tags for local state.
+CONTROL_VAR_NAME = 'control_var_name'
+CONTINUE_USED = 'continue_used'
+GUARD_CREATED = 'guard_created'
+CREATE_GUARD_NEXT = 'create_guard_next'
 
-  def __init__(self, context):
-    super(ContinueCanonicalizationTransformer, self).__init__(context)
-    # This is a stack structure, to correctly process nested loops.
-    self.continuation_uses = []
 
-  def _create_continuation_check(self):
-    template = """
-      if not var_name:
-        pass
-    """
-    cond, = templates.replace(template, var_name=self.continuation_uses[-1][1])
-    cond.body = []
-    return cond
+class ContinueCanonicalizationTransformer(transformer.Base):
+  """Canonicalizes continue statements into additional conditionals."""
 
-  def _create_continuation_trigger(self):
+  def visit_Continue(self, node):
+    self.set_local(CONTINUE_USED, True)
     template = """
       var_name = True
     """
-    assign, = templates.replace(
-        template, var_name=self.continuation_uses[-1][1])
-    return assign
-
-  def _create_continuation_init(self):
-    template = """
-      var_name = False
-    """
-    assign, = templates.replace(
-        template, var_name=self.continuation_uses[-1][1])
-    return assign
-
-  def _visit_and_reindent_if_necessary(self, nodes):
-    reorganized_nodes = []
-    current_dest = reorganized_nodes
-    continue_used_in_block = False
-    for i, n in enumerate(nodes):
-      # TODO(mdan): This could be optimized if control structures are simple.
-      self.continuation_uses[-1][0] = False
-      n = self.visit(n)
-      current_dest.append(n)
-      if self.continuation_uses[-1][0]:
-        continue_used_in_block = True
-        if i < len(nodes) - 1:  # Last statement in block needs no protection.
-          cond = self._create_continuation_check()
-          current_dest.append(cond)
-          current_dest = cond.body
-    self.continuation_uses[-1][0] = continue_used_in_block
-    return reorganized_nodes
-
-  def _process_loop_block(self, block, scope):
-    cont_var = self.context.namer.new_symbol('cont_requested', scope.referenced)
-    self.continuation_uses.append([False, cont_var])
-    block = self._visit_and_reindent_if_necessary(block)
-    if self.continuation_uses[-1][0]:
-      block.insert(0, self._create_continuation_init())
-    self.continuation_uses.pop()
-    return block
+    return templates.replace(
+        template, var_name=self.get_local(CONTROL_VAR_NAME))
+
+  def _postprocess_statement(self, node):
+    # Example of how the state machine below works:
+    #
+    #   1| stmt           # State: CONTINUE_USED = False
+    #    |                # Action: none
+    #   2| if cond:
+    #   3|   continue     # State: CONTINUE_USED = True,
+    #    |                #        GUARD_CREATED = False,
+    #    |                #        CREATE_GUARD_NEXT = False
+    #    |                # Action: set CREATE_GUARD_NEXT = True
+    #   4| stmt           # State: CONTINUE_USED = True,
+    #    |                #        GUARD_CREATED = False,
+    #    |                #        CREATE_GUARD_NEXT = True
+    #    |                # Action: create `if not continue_used`,
+    #    |                #         set GUARD_CREATED = True
+    #   5| stmt           # State: CONTINUE_USED = True, GUARD_CREATED = True
+    #    |                # Action: none (will be wrapped under previously
+    #    |                #         created if node)
+
+    if self.get_local(CONTINUE_USED, False):
+      if self.get_local(GUARD_CREATED, False):
+        return node, None
+
+      elif not self.get_local(CREATE_GUARD_NEXT, False):
+        self.set_local(CREATE_GUARD_NEXT, True)
+        return node, None
+
+      else:
+        self.set_local(GUARD_CREATED, True)
+        template = """
+          if not var_name:
+            original_node
+        """
+        cond, = templates.replace(
+            template,
+            var_name=self.get_local(CONTROL_VAR_NAME),
+            original_node=node)
+        return cond, cond.body
+    return node, None
+
+  def _visit_loop_body(self, node, nodes):
+    self.enter_local_scope()
+    scope = anno.getanno(node, NodeAnno.BODY_SCOPE)
+    continue_var = self.context.namer.new_symbol('continue_', scope.referenced)
+    self.set_local(CONTROL_VAR_NAME, continue_var)
+
+    nodes = self.visit_block(nodes, after_visit=self._postprocess_statement)
+
+    if self.get_local(CONTINUE_USED, False):
+      template = """
+        var_name = False
+      """
+      control_var_init = templates.replace(template, var_name=continue_var)
+      nodes = control_var_init + nodes
+
+    self.exit_local_scope()
+    return nodes
+
+  def _visit_non_loop_body(self, nodes):
+    self.enter_local_scope(inherit=(CONTROL_VAR_NAME,))
+    nodes = self.visit_block(nodes, after_visit=self._postprocess_statement)
+    continue_used = self.get_local(CONTINUE_USED, False)
+    self.exit_local_scope(keep=(CONTINUE_USED,))
+    return nodes, continue_used
 
   def visit_While(self, node):
-    self.generic_visit(node.test)
-    node.body = self._process_loop_block(node.body,
-                                         anno.getanno(node,
-                                                      NodeAnno.BODY_SCOPE))
-    for n in node.orelse:
-      self.generic_visit(n)
+    node.test = self.visit(node.test)
+    node.body = self._visit_loop_body(node, node.body)
+    # A continue in the else clause applies to the containing scope.
+    node.orelse, _ = self._visit_non_loop_body(node.orelse)
     return node
 
   def visit_For(self, node):
-    self.generic_visit(node.target)
-    self.generic_visit(node.iter)
-    node.body = self._process_loop_block(node.body,
-                                         anno.getanno(node,
-                                                      NodeAnno.BODY_SCOPE))
-    for n in node.orelse:
-      self.generic_visit(n)
+    node.target = self.generic_visit(node.target)
+    node.iter = self.generic_visit(node.iter)
+    node.body = self._visit_loop_body(node, node.body)
+    # A continue in the else clause applies to the containing scope.
+    node.orelse, _ = self._visit_non_loop_body(node.orelse)
     return node
 
   def visit_If(self, node):
-    if self.continuation_uses:
-      self.generic_visit(node.test)
-      node.body = self._visit_and_reindent_if_necessary(node.body)
-      continue_used_in_body = self.continuation_uses[-1][0]
-      node.orelse = self._visit_and_reindent_if_necessary(node.orelse)
-      self.continuation_uses[-1][0] = (
-          continue_used_in_body or self.continuation_uses[-1][0])
-    else:
-      node = self.generic_visit(node)
+    node.test = self.generic_visit(node.test)
+    node.body, continue_used_body = self._visit_non_loop_body(node.body)
+    node.orelse, continue_used_orelse = self._visit_non_loop_body(node.orelse)
+    self.set_local(CONTINUE_USED, continue_used_body or continue_used_orelse)
     return node
 
-  def visit_Continue(self, node):
-    self.continuation_uses[-1][0] = True
-    return self._create_continuation_trigger()
-
-  def visit_Break(self, node):
-    assert False, 'break statement should be desugared at this point'
+  def visit_With(self, node):
+    node.items = self.visit_block(node.items)
+    node.body, _ = self._visit_non_loop_body(node.body)
+    return node
 
 
 def transform(node, namer):
diff --git a/tensorflow/contrib/autograph/pyct/transformer.py b/tensorflow/contrib/autograph/pyct/transformer.py
index 4c65edb6de..60bca8b38d 100644
--- a/tensorflow/contrib/autograph/pyct/transformer.py
+++ b/tensorflow/contrib/autograph/pyct/transformer.py
@@ -70,14 +70,40 @@ class Base(gast.NodeTransformer):
     return tuple(self._enclosing_entities)
 
   @property
-  def locel_scope_level(self):
+  def local_scope_level(self):
     return len(self._local_scope_state)
 
-  def enter_local_scope(self):
-    self._local_scope_state.append({})
+  def enter_local_scope(self, inherit=None):
+    """Marks entry into a new local scope.
 
-  def exit_local_scope(self):
-    return self._local_scope_state.pop()
+    Args:
+      inherit: Optional enumerable of variable names to copy from the
+          parent scope.
+    """
+    scope_entered = {}
+    if inherit:
+      this_scope = self._local_scope_state[-1]
+      for name in inherit:
+        if name in this_scope:
+          scope_entered[name] = this_scope[name]
+    self._local_scope_state.append(scope_entered)
+
+  def exit_local_scope(self, keep=None):
+    """Marks exit from the current local scope.
+
+    Args:
+      keep: Optional enumerable of variable names to copy into the
+          parent scope.
+    Returns:
+      A dict containing the scope that has just been exited.
+    """
+    scope_left = self._local_scope_state.pop()
+    if keep:
+      this_scope = self._local_scope_state[-1]
+      for name in keep:
+        if name in scope_left:
+          this_scope[name] = scope_left[name]
+    return scope_left
 
   def set_local(self, name, value):
     self._local_scope_state[-1][name] = value
@@ -91,16 +117,76 @@ class Base(gast.NodeTransformer):
       print(pretty_printer.fmt(node))
     return node
 
-  def visit_block(self, nodes):
-    """Helper equivalent to generic_visit, but for node lists."""
+  def visit_block(self, nodes, before_visit=None, after_visit=None):
+    """A more powerful version of generic_visit for statement blocks.
+
+    An example of a block is the body of an if statement.
+
+    This function allows specifying a postprocessing callback (the
+    after_visit argument) which can be used to move nodes to a new
+    destination. This is done by after_visit by returning a non-null
+    second return value, e.g. return new_node, new_destination.
+
+    For example, a transformer could perform the following move:
+
+        foo()
+        bar()
+        baz()
+
+        foo()
+        if cond:
+          bar()
+          baz()
+
+    The above could be done with a postprocessor of this kind:
+
+        def after_visit(node):
+          if node_is_function_call(bar):
+            new_container_node = build_cond()
+            new_container_node.body.append(node)
+            return new_container_node, new_container_node.body
+          else:
+            # Once we set a new destination, all subsequent items will be
+            # moved to it, so we don't need to explicitly handle baz.
+            return node, None
+
+    Args:
+      nodes: enumerable of AST node objects
+      before_visit: optional callable that is called before visiting each item
+          in nodes
+      after_visit: optional callable that takes in an AST node and
+          returns a tuple (new_node, new_destination). It is called after
+          visiting each item in nodes. Is used in the same way as the
+          visit_* methods: new_node will replace the node; if not None,
+          new_destination must be a list, and subsequent nodes will be placed
+          in this list instead of the list returned by visit_block.
+    Returns:
+      A list of AST node objects containing the transformed items from nodes,
+      except those nodes that have been relocated using after_visit.
+    """
     results = []
+    node_destination = results
     for node in nodes:
+      if before_visit:
+        # TODO(mdan): We can modify node here too, if ever needed.
+        before_visit()
+
       replacement = self.visit(node)
+
+      if after_visit and replacement:
+        replacement, new_destination = after_visit(replacement)
+      else:
+        new_destination = None
+
       if replacement:
         if isinstance(replacement, (list, tuple)):
-          results.extend(replacement)
+          node_destination.extend(replacement)
         else:
-          results.append(replacement)
+          node_destination.append(replacement)
+
+      # Allow the postprocessor to reroute the remaining nodes to a new list.
+      if new_destination is not None:
+        node_destination = new_destination
     return results
 
   # TODO(mdan): Once we have error tracing, we may be able to just go to SSA.
@@ -155,22 +241,39 @@ class Base(gast.NodeTransformer):
     source_code = self.context.source_code
     source_file = self.context.source_file
     did_enter_function = False
-    local_scope_state_size = len(self._local_scope_state)
+    local_scope_size_at_entry = len(self._local_scope_state)
 
     try:
       if isinstance(node, (gast.FunctionDef, gast.ClassDef, gast.Lambda)):
-        self._enclosing_entities.append(node)
         did_enter_function = True
 
+      if did_enter_function:
+        self._enclosing_entities.append(node)
+
       if source_code and hasattr(node, 'lineno'):
         self._lineno = node.lineno
         self._col_offset = node.col_offset
-      if anno.hasanno(node, anno.Basic.SKIP_PROCESSING):
-        return node
-      return super(Base, self).visit(node)
 
-    except (ValueError, AttributeError, KeyError, NotImplementedError,
-            AssertionError) as e:
+      if not anno.hasanno(node, anno.Basic.SKIP_PROCESSING):
+        result = super(Base, self).visit(node)
+
+      # On exception, the local scope integrity is not guaranteed.
+      if did_enter_function:
+        self._enclosing_entities.pop()
+
+      if local_scope_size_at_entry != len(self._local_scope_state):
+        raise AssertionError(
+            'Inconsistent local scope stack. Before entering node %s, the'
+            ' stack had length %d, after exit it has length %d. This'
+            ' indicates enter_local_scope and exit_local_scope are not'
+            ' well paired.' % (
+                node,
+                local_scope_size_at_entry,
+                len(self._local_scope_state)
+            ))
+      return result
+
+    except (ValueError, AttributeError, KeyError, NotImplementedError) as e:
       msg = '%s: %s\nOffending source:\n%s\n\nOccurred at node:\n%s' % (
           e.__class__.__name__, str(e), try_ast_to_source(node),
           pretty_printer.fmt(node, color=False))
@@ -178,18 +281,11 @@ class Base(gast.NodeTransformer):
         line = source_code.splitlines()[self._lineno - 1]
       else:
         line = '<no source available>'
+      # TODO(mdan): Avoid the printing of the original exception.
+      # In other words, we need to find how to suppress the "During handling
+      # of the above exception, another exception occurred" message.
       six.reraise(AutographParseError,
                   AutographParseError(
                       msg,
                       (source_file, self._lineno, self._col_offset + 1, line)),
                   sys.exc_info()[2])
-    finally:
-      if did_enter_function:
-        self._enclosing_entities.pop()
-
-      if local_scope_state_size != len(self._local_scope_state):
-        raise AssertionError(
-            'Inconsistent local scope stack. Before entering node %s, the'
-            ' stack had length %d, after exit it has length %d. This'
-            ' indicates enter_local_scope and exit_local_scope are not'
-            ' well paired.')
diff --git a/tensorflow/contrib/autograph/pyct/transformer_test.py b/tensorflow/contrib/autograph/pyct/transformer_test.py
index 1f1adf4fbd..f110e79605 100644
--- a/tensorflow/contrib/autograph/pyct/transformer_test.py
+++ b/tensorflow/contrib/autograph/pyct/transformer_test.py
@@ -18,6 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import gast
+
 from tensorflow.contrib.autograph.pyct import anno
 from tensorflow.contrib.autograph.pyct import context
 from tensorflow.contrib.autograph.pyct import parser
@@ -27,7 +29,7 @@ from tensorflow.python.platform import test
 
 class TransformerTest(test.TestCase):
 
-  def _context_for_nodetesting(self):
+  def _context_for_testing(self):
     return context.EntityContext(
         namer=None,
         source_code=None,
@@ -53,7 +55,7 @@ class TransformerTest(test.TestCase):
         anno.setanno(node, 'enclosing_entities', self.enclosing_entities)
         return self.generic_visit(node)
 
-    tr = TestTransformer(self._context_for_nodetesting())
+    tr = TestTransformer(self._context_for_testing())
 
     def test_function():
       a = 0
@@ -116,7 +118,7 @@ class TransformerTest(test.TestCase):
       def visit_For(self, node):
         return self._annotate_result(node)
 
-    tr = TestTransformer(self._context_for_nodetesting())
+    tr = TestTransformer(self._context_for_testing())
 
     def test_function(a):
       """Docstring."""
@@ -155,7 +157,7 @@ class TransformerTest(test.TestCase):
         self.exit_local_scope()
         return node
 
-    tr = TestTransformer(self._context_for_nodetesting())
+    tr = TestTransformer(self._context_for_testing())
 
     def no_exit(a):
       if a > 0:
@@ -174,6 +176,38 @@ class TransformerTest(test.TestCase):
     with self.assertRaises(AssertionError):
       tr.visit(node)
 
+  def test_visit_block_postprocessing(self):
+
+    class TestTransformer(transformer.Base):
+
+      def _process_body_item(self, node):
+        if isinstance(node, gast.Assign) and (node.value.id == 'y'):
+          if_node = gast.If(gast.Name('x', gast.Load(), None), [node], [])
+          return if_node, if_node.body
+        return node, None
+
+      def visit_FunctionDef(self, node):
+        node.body = self.visit_block(
+            node.body, after_visit=self._process_body_item)
+        return node
+
+    def test_function(x, y):
+      z = x
+      z = y
+      return z
+
+    tr = TestTransformer(self._context_for_testing())
+
+    node, _ = parser.parse_entity(test_function)
+    node = tr.visit(node)
+    node = node.body[0]
+
+    self.assertEqual(len(node.body), 2)
+    self.assertTrue(isinstance(node.body[0], gast.Assign))
+    self.assertTrue(isinstance(node.body[1], gast.If))
+    self.assertTrue(isinstance(node.body[1].body[0], gast.Assign))
+    self.assertTrue(isinstance(node.body[1].body[1], gast.Return))
+
 
 if __name__ == '__main__':
   test.main()
-- 
GitLab


From 398e19000b842c4aa61f05fdd68e307afdc7ff67 Mon Sep 17 00:00:00 2001
From: Skye Wanderman-Milne <skyewm@google.com>
Date: Thu, 31 May 2018 09:43:30 -0700
Subject: [PATCH 100/610] Another handle_data fix for graph-mode functions.

PiperOrigin-RevId: 198734229
---
 tensorflow/python/framework/function.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/framework/function.py b/tensorflow/python/framework/function.py
index 0675222016..259cab6699 100644
--- a/tensorflow/python/framework/function.py
+++ b/tensorflow/python/framework/function.py
@@ -718,8 +718,12 @@ class _FuncGraph(ops.Graph):
           tensor.dtype, shape=tensor.get_shape(), name=name)
     # pylint: disable=protected-access
     if ops._USE_C_SHAPES:
-      handle_data = c_api.GetResourceHandleShapeAndType(tensor.graph._c_graph,
-                                                        tensor._as_tf_output())
+      if isinstance(tensor, ops.EagerTensor):
+        handle_data = tensor._handle_data
+      else:
+        handle_data = c_api.GetResourceHandleShapeAndType(
+            tensor.graph._c_graph, tensor._as_tf_output())
+
       if handle_data:
         c_api.SetResourceHandleShapeAndType(ph.graph._c_graph,
                                             ph._as_tf_output(),
-- 
GitLab


From 50fde7b75af1aa813c52f521613199de745208a9 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 31 May 2018 10:15:59 -0700
Subject: [PATCH 101/610] Introduce runtime shape class.

PiperOrigin-RevId: 198739017
---
 .../contrib/lite/kernels/internal/types.h     | 100 +++++++++++++++++-
 1 file changed, 99 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/lite/kernels/internal/types.h b/tensorflow/contrib/lite/kernels/internal/types.h
index d5293edd56..98ca21d55a 100644
--- a/tensorflow/contrib/lite/kernels/internal/types.h
+++ b/tensorflow/contrib/lite/kernels/internal/types.h
@@ -1,4 +1,4 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -15,6 +15,9 @@ limitations under the License.
 #ifndef TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TYPES_H_
 #define TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TYPES_H_
 
+#include <cstring>
+#include <iterator>
+
 #include "tensorflow/contrib/lite/kernels/internal/compatibility.h"
 
 namespace tflite {
@@ -44,6 +47,101 @@ struct Dims {
   int strides[N];
 };
 
+class RuntimeShape {
+ public:
+  // Shapes with dimensions up to 4 are stored directly in the structure, while
+  // larger shapes are separately allocated.
+  static constexpr int kMaxSmallSize = 4;
+
+  RuntimeShape() : size_(0) {}
+
+  explicit RuntimeShape(int dimensions_count) : size_(dimensions_count) {
+    if (dimensions_count > kMaxSmallSize) {
+      dims_pointer_ = new int32[dimensions_count];
+    }
+  }
+
+  RuntimeShape(int dimensions_count, const int32* dims_data) : size_(0) {
+    ReplaceWith(dimensions_count, dims_data);
+  }
+
+  ~RuntimeShape() {
+    if (size_ > kMaxSmallSize) {
+      delete[] dims_pointer_;
+    }
+  }
+
+  inline const int32 DimensionsCount() const { return size_; }
+  inline const int32 Dims(int i) const {
+    TFLITE_DCHECK_GE(i, 0);
+    TFLITE_DCHECK_LT(i, size_);
+    return size_ > kMaxSmallSize ? dims_pointer_[i] : dims_[i];
+  }
+  inline void SetDim(int i, int32 val) {
+    TFLITE_DCHECK_GE(i, 0);
+    TFLITE_DCHECK_LT(i, size_);
+    if (size_ > kMaxSmallSize) {
+      dims_pointer_[i] = val;
+    } else {
+      dims_[i] = val;
+    }
+  }
+  inline int32* DimsData() {
+    return size_ > kMaxSmallSize ? dims_pointer_ : dims_;
+  }
+  inline const int32* DimsData() const {
+    return size_ > kMaxSmallSize ? dims_pointer_ : dims_;
+  }
+
+  inline void Resize(int dimensions_count) {
+    if (size_ > kMaxSmallSize) {
+      delete[] dims_pointer_;
+    }
+    size_ = dimensions_count;
+    if (dimensions_count > kMaxSmallSize) {
+      dims_pointer_ = new int32[dimensions_count];
+    }
+  }
+
+  inline void ReplaceWith(int dimensions_count, const int32* dims_data) {
+    Resize(dimensions_count);
+    int32* dst_dims = DimsData();
+    std::memcpy(dst_dims, dims_data, dimensions_count * sizeof(int32));
+  }
+
+  template <typename T>
+  inline void BuildFrom(const T& src_iterable) {
+    const int dimensions_count =
+        std::distance(src_iterable.begin(), src_iterable.end());
+    Resize(dimensions_count);
+    int32* data = DimsData();
+    for (auto it : src_iterable) {
+      *data = it;
+      ++data;
+    }
+  }
+
+  // Returns the total count of elements, that is the size when flattened into a
+  // vector.
+  inline const int FlatSize() const {
+    int buffer_size = 1;
+    const int* dims_data = DimsData();
+    for (int i = 0; i < size_; i++) {
+      const int dim = dims_data[i];
+      TFLITE_DCHECK_GE(dim, 1);
+      buffer_size *= dim;
+    }
+    return buffer_size;
+  }
+
+ private:
+  int32 size_;
+  union {
+    int32 dims_[kMaxSmallSize];
+    int32* dims_pointer_;
+  };
+};
+
 // Gets next index to iterate through a multidimensional array.
 inline bool NextIndex(const int num_dims, const int* dims, int* current) {
   TFLITE_DCHECK_GT(num_dims, 0);
-- 
GitLab


From 3ff633d9797d173d65523453de589cbbcf6e32ce Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 31 May 2018 10:20:00 -0700
Subject: [PATCH 102/610] Suppress generation of the proto API's descriptor()
 method, it conflicts with the field name.

PiperOrigin-RevId: 198739727
---
 tensorflow/tools/api/lib/api_objects.proto | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tensorflow/tools/api/lib/api_objects.proto b/tensorflow/tools/api/lib/api_objects.proto
index 7dcde0bbc3..7207b9c5a9 100644
--- a/tensorflow/tools/api/lib/api_objects.proto
+++ b/tensorflow/tools/api/lib/api_objects.proto
@@ -27,6 +27,10 @@ message TFAPIClass {
 };
 
 message TFAPIProto {
+  // Suppress generation of the proto API's descriptor() method lest it
+  // conflict with the standard accessor for the field having the same name.
+  option no_standard_descriptor_accessor = true;
+
   optional google.protobuf.DescriptorProto descriptor = 1;
 };
 
-- 
GitLab


From 0d697e5fc4c05c699eea0764364104ea500ccc68 Mon Sep 17 00:00:00 2001
From: Jesse Benson <jessebenson@users.noreply.github.com>
Date: Thu, 31 May 2018 10:35:15 -0700
Subject: [PATCH 103/610] Build libtensorflow.so and libtensorflow_framework.so
 for Raspberry Pi. (#18892)

---
 tensorflow/tools/ci_build/pi/build_raspberry_pi.sh | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh b/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh
index e27e33c2de..cbd4a93e6d 100755
--- a/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh
+++ b/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh
@@ -103,6 +103,8 @@ bazel build -c opt ${PI_COPTS} \
   --crosstool_top=@local_config_arm_compiler//:toolchain \
   --verbose_failures \
   --distinct_host_configuration=true \
+  //tensorflow:libtensorflow.so \
+  //tensorflow:libtensorflow_framework.so \
   //tensorflow/tools/benchmark:benchmark_model \
   //tensorflow/tools/pip_package:build_pip_package
 
@@ -119,6 +121,8 @@ SUB='s/tensorflow-([^-]+)-([^-]+)-.*/tensorflow-\1-\2-none-'${WHEEL_ARCH}'.whl/;
 NEW_FN=$(echo "${OLD_FN}" | perl -ne "${SUB}")
 mv "${OUTDIR}/${OLD_FN}" "${OUTDIR}/${NEW_FN}"
 cp bazel-bin/tensorflow/tools/benchmark/benchmark_model "${OUTDIR}"
+cp bazel-bin/tensorflow/libtensorflow.so "${OUTDIR}"
+cp bazel-bin/tensorflow/libtensorflow_framework.so "${OUTDIR}"
 
 echo "Output can be found here:"
 find "${OUTDIR}"
-- 
GitLab


From f50b61fffb7a65688899a625b689387653c5c798 Mon Sep 17 00:00:00 2001
From: Dan Moldovan <mdan@google.com>
Date: Thu, 31 May 2018 10:33:53 -0700
Subject: [PATCH 104/610] Initial implementation of a few of the list-specific
 operators. This introduces an abstraction for a dispatch context, which
 allows passing local information through to the specialized operators.

PiperOrigin-RevId: 198742074
---
 tensorflow/contrib/autograph/operators/BUILD  |  12 +-
 .../contrib/autograph/operators/__init__.py   |  13 +
 .../autograph/operators/data_structures.py    | 249 ++++++++++++++++--
 .../operators/data_structures_test.py         |  87 +++++-
 .../contrib/autograph/operators/slices.py     | 133 ++++++++++
 .../autograph/operators/slices_test.py        |  51 ++++
 6 files changed, 518 insertions(+), 27 deletions(-)
 create mode 100644 tensorflow/contrib/autograph/operators/slices.py
 create mode 100644 tensorflow/contrib/autograph/operators/slices_test.py

diff --git a/tensorflow/contrib/autograph/operators/BUILD b/tensorflow/contrib/autograph/operators/BUILD
index 18bfec5d9c..0c6ab65505 100644
--- a/tensorflow/contrib/autograph/operators/BUILD
+++ b/tensorflow/contrib/autograph/operators/BUILD
@@ -22,7 +22,7 @@ py_library(
         "__init__.py",
         "control_flow.py",
         "data_structures.py",
-        "dispatch_context.py",
+        "slices.py",
     ],
     srcs_version = "PY2AND3",
     visibility = ["//tensorflow:__subpackages__"],
@@ -52,3 +52,13 @@ py_test(
         "//tensorflow/python:client_testlib",
     ],
 )
+
+py_test(
+    name = "slices_test",
+    srcs = ["slices_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":operators",
+        "//tensorflow/python:client_testlib",
+    ],
+)
diff --git a/tensorflow/contrib/autograph/operators/__init__.py b/tensorflow/contrib/autograph/operators/__init__.py
index 38b761d97d..c900fd6af2 100644
--- a/tensorflow/contrib/autograph/operators/__init__.py
+++ b/tensorflow/contrib/autograph/operators/__init__.py
@@ -28,6 +28,10 @@ closures for the body.
 #    - the names used in the Python docs, if the operator is a function (e.g.
 #      list_ and x for append, see
 #      https://docs.python.org/3.7/tutorial/datastructures.html)
+#
+# All operators may accept a final argument named "opts", of a type that
+# subclasses namedtuple and contains any arguments that are only required
+# for some specializations of the operator.
 
 from __future__ import absolute_import
 from __future__ import division
@@ -35,3 +39,12 @@ from __future__ import print_function
 
 from tensorflow.contrib.autograph.operators.control_flow import for_stmt
 from tensorflow.contrib.autograph.operators.control_flow import while_stmt
+from tensorflow.contrib.autograph.operators.data_structures import list_append
+from tensorflow.contrib.autograph.operators.data_structures import list_pop
+from tensorflow.contrib.autograph.operators.data_structures import list_stack
+from tensorflow.contrib.autograph.operators.data_structures import ListPopOpts
+from tensorflow.contrib.autograph.operators.data_structures import ListStackOpts
+from tensorflow.contrib.autograph.operators.data_structures import new_list
+from tensorflow.contrib.autograph.operators.slices import get_item
+from tensorflow.contrib.autograph.operators.slices import GetItemOpts
+from tensorflow.contrib.autograph.operators.slices import set_item
diff --git a/tensorflow/contrib/autograph/operators/data_structures.py b/tensorflow/contrib/autograph/operators/data_structures.py
index c862306baa..06d8727b0f 100644
--- a/tensorflow/contrib/autograph/operators/data_structures.py
+++ b/tensorflow/contrib/autograph/operators/data_structures.py
@@ -18,39 +18,250 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import collections
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import list_ops
 from tensorflow.python.ops import tensor_array_ops
+from tensorflow.python.ops import variables
+
+
+# TODO(mdan): Once control flow supports objects, repackage as a class.
+
+
+def new_list(iterable=None):
+  """The list constructor.
+
+  Args:
+    iterable: Optional elements to fill the list with.
+
+  Returns:
+    A list-like object. The exact return value depends on the initial elements.
+  """
+  if iterable:
+    elements = tuple(iterable)
+  else:
+    elements = ()
+
+  # TODO(mdan): Extend these criteria.
+  if any(isinstance(el, variables.Variable) for el in elements):
+    return _py_list_new(elements)
+  return _tf_tensor_list_new(elements)
 
-# TODO(mdan): Add support for TensorList once functional.
-# TODO(mdan): Add primitives for empty list, list with elements.
 
+def _tf_tensor_list_new(elements):
+  """Overload of new_list that stages a Tensor list creation."""
+  elements = tuple(ops.convert_to_tensor(el) for el in elements)
+  all_dtypes = set(el.dtype for el in elements)
+  if len(all_dtypes) == 1:
+    element_dtype = tuple(all_dtypes)[0]
+  else:
+    # Heterogeneous lists are ok.
+    element_dtype = dtypes.variant
+
+  # TODO(mdan): This may fail for elements of variable shapes.
+  all_shapes = set(tuple(el.shape.as_list()) for el in elements)
+  if len(all_shapes) == 1:
+    element_shape = array_ops.shape(elements[0])
+  else:
+    # Heterogeneous lists are ok.
+    element_shape = constant_op.constant(-1)  # unknown shape, by convention
+
+  l = list_ops.empty_tensor_list(
+      element_shape=element_shape, element_dtype=element_dtype)
+  for el in elements:
+    l = list_ops.tensor_list_push_back(l, el)
+  return l
 
-def append(target, element):
+
+def _py_list_new(elements):
+  """Overload of new_list that creates a Python list."""
+  return list(elements)
+
+
+def list_append(list_, x):
   """The list append function.
 
-  Note: it is unspecified where target will be mutated or not. If target is
-  a TensorFlow entity, it will not be typically mutated. If target is a plain
-  list, it will be. In general, if the target is mutated then the return value
+  Note: it is unspecified whether list_ will be mutated or not. If list_ is
+  a TensorFlow entity, it will not be typically mutated. If list_ is a plain
+  list, it will be. In general, if the list is mutated then the return value
   should point to the original entity.
 
   Args:
-    target: An entity that supports append semantics.
-    element: The element to append.
+    list_: An entity that supports append semantics.
+    x: The element to append.
 
   Returns:
-    Same as target, after the append was performed.
+    Same as list_, after the append was performed.
+
+  Raises:
+    ValueError: if list_ is not of a known list-like type.
   """
-  if isinstance(target, tensor_array_ops.TensorArray):
-    return _tf_tensorarray_append(target, element)
+  if isinstance(list_, tensor_array_ops.TensorArray):
+    return _tf_tensorarray_append(list_, x)
+  elif tensor_util.is_tensor(list_):
+    if list_.dtype == dtypes.variant:
+      return _tf_tensor_list_append(list_, x)
+    else:
+      raise ValueError(
+          'tensor lists are expected to be Tensors with dtype=tf.variant,'
+          ' instead found %s' % list_)
   else:
-    return _py_append(target, element)
+    return _py_list_append(list_, x)
+
+
+def _tf_tensor_list_append(list_, x):
+  """Overload of list_append that stages a Tensor list write."""
+  def empty_list_of_elements_like_x():
+    tensor_x = ops.convert_to_tensor(x)
+    return list_ops.empty_tensor_list(
+        element_shape=array_ops.shape(tensor_x),
+        element_dtype=tensor_x.dtype)
+
+  list_ = control_flow_ops.cond(
+      list_ops.tensor_list_length(list_) > 0,
+      lambda: list_,
+      empty_list_of_elements_like_x,
+  )
+  return list_ops.tensor_list_push_back(list_, x)
+
+
+def _tf_tensorarray_append(list_, x):
+  """Overload of list_append that stages a TensorArray write."""
+  return list_.write(list_.size(), x)
+
+
+def _py_list_append(list_, x):
+  """Overload of list_append that executes a Python list append."""
+  # Revert to the original call.
+  list_.append(x)
+  return list_
+
+
+class ListPopOpts(
+    collections.namedtuple('ListPopOpts', ('element_dtype', 'element_shape'))):
+  pass
+
+
+def list_pop(list_, i, opts):
+  """The list pop function.
+
+  Note: it is unspecified whether list_ will be mutated or not. If list_ is
+  a TensorFlow entity, it will not be typically mutated. If list_ is a plain
+  list, it will be. In general, if the list is mutated then the return value
+  should point to the original entity.
+
+  Args:
+    list_: An entity that supports pop semantics.
+    i: Optional index to pop from. May be None.
+    opts: A ListPopOpts.
+
+  Returns:
+    Tuple (out_list_, x):
+      out_list_: same as list_, after the removal was performed.
+      x: the removed element value.
+
+  Raises:
+    ValueError: if list_ is not of a known list-like type or the operation is
+    not supported for that type.
+  """
+  assert isinstance(opts, ListPopOpts)
+
+  if isinstance(list_, tensor_array_ops.TensorArray):
+    raise ValueError('TensorArray does not support item removal')
+  elif tensor_util.is_tensor(list_):
+    if list_.dtype == dtypes.variant:
+      return _tf_tensor_list_pop(list_, i, opts)
+    else:
+      raise ValueError(
+          'tensor lists are expected to be Tensors with dtype=tf.variant,'
+          ' instead found %s' % list_)
+  else:
+    return _py_list_pop(list_, i)
+
+
+def _tf_tensor_list_pop(list_, i, opts):
+  """Overload of list_pop that stages a Tensor list pop."""
+  if i is not None:
+    raise NotImplementedError('tensor lists only support removing from the end')
+
+  if opts.element_dtype is None:
+    raise ValueError('cannot pop from a list without knowing its element '
+                     'type; use set_element_type to annotate it')
+  if opts.element_shape is None:
+    raise ValueError('cannot pop from a list without knowing its element '
+                     'shape; use set_element_type to annotate it')
+  list_out, x = list_ops.tensor_list_pop_back(
+      list_, element_dtype=opts.element_dtype)
+  x.set_shape(opts.element_shape)
+  return list_out, x
+
+
+def _py_list_pop(list_, i):
+  """Overload of list_pop that executes a Python list pop."""
+  if i is None:
+    x = list_.pop()
+  else:
+    x = list_.pop(i)
+  return list_, x
+
+
+# TODO(mdan): Look into reducing duplication between all these containers.
+class ListStackOpts(
+    collections.namedtuple('ListStackOpts',
+                           ('element_dtype', 'original_call'))):
+  pass
+
+
+def list_stack(list_, opts):
+  """The list stack function.
+
+  This does not have a direct correspondent in Python. The closest idiom to
+  this is tf.stack or np.stack. It's different from those in the sense that it
+  accepts a Tensor list, rather than a list of tensors. It can also accept
+  TensorArray. When the target is anything else, the dispatcher will rely on
+  opts.original_call for fallback.
+
+  Args:
+    list_: An entity that supports append semantics.
+    opts: A ListStackOpts object.
+
+  Returns:
+    The output of the stack operation, typically a Tensor.
+  """
+  assert isinstance(opts, ListStackOpts)
+
+  if isinstance(list_, tensor_array_ops.TensorArray):
+    return _tf_tensorarray_stack(list_)
+  elif tensor_util.is_tensor(list_):
+    if list_.dtype == dtypes.variant:
+      return _tf_tensor_list_stack(list_, opts)
+    else:
+      # No-op for primitive Tensor arguments.
+      return list_
+  else:
+    return _py_list_stack(list_, opts)
+
+
+def _tf_tensorarray_stack(list_):
+  """Overload of list_stack that stages a TensorArray stack."""
+  return list_.stack()
 
 
-def _tf_tensorarray_append(target, element):
-  """Overload of append that stages a TensorArray write at the last position."""
-  return target.write(target.size(), element)
+def _tf_tensor_list_stack(list_, opts):
+  """Overload of list_stack that stages a Tensor list stack."""
+  if opts.element_dtype is None:
+    raise ValueError('cannot stack a list without knowing its element type;'
+                     ' use set_element_type to annotate it')
+  return list_ops.tensor_list_stack(list_, element_dtype=opts.element_dtype)
 
 
-def _py_append(target, element):
-  """Overload of append that executes a Python list append."""
-  target.append(element)
-  return target
+def _py_list_stack(list_, opts):
+  """Overload of list_stack that falls back to the original Python call."""
+  # Revert to the original call.
+  return opts.original_call(list_)
diff --git a/tensorflow/contrib/autograph/operators/data_structures_test.py b/tensorflow/contrib/autograph/operators/data_structures_test.py
index 577d28c34d..8bbb52d6c1 100644
--- a/tensorflow/contrib/autograph/operators/data_structures_test.py
+++ b/tensorflow/contrib/autograph/operators/data_structures_test.py
@@ -19,25 +19,98 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib.autograph.operators import data_structures
+from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import list_ops
 from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.platform import test
 
 
-class AppendTest(test.TestCase):
+class ListTest(test.TestCase):
 
-  def test_tf_tensorarray(self):
+  def test_new_list_empty(self):
+    l = data_structures.new_list()
+    # Can't evaluate an empty list.
+    # TODO(mdan): sess.run should allow tf.variant maybe?
+    self.assertTrue(isinstance(l, ops.Tensor))
+
+  def test_new_list_tensor(self):
+    l = data_structures.new_list([3, 4, 5])
+    t = list_ops.tensor_list_stack(l, element_dtype=dtypes.int32)
+    with self.test_session() as sess:
+      self.assertAllEqual(sess.run(t), [3, 4, 5])
+
+  def test_append_tensor_list(self):
+    l = data_structures.new_list()
+    x = constant_op.constant([1, 2, 3])
+    l = data_structures.list_append(l, x)
+
+    t = list_ops.tensor_list_stack(l, element_dtype=x.dtype)
+    with self.test_session() as sess:
+      self.assertAllEqual(sess.run(t), [[1, 2, 3]])
+
+  def test_append_tensorarray(self):
     l = tensor_array_ops.TensorArray(dtypes.int32, size=0, dynamic_size=True)
-    l1 = data_structures.append(l, 1)
-    l2 = data_structures.append(l1, 2)
+    l1 = data_structures.list_append(l, 1)
+    l2 = data_structures.list_append(l1, 2)
     with self.test_session() as sess:
       self.assertAllEqual(sess.run(l1.stack()), [1])
       self.assertAllEqual(sess.run(l2.stack()), [1, 2])
 
-  def test_python(self):
+  def test_append_python(self):
     l = []
-    self.assertAllEqual(data_structures.append(l, 1), [1])
-    self.assertAllEqual(data_structures.append(l, 2), [1, 2])
+    self.assertAllEqual(data_structures.list_append(l, 1), [1])
+    self.assertAllEqual(data_structures.list_append(l, 2), [1, 2])
+
+  def test_pop_tensor_list(self):
+    initial_list = constant_op.constant([[1, 2], [3, 4]])
+    elem_shape = constant_op.constant([2])
+    l = list_ops.tensor_list_from_tensor(initial_list, element_shape=elem_shape)
+
+    opts = data_structures.ListPopOpts(
+        element_dtype=initial_list.dtype,
+        element_shape=(2,))
+
+    with self.assertRaises(NotImplementedError):
+      data_structures.list_pop(l, 0, opts)
+
+    with self.test_session() as sess:
+      l, x = data_structures.list_pop(l, None, opts)
+      self.assertAllEqual(sess.run(x), [3, 4])
+
+      t = list_ops.tensor_list_stack(l, element_dtype=initial_list.dtype)
+      self.assertAllEqual(sess.run(t), [[1, 2]])
+
+  def test_pop_python(self):
+    l = [1, 2, 3]
+    opts = data_structures.ListPopOpts(element_dtype=None, element_shape=())
+    self.assertAllEqual(data_structures.list_pop(l, None, opts), ([1, 2], 3))
+    self.assertAllEqual(data_structures.list_pop(l, None, opts), ([1], 2))
+
+  def test_stack_tensor_list(self):
+    initial_list = constant_op.constant([[1, 2], [3, 4]])
+    elem_shape = constant_op.constant([2])
+    l = list_ops.tensor_list_from_tensor(initial_list, element_shape=elem_shape)
+
+    opts = data_structures.ListStackOpts(
+        element_dtype=initial_list.dtype, original_call=None)
+
+    with self.test_session() as sess:
+      t = data_structures.list_stack(l, opts)
+      self.assertAllEqual(sess.run(t), sess.run(initial_list))
+
+  def test_stack_fallback(self):
+
+    def dummy_function(l):
+      # Lazy person's mock: just transform the argument in a way in which we
+      # can check that this function was indeed called.
+      return [x * 2 for x in l]
+
+    opts = data_structures.ListStackOpts(
+        element_dtype=None, original_call=dummy_function)
+
+    self.assertAllEqual(data_structures.list_stack([1, 2], opts), [2, 4])
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/contrib/autograph/operators/slices.py b/tensorflow/contrib/autograph/operators/slices.py
new file mode 100644
index 0000000000..04fbeb2f6e
--- /dev/null
+++ b/tensorflow/contrib/autograph/operators/slices.py
@@ -0,0 +1,133 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Operators specific to slicing operations."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import list_ops
+from tensorflow.python.ops import tensor_array_ops
+
+
+# TODO(mdan): Support extended slices.
+
+
+class GetItemOpts(collections.namedtuple('GetItemOpts', ('element_dtype',))):
+  pass
+
+
+def get_item(target, i, opts):
+  """The slice read operator (i.e. __getitem__).
+
+  Note: it is unspecified whether target will be mutated or not. In general,
+  if target is mutable (like Python lists), it will be mutated.
+
+  Args:
+    target: An entity that supports getitem semantics.
+    i: Index to read from.
+    opts: A GetItemOpts object.
+
+  Returns:
+    The read element.
+
+  Raises:
+    ValueError: if target is a tensor list and opts.element_dtype is None.
+  """
+  assert isinstance(opts, GetItemOpts)
+
+  if isinstance(target, tensor_array_ops.TensorArray):
+    return _tf_tensorarray_get_item(target, i)
+  elif tensor_util.is_tensor(target):
+    if target.dtype == dtypes.variant:
+      return _tf_tensor_list_get_item(target, i, opts)
+    else:
+      return _tf_tensor_get_item(target, i)
+  else:
+    return _py_get_item(target, i)
+
+
+def _tf_tensorarray_get_item(target, i):
+  """Overload of get_item that stages a TensorArray read."""
+  return target.read(i)
+
+
+def _tf_tensor_list_get_item(target, i, opts):
+  """Overload of get_item that stages a Tensor list read."""
+  if opts.element_dtype is None:
+    raise ValueError('cannot retrieve from a list without knowing its '
+                     'element type; use set_element_type to annotate it')
+  x = list_ops.tensor_list_get_item(target, i, element_dtype=opts.element_dtype)
+  return x
+
+
+def _tf_tensor_get_item(target, i):
+  """Overload of get_item that stages a Tensor (not Tensor list) read."""
+  return target[i]
+
+
+def _py_get_item(target, i):
+  """Overload of get_item that executes a Python list read."""
+  return target[i]
+
+
+def set_item(target, i, x):
+  """The slice write operator (i.e. __setitem__).
+
+  Note: it is unspecified whether target will be mutated or not. In general,
+  if target is mutable (like Python lists), it will be mutated.
+
+  Args:
+    target: An entity that supports setitem semantics.
+    i: Index to modify.
+    x: The new element value.
+
+  Returns:
+    Same as target, after the update was performed.
+
+  Raises:
+    ValueError: if target is not of a supported type.
+  """
+  if isinstance(target, tensor_array_ops.TensorArray):
+    return _tf_tensorarray_set_item(target, i, x)
+  elif tensor_util.is_tensor(target):
+    if target.dtype == dtypes.variant:
+      return _tf_tensor_list_set_item(target, i, x)
+    else:
+      raise ValueError(
+          'tensor lists are expected to be Tensors with dtype=tf.variant,'
+          ' instead found %s' % target)
+  else:
+    return _py_set_item(target, i, x)
+
+
+def _tf_tensorarray_set_item(target, i, x):
+  """Overload of set_item that stages a TensorArray write."""
+  return target.write(i, x)
+
+
+def _tf_tensor_list_set_item(target, i, x):
+  """Overload of set_item that stages a Tensor list update."""
+  return list_ops.tensor_list_set_item(target, i, x)
+
+
+def _py_set_item(target, i, x):
+  """Overload of set_item that executes a Python list modification."""
+  target[i] = x
+  return target
diff --git a/tensorflow/contrib/autograph/operators/slices_test.py b/tensorflow/contrib/autograph/operators/slices_test.py
new file mode 100644
index 0000000000..d4aacb9d20
--- /dev/null
+++ b/tensorflow/contrib/autograph/operators/slices_test.py
@@ -0,0 +1,51 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for slices module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.autograph.operators import slices
+from tensorflow.python.framework import constant_op
+from tensorflow.python.ops import list_ops
+from tensorflow.python.platform import test
+
+
+class SlicesTest(test.TestCase):
+
+  def test_set_item_tensor_list(self):
+    initial_list = constant_op.constant([[1, 2], [3, 4]])
+    elem_shape = constant_op.constant([2])
+    l = list_ops.tensor_list_from_tensor(initial_list, element_shape=elem_shape)
+    l = slices.set_item(l, 0, [5, 6])
+
+    with self.test_session() as sess:
+      t = list_ops.tensor_list_stack(l, element_dtype=initial_list.dtype)
+      self.assertAllEqual(sess.run(t), [[5, 6], [3, 4]])
+
+  def test_get_item_tensor_list(self):
+    initial_list = constant_op.constant([[1, 2], [3, 4]])
+    elem_shape = constant_op.constant([2])
+    l = list_ops.tensor_list_from_tensor(initial_list, element_shape=elem_shape)
+    t = slices.get_item(
+        l, 1, slices.GetItemOpts(element_dtype=initial_list.dtype))
+
+    with self.test_session() as sess:
+      self.assertAllEqual(sess.run(t), [3, 4])
+
+
+if __name__ == '__main__':
+  test.main()
-- 
GitLab


From 38a2a66fa996e20fabfabd4d07505c2daef7ef95 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 31 May 2018 10:39:33 -0700
Subject: [PATCH 105/610] [XLA] Redesign: delete computation_tracker and
 user_computation.

PiperOrigin-RevId: 198743117
---
 tensorflow/compiler/xla/service/BUILD         |   67 -
 .../xla/service/buffer_assignment_test.cc     |    6 +-
 .../compiler/xla/service/channel_tracker.h    |    1 -
 .../xla/service/compile_only_service.cc       |    1 -
 .../xla/service/computation_tracker.cc        |  256 --
 .../xla/service/computation_tracker.h         |  147 -
 .../compiler/xla/service/local_service.cc     |    2 -
 tensorflow/compiler/xla/service/service.cc    |  192 +-
 tensorflow/compiler/xla/service/service.h     |   47 +-
 .../compiler/xla/service/user_computation.cc  | 3557 -----------------
 .../compiler/xla/service/user_computation.h   |  413 --
 .../xla/service/user_computation_test.cc      |  340 --
 tensorflow/compiler/xla/tools/BUILD           |    1 -
 .../xla/tools/dumped_computation_to_text.cc   |    1 -
 14 files changed, 10 insertions(+), 5021 deletions(-)
 delete mode 100644 tensorflow/compiler/xla/service/computation_tracker.cc
 delete mode 100644 tensorflow/compiler/xla/service/computation_tracker.h
 delete mode 100644 tensorflow/compiler/xla/service/user_computation.cc
 delete mode 100644 tensorflow/compiler/xla/service/user_computation.h
 delete mode 100644 tensorflow/compiler/xla/service/user_computation_test.cc

diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index cd3d55e4f9..b954bbd20a 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -547,45 +547,6 @@ tf_cc_test(
     ],
 )
 
-cc_library(
-    name = "user_computation",
-    srcs = ["user_computation.cc"],
-    hdrs = ["user_computation.h"],
-    deps = [
-        ":hlo",
-        ":session_proto",
-        ":shape_inference",
-        ":versioned_computation_handle",
-        "//tensorflow/compiler/xla:literal_util",
-        "//tensorflow/compiler/xla:shape_util",
-        "//tensorflow/compiler/xla:status_macros",
-        "//tensorflow/compiler/xla:statusor",
-        "//tensorflow/compiler/xla:types",
-        "//tensorflow/compiler/xla:util",
-        "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla:xla_proto",
-        "//tensorflow/core:lib",
-    ],
-)
-
-tf_cc_test(
-    name = "user_computation_test",
-    srcs = ["user_computation_test.cc"],
-    deps = [
-        ":hlo_matchers",
-        ":user_computation",
-        "//tensorflow/compiler/xla:literal_util",
-        "//tensorflow/compiler/xla:shape_util",
-        "//tensorflow/compiler/xla:status_macros",
-        "//tensorflow/compiler/xla:test",
-        "//tensorflow/compiler/xla:test_helpers",
-        "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/service:hlo",
-        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
-        "//tensorflow/core:test",
-    ],
-)
-
 cc_library(
     name = "platform_util",
     srcs = ["platform_util.cc"],
@@ -634,7 +595,6 @@ cc_library(
         ":compilation_cache",
         ":compiler",
         ":computation_layout",
-        ":computation_tracker",
         ":device_memory_allocator",
         ":executable",
         ":execution_tracker",
@@ -648,7 +608,6 @@ cc_library(
         ":session_proto",
         ":source_map_util",
         ":transfer_manager",
-        ":user_computation",
         ":versioned_computation_handle",
         "//tensorflow/compiler/xla:executable_run_options",
         "//tensorflow/compiler/xla:execution_options_util",
@@ -676,7 +635,6 @@ cc_library(
         ":backend",
         ":compiler",
         ":computation_layout",
-        ":computation_tracker",
         ":device_memory_allocator",
         ":executable",
         ":hlo",
@@ -685,7 +643,6 @@ cc_library(
         ":platform_util",
         ":service",
         ":shaped_buffer",
-        ":user_computation",
         ":versioned_computation_handle",
         "//tensorflow/compiler/xla:execution_options_util",
         "//tensorflow/compiler/xla:shape_layout",
@@ -710,7 +667,6 @@ cc_library(
         ":backend",
         ":compiler",
         ":computation_layout",
-        ":computation_tracker",
         ":platform_util",
         ":service",
         "//tensorflow/compiler/xla:status_macros",
@@ -905,25 +861,6 @@ cc_library(
     ],
 )
 
-cc_library(
-    name = "computation_tracker",
-    srcs = ["computation_tracker.cc"],
-    hdrs = ["computation_tracker.h"],
-    deps = [
-        ":hlo",
-        ":hlo_module_config",
-        ":session_proto",
-        ":user_computation",
-        ":versioned_computation_handle",
-        "//tensorflow/compiler/xla:status_macros",
-        "//tensorflow/compiler/xla:statusor",
-        "//tensorflow/compiler/xla:types",
-        "//tensorflow/compiler/xla:util",
-        "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/core:lib",
-    ],
-)
-
 cc_library(
     name = "channel_tracker",
     srcs = ["channel_tracker.cc"],
@@ -931,7 +868,6 @@ cc_library(
     deps = [
         ":hlo",
         ":session_proto",
-        ":user_computation",
         ":versioned_computation_handle",
         "//tensorflow/compiler/xla:status",
         "//tensorflow/compiler/xla:status_macros",
@@ -1038,7 +974,6 @@ tf_cc_test(
         ":buffer_assignment",
         ":buffer_value",
         ":call_graph",
-        ":computation_tracker",
         ":copy_insertion",
         ":cpu_plugin",
         ":flatten_call_graph",
@@ -1710,13 +1645,11 @@ tf_cc_test(
     name = "hlo_cost_analysis_test",
     srcs = ["hlo_cost_analysis_test.cc"],
     deps = [
-        ":computation_tracker",
         ":cpu_plugin",
         ":hlo",
         ":hlo_cost_analysis",
         ":local_service",
         ":service",
-        ":user_computation",
         ":versioned_computation_handle",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:statusor",
diff --git a/tensorflow/compiler/xla/service/buffer_assignment_test.cc b/tensorflow/compiler/xla/service/buffer_assignment_test.cc
index a4fb0eefac..bdcea92882 100644
--- a/tensorflow/compiler/xla/service/buffer_assignment_test.cc
+++ b/tensorflow/compiler/xla/service/buffer_assignment_test.cc
@@ -25,7 +25,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/buffer_value.h"
 #include "tensorflow/compiler/xla/service/call_graph.h"
-#include "tensorflow/compiler/xla/service/computation_tracker.h"
 #include "tensorflow/compiler/xla/service/copy_insertion.h"
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
 #include "tensorflow/compiler/xla/service/flatten_call_graph.h"
@@ -82,7 +81,7 @@ const std::vector<const HloInstruction*> GetInstructions(HloInstruction* root) {
 
 class BufferAssignmentTest : public HloTestBase {
  protected:
-  BufferAssignmentTest() : computation_tracker_() {}
+  BufferAssignmentTest() {}
   ~BufferAssignmentTest() override {}
 
   std::unique_ptr<BufferAssignment> RunBufferAssignment(HloModule* module,
@@ -252,9 +251,6 @@ class BufferAssignmentTest : public HloTestBase {
     return total_size;
   }
 
-  // Computation tracker for nested computations.
-  ComputationTracker computation_tracker_;
-
   // Shapes for use in the examples.
   Shape s32_ = ShapeUtil::MakeShape(xla::S32, {});
   Shape r0f32_ = ShapeUtil::MakeShape(xla::F32, {});
diff --git a/tensorflow/compiler/xla/service/channel_tracker.h b/tensorflow/compiler/xla/service/channel_tracker.h
index c7763f2ca3..e415fb27e6 100644
--- a/tensorflow/compiler/xla/service/channel_tracker.h
+++ b/tensorflow/compiler/xla/service/channel_tracker.h
@@ -20,7 +20,6 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/session.pb.h"
-#include "tensorflow/compiler/xla/service/user_computation.h"
 #include "tensorflow/compiler/xla/service/versioned_computation_handle.h"
 #include "tensorflow/compiler/xla/status.h"
 #include "tensorflow/compiler/xla/statusor.h"
diff --git a/tensorflow/compiler/xla/service/compile_only_service.cc b/tensorflow/compiler/xla/service/compile_only_service.cc
index c2e698a49f..d8fdccf9bb 100644
--- a/tensorflow/compiler/xla/service/compile_only_service.cc
+++ b/tensorflow/compiler/xla/service/compile_only_service.cc
@@ -22,7 +22,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/service/backend.h"
 #include "tensorflow/compiler/xla/service/computation_layout.h"
-#include "tensorflow/compiler/xla/service/computation_tracker.h"
 #include "tensorflow/compiler/xla/service/platform_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/types.h"
diff --git a/tensorflow/compiler/xla/service/computation_tracker.cc b/tensorflow/compiler/xla/service/computation_tracker.cc
deleted file mode 100644
index 70e25eebdb..0000000000
--- a/tensorflow/compiler/xla/service/computation_tracker.cc
+++ /dev/null
@@ -1,256 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/compiler/xla/service/computation_tracker.h"
-
-#include <list>
-#include <string>
-#include <utility>
-#include <vector>
-
-#include "tensorflow/compiler/xla/ptr_util.h"
-#include "tensorflow/compiler/xla/service/hlo_computation.h"
-#include "tensorflow/compiler/xla/status_macros.h"
-#include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/compiler/xla/util.h"
-#include "tensorflow/core/lib/strings/strcat.h"
-#include "tensorflow/core/lib/strings/stringprintf.h"
-#include "tensorflow/core/platform/logging.h"
-
-using ::tensorflow::strings::Appendf;
-
-namespace xla {
-
-ComputationTracker::ComputationTracker() : next_computation_(1) {}
-
-ComputationHandle ComputationTracker::NewComputation(
-    const string& computation_name) {
-  tensorflow::mutex_lock lock(computation_mutex_);
-  ComputationHandle computation_handle;
-  int64 handle_value = next_computation_++;
-  computation_handle.set_handle(handle_value);
-  opaque_to_computation_[handle_value] =
-      MakeUnique<UserComputation>(computation_name, computation_handle);
-  return computation_handle;
-}
-
-StatusOr<ComputationHandle> ComputationTracker::LoadSessionModule(
-    const SessionModule& session_module) {
-  tensorflow::mutex_lock lock(computation_mutex_);
-
-  // For each embedded computation, create a new computation based on its
-  // serialized data, and place the mapping from the old computation handle to
-  // the new computation handle.
-
-  // Build a mapping from old embedded computation handles to new computation
-  // handles. We build the ID mapping first since the embedded computations are
-  // in no particular order and may refer to each other.
-  std::map<int64, ComputationHandle> old_to_new;
-  for (const SessionComputation& computation :
-       session_module.embedded_computations()) {
-    const int64 old_handle = computation.computation_handle().handle();
-    if (!old_to_new.emplace(old_handle, AllocateHandle()).second) {
-      return InvalidArgument("Duplicate embedded computation handle %lld",
-                             old_handle);
-    }
-  }
-
-  // Create a new computation from each serialized embedded computation.
-  for (const SessionComputation& computation :
-       session_module.embedded_computations()) {
-    const int64 old_handle = computation.computation_handle().handle();
-    const ComputationHandle& new_handle = old_to_new[old_handle];
-    TF_ASSIGN_OR_RETURN(opaque_to_computation_[new_handle.handle()],
-                        UserComputation::MakeWithRemapping(
-                            computation, new_handle, old_to_new));
-  }
-
-  // Finally, place the entry computation in the tracker with all of the
-  // remappings populated from the above.
-  const int64 old_handle = session_module.entry().computation_handle().handle();
-  TF_ASSIGN_OR_RETURN(
-      old_to_new[old_handle],
-      LoadSessionComputation(session_module.entry(), &old_to_new));
-  return old_to_new[old_handle];
-}
-
-StatusOr<std::unique_ptr<SessionModule>>
-ComputationTracker::SnapshotComputation(const ComputationHandle& computation) {
-  TF_ASSIGN_OR_RETURN(UserComputation * user_computation, Resolve(computation));
-  const VersionedComputationHandle entry_versioned_handle =
-      user_computation->GetVersionedHandle();
-  std::set<VersionedComputationHandle> visited;
-  std::list<VersionedComputationHandle> post_order;
-  {
-    tensorflow::mutex_lock lock(computation_mutex_);
-    ComputeComputationPostOrder(entry_versioned_handle, &visited, &post_order);
-  }
-  auto session_module = MakeUnique<SessionModule>();
-  *session_module->mutable_entry() =
-      Resolve(entry_versioned_handle.handle)
-          .ValueOrDie()
-          ->CloneSessionComputation(entry_versioned_handle.version);
-  for (auto it = ++post_order.rbegin(); it != post_order.rend(); ++it) {
-    *session_module->add_embedded_computations() =
-        Resolve(it->handle).ValueOrDie()->CloneSessionComputation(it->version);
-  }
-  return std::move(session_module);
-}
-
-StatusOr<UserComputation*> ComputationTracker::Resolve(
-    const ComputationHandle& computation) const {
-  tensorflow::mutex_lock lock(computation_mutex_);
-  return ResolveInternal(computation);
-}
-
-ComputationHandle ComputationTracker::AllocateHandle() {
-  int64 handle_value = next_computation_++;
-  ComputationHandle result;
-  result.set_handle(handle_value);
-  return result;
-}
-
-StatusOr<ComputationHandle> ComputationTracker::LoadSessionComputation(
-    const SessionComputation& session_computation,
-    std::map<int64, ComputationHandle>* old_to_new) {
-  TF_RET_CHECK(old_to_new != nullptr);
-  const ComputationHandle new_handle = AllocateHandle();
-  (*old_to_new)[session_computation.computation_handle().handle()] = new_handle;
-  TF_ASSIGN_OR_RETURN(opaque_to_computation_[new_handle.handle()],
-                      UserComputation::MakeWithRemapping(
-                          session_computation, new_handle, *old_to_new));
-  return new_handle;
-}
-
-StatusOr<UserComputation*> ComputationTracker::ResolveInternal(
-    const ComputationHandle& computation) const {
-  auto it = opaque_to_computation_.find(computation.handle());
-  if (it == opaque_to_computation_.end()) {
-    return NotFound("computation handle not found: %lld", computation.handle());
-  }
-  UserComputation* user_computation = it->second.get();
-  return user_computation;
-}
-
-void ComputationTracker::ComputeComputationPostOrder(
-    const VersionedComputationHandle& versioned_handle,
-    std::set<VersionedComputationHandle>* visited,
-    std::list<VersionedComputationHandle>* post_order) const {
-  if (visited->count(versioned_handle) > 0) {
-    CHECK_EQ(1, visited->count(versioned_handle));
-    return;
-  }
-
-  UserComputation* computation =
-      ResolveInternal(versioned_handle.handle).ValueOrDie();
-  std::vector<VersionedComputationHandle> embedded_handles =
-      computation->GetEmbeddedComputations(versioned_handle.version);
-
-  for (const auto& embedded_handle : embedded_handles) {
-    ComputeComputationPostOrder(embedded_handle, visited, post_order);
-  }
-
-  visited->insert(versioned_handle);
-  post_order->push_back(versioned_handle);
-}
-
-StatusOr<std::unique_ptr<HloModule>> ComputationTracker::BuildHloModule(
-    const VersionedComputationHandle& entry_handle,
-    const HloModuleConfig& config,
-    bool include_unreachable_instructions) const {
-  tensorflow::mutex_lock lock(computation_mutex_);
-
-  VLOG(1) << "BuildHloModule(" << entry_handle
-          << ", include_unreachable_instructions="
-          << include_unreachable_instructions << ")";
-  XLA_VLOG_LINES(1, ToStringInternal());
-
-  TF_ASSIGN_OR_RETURN(UserComputation * entry_computation,
-                      ResolveInternal(entry_handle.handle));
-
-  // Build a topological sort of the entry and any embedded computations as a
-  // list. The root of the computation will be the last element in the list.
-  std::set<VersionedComputationHandle> visited;
-  std::list<VersionedComputationHandle> post_order;
-  ComputeComputationPostOrder(entry_handle, &visited, &post_order);
-
-  // Map from ComputationHandle value and computation version to HloComputation.
-  std::map<VersionedComputationHandle, HloComputation*> hlo_computations;
-
-  // The resolver lambda resolves VersionedHandles to embedded
-  // HloComputation*. This is required by UserComputation::BuildHloComputation
-  // when lowering calling operations (map, reduce etc).
-  auto resolver = [&hlo_computations](
-      const VersionedComputationHandle& versioned_handle) -> HloComputation* {
-    CHECK_GT(hlo_computations.count(versioned_handle), 0);
-    return hlo_computations.at(versioned_handle);
-  };
-
-  // Print the post-order list for this entry computation.
-  if (VLOG_IS_ON(2)) {
-    VLOG(2) << "Visiting UserComputations in post order:";
-    for (const VersionedComputationHandle& versioned_handle : post_order) {
-      VLOG(2) << "  " << versioned_handle;
-    }
-  }
-
-  string module_name =
-      tensorflow::strings::StrCat(entry_computation->name(), "_module");
-  auto module = MakeUnique<HloModule>(module_name, entry_handle, config);
-  for (auto versioned_handle : post_order) {
-    UserComputation* computation =
-        ResolveInternal(versioned_handle.handle).ValueOrDie();
-
-    TF_ASSIGN_OR_RETURN(
-        std::unique_ptr<HloComputation> hlo_computation,
-        computation->BuildHloComputation(versioned_handle.version, resolver,
-                                         config.debug_options(),
-                                         include_unreachable_instructions));
-
-    // Add the newly created computation to VersionedHandle-to-HloComputation
-    // map.
-    DCHECK_EQ(0, hlo_computations.count(versioned_handle));
-    hlo_computations[versioned_handle] = hlo_computation.get();
-
-    if (computation == entry_computation) {
-      module->AddEntryComputation(std::move(hlo_computation));
-    } else {
-      module->AddEmbeddedComputation(std::move(hlo_computation));
-    }
-  }
-
-  return std::move(module);
-}
-
-string ComputationTracker::ToString() const {
-  tensorflow::mutex_lock lock(computation_mutex_);
-  return ToStringInternal();
-}
-
-string ComputationTracker::ToStringInternal() const {
-  string out;
-  Appendf(&out, "ComputationTracker(%p):\n", this);
-  for (const auto& handle_computation : opaque_to_computation_) {
-    int64 handle = handle_computation.first;
-    const std::unique_ptr<UserComputation>& computation =
-        handle_computation.second;
-    Appendf(&out, "  %4lld : %s \"%s\"\n", handle,
-            computation->GetVersionedHandle().ToString().c_str(),
-            computation->name().c_str());
-  }
-  return out;
-}
-
-}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/computation_tracker.h b/tensorflow/compiler/xla/service/computation_tracker.h
deleted file mode 100644
index d42d66adef..0000000000
--- a/tensorflow/compiler/xla/service/computation_tracker.h
+++ /dev/null
@@ -1,147 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_COMPUTATION_TRACKER_H_
-#define TENSORFLOW_COMPILER_XLA_SERVICE_COMPUTATION_TRACKER_H_
-
-#include <list>
-#include <map>
-#include <memory>
-#include <set>
-#include <string>
-
-#include "tensorflow/compiler/xla/service/hlo_module.h"
-#include "tensorflow/compiler/xla/service/hlo_module_config.h"
-#include "tensorflow/compiler/xla/service/session.pb.h"
-#include "tensorflow/compiler/xla/service/user_computation.h"
-#include "tensorflow/compiler/xla/service/versioned_computation_handle.h"
-#include "tensorflow/compiler/xla/statusor.h"
-#include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
-#include "tensorflow/core/platform/macros.h"
-#include "tensorflow/core/platform/mutex.h"
-#include "tensorflow/core/platform/thread_annotations.h"
-#include "tensorflow/core/platform/types.h"
-
-namespace xla {
-
-// Tracks computations for the XLA service; computations can be registered
-// with a UserComputation instance and can be resolved from a handle for later
-// use.
-//
-// This class is also capable of serializing/deserializing computations that it
-// tracks (and to serialize properly you need to serialize all referred-to
-// computations as well).
-class ComputationTracker {
- public:
-  ComputationTracker();
-
-  // Creates a new UserComputation object and returns the corresponding
-  // ComputationHandle for it.
-  //
-  // Precondition: user_computation is not already present in the map.
-  ComputationHandle NewComputation(const string& computation_name);
-
-  // Restores session data for a computation that has been serialized, and
-  // allocates a new computation handle for it.
-  StatusOr<ComputationHandle> LoadSessionModule(
-      const SessionModule& session_module);
-
-  // Snapshots a computation (referenced by the provided handle) at its latest
-  // version, returning a module where it is the entry, and any referred-to
-  // computations are entrained as "embedded" (non-entry) computations.
-  StatusOr<std::unique_ptr<SessionModule>> SnapshotComputation(
-      const ComputationHandle& computation);
-
-  // Resolves a ComputationHandle to a UserComputation that is present in the
-  // map.
-  StatusOr<UserComputation*> Resolve(
-      const ComputationHandle& computation) const;
-
-  // Builds an HLO module using the specified computation as the entry. The
-  // module will include the entry computation as well as all computations which
-  // are called directly or indirectly from the entry computation via operations
-  // like "map". config is the HLO module configuration to use for the
-  // constructed module.
-  // If include_unreachable_instructions is true, then instructions
-  // which are not reachable from the root are lowered into HloInstructions
-  // including unreachable parameters. This ensures the entry HloComputation has
-  // the same program shape (ProgramShape) as the entry UserComputation.
-  StatusOr<std::unique_ptr<HloModule>> BuildHloModule(
-      const VersionedComputationHandle& entry_handle,
-      const HloModuleConfig& config,
-      bool include_unreachable_instructions = true) const;
-
-  string ToString() const;
-
- private:
-  // Bumps the next_computation_ number and returns the allocated number wrapped
-  // in a ComputationHandle.
-  ComputationHandle AllocateHandle()
-      EXCLUSIVE_LOCKS_REQUIRED(computation_mutex_);
-
-  // Loads a session computation into a UserComputation, registers it, and
-  // returns the computation handle of the registered computation. If old_to_new
-  // is provided, it is used for remapping references to computations present in
-  // session_computation.
-  //
-  // old_to_new will be updated with the mapping from session_computation's old
-  // handle to the returned handle value, and may not be null.
-  StatusOr<ComputationHandle> LoadSessionComputation(
-      const SessionComputation& session_computation,
-      std::map<int64, ComputationHandle>* old_to_new)
-      EXCLUSIVE_LOCKS_REQUIRED(computation_mutex_);
-
-  // Internal implementation of Resolve method which requires, but does not
-  // acquire the mutex.
-  StatusOr<UserComputation*> ResolveInternal(
-      const ComputationHandle& computation) const
-      EXCLUSIVE_LOCKS_REQUIRED(computation_mutex_);
-
-  // Builds a post order sort of a computation ("entry") and all of its embedded
-  // computations including all transitively embedded computations. An embedded
-  // computation (the callee) will always appear in the sort before the
-  // computation which calls the embedded computation (the caller). Necessarily,
-  // the entry computation is the last element in the sort. visited and
-  // post_order should be empty when calling. post_order contains the post order
-  // sort when the function return.
-  void ComputeComputationPostOrder(
-      const VersionedComputationHandle& versioned_handle,
-      std::set<VersionedComputationHandle>* visited,
-      std::list<VersionedComputationHandle>* post_order) const
-      EXCLUSIVE_LOCKS_REQUIRED(computation_mutex_);
-
-  string ToStringInternal() const EXCLUSIVE_LOCKS_REQUIRED(computation_mutex_);
-
-  // Guards the computation mapping. Marked mutable so that the Resolve method
-  // can remain const; Resolve does't really modify the tracker in any way, but
-  // it has to lock the mutex for safety.
-  mutable tensorflow::mutex computation_mutex_;
-
-  // The next sequence number to assign to a computation, guarded by the same
-  // mutex as the mapping as they'll be mutated at the same time.
-  int64 next_computation_ GUARDED_BY(computation_mutex_);
-
-  // Mapping from ComputationHandle value to the corresponding registered
-  // UserComputation object.
-  std::map<int64, std::unique_ptr<UserComputation>> opaque_to_computation_
-      GUARDED_BY(computation_mutex_);
-
-  TF_DISALLOW_COPY_AND_ASSIGN(ComputationTracker);
-};
-
-}  // namespace xla
-
-#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_COMPUTATION_TRACKER_H_
diff --git a/tensorflow/compiler/xla/service/local_service.cc b/tensorflow/compiler/xla/service/local_service.cc
index 968db7c76e..375c4a6780 100644
--- a/tensorflow/compiler/xla/service/local_service.cc
+++ b/tensorflow/compiler/xla/service/local_service.cc
@@ -24,14 +24,12 @@ limitations under the License.
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/backend.h"
 #include "tensorflow/compiler/xla/service/computation_layout.h"
-#include "tensorflow/compiler/xla/service/computation_tracker.h"
 #include "tensorflow/compiler/xla/service/executable.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_execution_profile.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_module_config.h"
 #include "tensorflow/compiler/xla/service/platform_util.h"
-#include "tensorflow/compiler/xla/service/user_computation.h"
 #include "tensorflow/compiler/xla/service/versioned_computation_handle.h"
 #include "tensorflow/compiler/xla/shape_layout.h"
 #include "tensorflow/compiler/xla/shape_util.h"
diff --git a/tensorflow/compiler/xla/service/service.cc b/tensorflow/compiler/xla/service/service.cc
index 79c098accb..82be6bcf4f 100644
--- a/tensorflow/compiler/xla/service/service.cc
+++ b/tensorflow/compiler/xla/service/service.cc
@@ -274,8 +274,7 @@ Service::ResolveAndValidateArguments(
 StatusOr<std::unique_ptr<HloModuleConfig>> Service::CreateModuleConfig(
     const ProgramShape& program_shape,
     tensorflow::gtl::ArraySlice<const Shape*> argument_shapes,
-    const ExecutionOptions* execution_options,
-    const UserComputation* user_computation) {
+    const ExecutionOptions* execution_options) {
   auto config = MakeUnique<HloModuleConfig>(program_shape);
   ComputationLayout* host_computation_layout =
       config->mutable_host_entry_computation_layout();
@@ -291,17 +290,9 @@ StatusOr<std::unique_ptr<HloModuleConfig>> Service::CreateModuleConfig(
     // ProgramShape.
     if (!ShapeUtil::Compatible(*argument_shapes[i],
                                program_shape.parameters(i))) {
-      if (user_computation == nullptr) {
-        return InvalidArgument(
-            "Argument does not match shape of computation parameter %d: want "
-            "%s, got %s",
-            i, ShapeUtil::HumanString(program_shape.parameters(i)).c_str(),
-            ShapeUtil::HumanString(*argument_shapes[i]).c_str());
-      }
-      return InvalidParameterArgument(
-          *user_computation->ParameterMetadata(i).value(),
-          "Argument does not match shape of computation parameter %d: want %s, "
-          "got %s",
+      return InvalidArgument(
+          "Argument does not match shape of computation parameter %d: want "
+          "%s, got %s",
           i, ShapeUtil::HumanString(program_shape.parameters(i)).c_str(),
           ShapeUtil::HumanString(*argument_shapes[i]).c_str());
     }
@@ -352,76 +343,12 @@ StatusOr<std::unique_ptr<HloModuleConfig>> Service::CreateModuleConfig(
 StatusOr<std::unique_ptr<HloModuleConfig>> Service::CreateModuleConfig(
     const ProgramShape& program_shape,
     tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
-    const ExecutionOptions& execution_options,
-    const UserComputation* user_computation) {
+    const ExecutionOptions& execution_options) {
   std::vector<const Shape*> argument_shapes;
   for (const auto* arg : arguments) {
     argument_shapes.push_back(&arg->on_host_shape());
   }
-  return CreateModuleConfig(program_shape, argument_shapes, &execution_options,
-                            user_computation);
-}
-
-StatusOr<std::vector<std::unique_ptr<Executable>>> Service::BuildExecutables(
-    std::vector<VersionedComputationHandle> versioned_handles,
-    std::vector<std::unique_ptr<HloModuleConfig>> module_configs,
-    Backend* backend, std::vector<std::vector<se::StreamExecutor*>> executors,
-    DeviceMemoryAllocator* device_allocator) {
-  VLOG(1) << Printf("BuildExecutable on service %p", this);
-
-  // Dump computation proto state if flag is set.
-  std::vector<std::unique_ptr<SessionModule>> session_modules;
-  for (int64 i = 0; i < versioned_handles.size(); ++i) {
-    const string& directory_path =
-        module_configs[i]->debug_options().xla_dump_computations_to();
-    const string& other_directory_path =
-        module_configs[i]->debug_options().xla_dump_executions_to();
-    if (directory_path.empty() && other_directory_path.empty()) {
-      continue;
-    }
-    TF_ASSIGN_OR_RETURN(
-        std::unique_ptr<SessionModule> session_module,
-        computation_tracker_.SnapshotComputation(versioned_handles[i].handle));
-    if (!directory_path.empty()) {
-      string filename = Printf("computation_%lld__%s__version_%lld",
-                               versioned_handles[i].handle.handle(),
-                               session_module->entry().name().c_str(),
-                               versioned_handles[i].version);
-      TF_RETURN_IF_ERROR(Executable::DumpToDirectory(directory_path, filename,
-                                                     *session_module));
-      session_modules.push_back(std::move(session_module));
-    }
-  }
-
-  VLOG(1) << "Computation handles:";
-  for (const VersionedComputationHandle& versioned_handle : versioned_handles) {
-    VLOG(1) << versioned_handle;
-  }
-
-  CHECK_EQ(versioned_handles.size(), module_configs.size());
-  std::vector<std::unique_ptr<HloModule>> modules;
-  for (int64 i = 0; i < versioned_handles.size(); ++i) {
-    const VersionedComputationHandle& versioned_handle = versioned_handles[i];
-    const HloModuleConfig& config = *module_configs[i];
-    TF_ASSIGN_OR_RETURN(auto module,
-                        computation_tracker_.BuildHloModule(
-                            versioned_handle, config,
-                            /*include_unreachable_instructions=*/true));
-    modules.push_back(std::move(module));
-  }
-
-  TF_ASSIGN_OR_RETURN(
-      std::vector<std::unique_ptr<Executable>> executables,
-      backend->compiler()->Compile(std::move(modules), std::move(executors),
-                                   device_allocator));
-
-  for (size_t i = 0; i < versioned_handles.size(); ++i) {
-    if (!module_configs[i]->debug_options().xla_dump_executions_to().empty()) {
-      executables[i]->set_session_module(std::move(session_modules[i]));
-    }
-  }
-
-  return std::move(executables);
+  return CreateModuleConfig(program_shape, argument_shapes, &execution_options);
 }
 
 StatusOr<std::vector<std::unique_ptr<Executable>>> Service::BuildExecutables(
@@ -498,98 +425,6 @@ Status Service::ValidateEntryComputationLayout(HloModule* module) {
   return Status::OK();
 }
 
-StatusOr<std::unique_ptr<Executable>> Service::BuildExecutable(
-    const VersionedComputationHandle& versioned_handle,
-    std::unique_ptr<HloModuleConfig> module_config, Backend* backend,
-    se::StreamExecutor* executor, DeviceMemoryAllocator* device_allocator) {
-  VLOG(1) << Printf("BuildExecutable on service %p with handle %s", this,
-                    versioned_handle.ToString().c_str());
-
-  // Dump computation proto state if flag is set.
-  std::unique_ptr<SessionModule> session_module;
-  const string& directory_path =
-      module_config->debug_options().xla_dump_computations_to();
-  const string& other_directory_path =
-      module_config->debug_options().xla_dump_executions_to();
-  if (!directory_path.empty() || !other_directory_path.empty()) {
-    TF_ASSIGN_OR_RETURN(
-        session_module,
-        computation_tracker_.SnapshotComputation(versioned_handle.handle));
-    if (!directory_path.empty()) {
-      string filename = Printf("computation_%lld__%s__version_%lld",
-                               versioned_handle.handle.handle(),
-                               session_module->entry().name().c_str(),
-                               versioned_handle.version);
-      TF_RETURN_IF_ERROR(Executable::DumpToDirectory(directory_path, filename,
-                                                     *session_module));
-    }
-  }
-
-  TF_ASSIGN_OR_RETURN(
-      std::unique_ptr<HloModule> module,
-      computation_tracker_.BuildHloModule(versioned_handle, *module_config,
-                                          /*include_unreachable_instructions=*/
-                                          true));
-
-  TF_RETURN_IF_ERROR(MaybeDumpHloModule(*module));
-
-  TF_ASSIGN_OR_RETURN(
-      module, backend->compiler()->RunHloPasses(std::move(module), executor,
-                                                device_allocator));
-  // Check that on-host and on-device shapes are consistent.
-  TF_RETURN_IF_ERROR(ValidateEntryComputationLayout(module.get()));
-
-  TF_ASSIGN_OR_RETURN(std::unique_ptr<Executable> executable,
-                      backend->compiler()->RunBackend(
-                          std::move(module), executor, device_allocator));
-
-  if (!other_directory_path.empty()) {
-    executable->set_session_module(std::move(session_module));
-  }
-
-  return std::move(executable);
-}
-
-StatusOr<std::shared_ptr<Executable>> Service::BuildAndCacheExecutable(
-    const VersionedComputationHandle& versioned_handle,
-    std::unique_ptr<HloModuleConfig> module_config, Backend* backend,
-    se::StreamExecutor* executor, ExecutionProfile* profile,
-    DeviceMemoryAllocator* device_allocator) {
-  std::shared_ptr<Executable> executable =
-      compilation_cache_.LookUp(versioned_handle, *module_config);
-
-  if (executable != nullptr) {
-    // Executable found in the computation cache.
-    if (profile != nullptr) {
-      profile->set_compilation_cache_hit(true);
-    }
-    return executable;
-  }
-
-  uint64 start_micros =
-      // Avoid reading the clock if we don't want timing info
-      (profile != nullptr) ? tensorflow::Env::Default()->NowMicros() : 0;
-
-  // Take a copy of the module config, as compilation introduces layouts where
-  // layouts were optional before.
-  HloModuleConfig original_module_config = *module_config;
-  TF_ASSIGN_OR_RETURN(
-      std::unique_ptr<Executable> executable_unique_ptr,
-      BuildExecutable(versioned_handle, std::move(module_config), backend,
-                      executor, device_allocator));
-
-  if (profile != nullptr) {
-    uint64 end_micros = tensorflow::Env::Default()->NowMicros();
-    uint64 milliseconds = (end_micros - start_micros) / 1000;
-    profile->set_compilation_cache_hit(false);
-    profile->set_compile_time_ms(milliseconds);
-  }
-
-  // Insert executable into the cache.
-  return compilation_cache_.Insert(std::move(executable_unique_ptr),
-                                   original_module_config);
-}
-
 StatusOr<std::vector<GlobalDataHandle>>
 Service::ExecuteParallelAndRegisterResult(
     tensorflow::gtl::ArraySlice<Executable*> executables,
@@ -882,8 +717,7 @@ Status Service::ExecuteGraphParallel(const ExecuteGraphParallelRequest* arg,
         std::unique_ptr<HloModuleConfig> module_config,
         CreateModuleConfig(request.computation().program_shape(),
                            replicated_arguments.front(),
-                           request.execution_options(),
-                           /*user_computation=*/nullptr));
+                           request.execution_options()));
     VLOG(3)
         << "ExecuteGraphParallel created HloModuleConfig computation layout: "
         << module_config->host_entry_computation_layout().ToString();
@@ -1340,18 +1174,6 @@ Status Service::GetComputationGraphStats(
   return Status::OK();
 }
 
-template <typename RequestT, typename ResponseT>
-Status Service::AddInstruction(
-    const RequestT* arg, ResponseT* result,
-    const std::function<StatusOr<ComputationDataHandle>(UserComputation*)>&
-        adder) {
-  TF_ASSIGN_OR_RETURN(UserComputation * computation,
-                      computation_tracker_.Resolve(arg->computation()));
-
-  TF_ASSIGN_OR_RETURN(*result->mutable_output(), adder(computation));
-  return Status::OK();
-}
-
 DeviceHandle Service::SingleComputationDeviceHandle() const {
   DeviceHandle device_handle;
   device_handle.set_handle(0);
diff --git a/tensorflow/compiler/xla/service/service.h b/tensorflow/compiler/xla/service/service.h
index b3c0eac9da..422bb95657 100644
--- a/tensorflow/compiler/xla/service/service.h
+++ b/tensorflow/compiler/xla/service/service.h
@@ -27,7 +27,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/backend.h"
 #include "tensorflow/compiler/xla/service/channel_tracker.h"
 #include "tensorflow/compiler/xla/service/compilation_cache.h"
-#include "tensorflow/compiler/xla/service/computation_tracker.h"
 #include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/service/executable.h"
 #include "tensorflow/compiler/xla/service/execution_tracker.h"
@@ -35,7 +34,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_module_config.h"
 #include "tensorflow/compiler/xla/service/session.pb.h"
-#include "tensorflow/compiler/xla/service/user_computation.h"
 #include "tensorflow/compiler/xla/service/versioned_computation_handle.h"
 #include "tensorflow/compiler/xla/service_interface.h"
 #include "tensorflow/compiler/xla/statusor.h"
@@ -172,12 +170,6 @@ class Service : public ServiceInterface {
   Status CreateChannelHandle(const CreateChannelHandleRequest* arg,
                              CreateChannelHandleResponse* result) override;
 
-  // Returns the ComputationTracker of the current service instance.
-  // Only used in unit tests to access user computations from client.
-  const ComputationTracker& computation_tracker() {
-    return computation_tracker_;
-  }
-
   // Returns the backend used to execute computations.
   const Backend& backend() const { return *execute_backend_; }
   Backend* mutable_backend() { return execute_backend_.get(); }
@@ -188,8 +180,7 @@ class Service : public ServiceInterface {
   StatusOr<std::unique_ptr<HloModuleConfig>> CreateModuleConfig(
       const ProgramShape& program_shape,
       tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
-      const ExecutionOptions& execution_options,
-      const UserComputation* user_computation = nullptr);
+      const ExecutionOptions& execution_options);
 
   // Picks a parallel response and fills the result.
   Status PickParallelResponse(const ExecuteParallelResponse& parallel_result,
@@ -230,23 +221,13 @@ class Service : public ServiceInterface {
   StatusOr<std::unique_ptr<HloModuleConfig>> CreateModuleConfig(
       const ProgramShape& program_shape,
       tensorflow::gtl::ArraySlice<const Shape*> argument_shapes,
-      const ExecutionOptions* execution_options,
-      const UserComputation* user_computation = nullptr);
+      const ExecutionOptions* execution_options);
 
   // Builds an Executable for the given parameters.
   //
   // If device_allocator is not null, the compiler may use it to allocate temp
   // buffers, which the compiler is responsible for freeing.  The allocator
   // given here need not match the allocator used when running the executable.
-  StatusOr<std::unique_ptr<Executable>> BuildExecutable(
-      const VersionedComputationHandle& versioned_handle,
-      std::unique_ptr<HloModuleConfig> module_config, Backend* backend,
-      se::StreamExecutor* executor,
-      DeviceMemoryAllocator* device_allocator = nullptr);
-
-  // Builds an Executable for the given HLO module proto.
-  //
-  // TODO(b/74197823): This is a part of a NOT YET ready refactor.
   StatusOr<std::unique_ptr<Executable>> BuildExecutable(
       const HloModuleProto& module_proto,
       std::unique_ptr<HloModuleConfig> module_config, Backend* backend,
@@ -255,26 +236,12 @@ class Service : public ServiceInterface {
 
   // Same as BuildExecutable() above, but builds a list of Executables for the
   // given computations that may interact with each other.
-  StatusOr<std::vector<std::unique_ptr<Executable>>> BuildExecutables(
-      std::vector<VersionedComputationHandle> versioned_handles,
-      std::vector<std::unique_ptr<HloModuleConfig>> module_configs,
-      Backend* backend, std::vector<std::vector<se::StreamExecutor*>> executors,
-      DeviceMemoryAllocator* device_allocator);
   StatusOr<std::vector<std::unique_ptr<Executable>>> BuildExecutables(
       const std::vector<const HloModuleProto*>& module_protos,
       std::vector<std::unique_ptr<HloModuleConfig>> module_configs,
       Backend* backend, std::vector<std::vector<se::StreamExecutor*>> executors,
       DeviceMemoryAllocator* device_allocator);
 
-  // Similar to BuildExecutable, but look in the compilation cache for the
-  // executable first. If the executable is not in the cache, it is built and
-  // inserted into the cache.
-  StatusOr<std::shared_ptr<Executable>> BuildAndCacheExecutable(
-      const VersionedComputationHandle& versioned_handle,
-      std::unique_ptr<HloModuleConfig> module_config, Backend* backend,
-      se::StreamExecutor* executor, ExecutionProfile* profile,
-      DeviceMemoryAllocator* device_allocator = nullptr);
-
   // Runs the given executable with the given arguments and register the result
   // in the allocation tracker. The handle of the result from the tracker is
   // returned. If the parameter "profile" is not null, it points to an
@@ -297,13 +264,6 @@ class Service : public ServiceInterface {
       tensorflow::gtl::ArraySlice<string> result_tags,
       ExecutionProfile* profile);
 
-  // Convenience function for adding a function to a user computation.
-  template <typename RequestT, typename ResponseT>
-  Status AddInstruction(
-      const RequestT* arg, ResponseT* result,
-      const std::function<StatusOr<ComputationDataHandle>(UserComputation*)>&
-          adder);
-
   // Executes a single computation which has more than one target device.
   // The N devices are expected to all return an empty tuple, but one, which
   // will be the result of this computation.
@@ -329,9 +289,6 @@ class Service : public ServiceInterface {
 
   ServiceOptions options_;
 
-  // Tracks computations built via the API.
-  ComputationTracker computation_tracker_;
-
   // Tracks channels created via the API.
   ChannelTracker channel_tracker_;
 
diff --git a/tensorflow/compiler/xla/service/user_computation.cc b/tensorflow/compiler/xla/service/user_computation.cc
deleted file mode 100644
index 9e62d0acfb..0000000000
--- a/tensorflow/compiler/xla/service/user_computation.cc
+++ /dev/null
@@ -1,3557 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/compiler/xla/service/user_computation.h"
-
-#include <algorithm>
-#include <set>
-#include <stack>
-#include <unordered_map>
-#include <utility>
-#include <vector>
-
-#include "tensorflow/compiler/xla/layout_util.h"
-#include "tensorflow/compiler/xla/literal_util.h"
-#include "tensorflow/compiler/xla/ptr_util.h"
-#include "tensorflow/compiler/xla/service/hlo_computation.h"
-#include "tensorflow/compiler/xla/service/hlo_instruction.h"
-#include "tensorflow/compiler/xla/service/hlo_opcode.h"
-#include "tensorflow/compiler/xla/service/shape_inference.h"
-#include "tensorflow/compiler/xla/shape_util.h"
-#include "tensorflow/compiler/xla/status_macros.h"
-#include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/compiler/xla/util.h"
-#include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/lib/strings/str_util.h"
-#include "tensorflow/core/lib/strings/strcat.h"
-#include "tensorflow/core/lib/strings/stringprintf.h"
-#include "tensorflow/core/platform/logging.h"
-#include "tensorflow/core/platform/protobuf.h"
-
-namespace xla {
-namespace {
-
-HloOpcode UnaryOperationToHloOpcode(UnaryOperation unop) {
-  switch (unop) {
-    case UNOP_ABS:
-      return HloOpcode::kAbs;
-    case UNOP_CEIL:
-      return HloOpcode::kCeil;
-    case UNOP_CLZ:
-      return HloOpcode::kClz;
-    case UNOP_COS:
-      return HloOpcode::kCos;
-    case UNOP_EXP:
-      return HloOpcode::kExp;
-    case UNOP_EXPM1:
-      return HloOpcode::kExpm1;
-    case UNOP_FLOOR:
-      return HloOpcode::kFloor;
-    case UNOP_IMAG:
-      return HloOpcode::kImag;
-    case UNOP_IS_FINITE:
-      return HloOpcode::kIsFinite;
-    case UNOP_LOG:
-      return HloOpcode::kLog;
-    case UNOP_LOG1P:
-      return HloOpcode::kLog1p;
-    case UNOP_NOT:
-      return HloOpcode::kNot;
-    case UNOP_NEGATE:
-      return HloOpcode::kNegate;
-    case UNOP_REAL:
-      return HloOpcode::kReal;
-    case UNOP_ROUND_NEAREST_AFZ:
-      return HloOpcode::kRoundNearestAfz;
-    case UNOP_SIGN:
-      return HloOpcode::kSign;
-    case UNOP_SIN:
-      return HloOpcode::kSin;
-    case UNOP_SORT:
-      return HloOpcode::kSort;
-    case UNOP_TANH:
-      return HloOpcode::kTanh;
-    default:
-      LOG(FATAL) << "unhandled operation " << unop;
-  }
-}
-
-HloOpcode BinaryOperationToHloOpcode(BinaryOperation binop) {
-  switch (binop) {
-    case BINOP_ATAN2:
-      return HloOpcode::kAtan2;
-    case BINOP_COMPLEX:
-      return HloOpcode::kComplex;
-    case BINOP_MUL:
-      return HloOpcode::kMultiply;
-    case BINOP_ADD:
-      return HloOpcode::kAdd;
-    case BINOP_SUB:
-      return HloOpcode::kSubtract;
-    case BINOP_DIV:
-      return HloOpcode::kDivide;
-    case BINOP_EQ:
-      return HloOpcode::kEq;
-    case BINOP_GE:
-      return HloOpcode::kGe;
-    case BINOP_GT:
-      return HloOpcode::kGt;
-    case BINOP_LE:
-      return HloOpcode::kLe;
-    case BINOP_LT:
-      return HloOpcode::kLt;
-    case BINOP_NE:
-      return HloOpcode::kNe;
-    case BINOP_MAX:
-      return HloOpcode::kMaximum;
-    case BINOP_MIN:
-      return HloOpcode::kMinimum;
-    case BINOP_POW:
-      return HloOpcode::kPower;
-    case BINOP_REM:
-      return HloOpcode::kRemainder;
-    case BINOP_OR:
-      return HloOpcode::kOr;
-    case BINOP_AND:
-      return HloOpcode::kAnd;
-    case BINOP_SHIFT_LEFT:
-      return HloOpcode::kShiftLeft;
-    case BINOP_SHIFT_RIGHT_ARITHMETIC:
-      return HloOpcode::kShiftRightArithmetic;
-    case BINOP_SHIFT_RIGHT_LOGICAL:
-      return HloOpcode::kShiftRightLogical;
-    default:
-      LOG(FATAL) << "unhandled operation " << binop;
-  }
-}
-
-HloOpcode TernaryOperationToHloOpcode(TernaryOperation triop) {
-  switch (triop) {
-    case TRIOP_CLAMP:
-      return HloOpcode::kClamp;
-    case TRIOP_SELECT:
-      return HloOpcode::kSelect;
-    default:
-      LOG(FATAL) << "unhandled operation " << triop;
-  }
-}
-
-HloOpcode VariadicOperationToHloOpcode(VariadicOperation varop) {
-  switch (varop) {
-    case VAROP_TUPLE:
-      return HloOpcode::kTuple;
-    default:
-      LOG(FATAL) << "unhandled operation " << varop;
-  }
-}
-
-}  // namespace
-
-/* static */ StatusOr<std::unique_ptr<UserComputation>>
-UserComputation::MakeWithRemapping(
-    const SessionComputation& session_computation,
-    const ComputationHandle& handle,
-    const std::map<int64, ComputationHandle>& old_to_new) {
-  auto user_computation =
-      MakeUnique<UserComputation>(session_computation.name(), handle);
-  {
-    tensorflow::mutex_lock lock(user_computation->mutex_);
-    user_computation->session_computation_ = session_computation;
-    user_computation->next_handle_value_ =
-        std::max_element(session_computation.requests().begin(),
-                         session_computation.requests().end(),
-                         [](const std::pair<int64, OperationRequest>& lhs,
-                            const std::pair<int64, OperationRequest>& rhs) {
-                           return lhs.first < rhs.first;
-                         })
-            ->first +
-        1;
-    TF_RETURN_IF_ERROR(user_computation->RemapEmbeddedComputations(old_to_new));
-  }
-
-  return std::move(user_computation);
-}
-
-UserComputation::UserComputation(const string& name,
-                                 const ComputationHandle& handle)
-    : name_(name), next_handle_value_(1) {
-  *session_computation_.mutable_computation_handle() = handle;
-  session_computation_.set_name(name);
-
-  VLOG(1) << "New UserComputation \"" << name
-          << "\", handle: " << handle.handle();
-}
-
-ComputationDataHandle UserComputation::CreateComputationDataHandle() {
-  ComputationDataHandle handle;
-  handle.set_handle(next_handle_value_);
-  // Handles are used as Version values and *must* be assigned consecutively for
-  // computation versioning to work.
-  next_handle_value_++;
-  return handle;
-}
-
-StatusOr<ComputationDataHandle> UserComputation::AddParameterInstruction(
-    const ParameterRequest& parameter_request) {
-  tensorflow::mutex_lock lock(mutex_);
-
-  int64 parameter_number = parameter_request.parameter();
-  if (parameters_.count(parameter_number) != 0) {
-    return InvalidArgument("parameter %lld already registered",
-                           parameter_number);
-  }
-  ComputationDataHandle handle = CreateComputationDataHandle();
-
-  const Shape& validated_shape = parameter_request.shape();
-  TF_RETURN_IF_ERROR(
-      ShapeUtil::ValidateShapeWithOptionalLayout(validated_shape));
-
-  OperationRequest& request =
-      (*session_computation_.mutable_requests())[handle.handle()];
-  *request.mutable_output_handle() = handle;
-  *request.mutable_output_shape() = validated_shape;
-  *request.mutable_request()->mutable_parameter_request() = parameter_request;
-
-  parameters_[parameter_number] = &request;
-
-  VLOG(1) << "AddParameterInstruction (" << GetVersionedHandleInternal()
-          << "), data handle " << handle.handle() << ": "
-          << parameter_request.ShortDebugString();
-  return handle;
-}
-
-StatusOr<ComputationDataHandle> UserComputation::AddSendInstruction(
-    const SendRequest& send_request) {
-  tensorflow::mutex_lock lock(mutex_);
-
-  // Check if the operand of the instruction is valid.
-  TF_RETURN_IF_ERROR(LookUpRequest(send_request.operand()).status());
-
-  // No handle is returned, but a handle must be assigned to this instruction
-  // for computation versioning.
-  ComputationDataHandle handle = CreateComputationDataHandle();
-  OperationRequest& request =
-      (*session_computation_.mutable_requests())[handle.handle()];
-  *request.mutable_output_handle() = handle;
-  *request.mutable_output_shape() = ShapeUtil::MakeNil();
-  *request.mutable_request()->mutable_send_request() = send_request;
-
-  VLOG(1) << "AddSendInstruction (" << GetVersionedHandleInternal()
-          << "), data handle " << handle.handle() << ": "
-          << send_request.ShortDebugString();
-  return handle;
-}
-
-StatusOr<ComputationDataHandle> UserComputation::AddRecvInstruction(
-    const RecvRequest& recv_request) {
-  tensorflow::mutex_lock lock(mutex_);
-
-  const Shape& shape = recv_request.shape();
-  TF_RETURN_IF_ERROR(ShapeUtil::ValidateShapeWithOptionalLayout(shape));
-  ComputationDataHandle handle = CreateComputationDataHandle();
-  OperationRequest& request =
-      (*session_computation_.mutable_requests())[handle.handle()];
-  *request.mutable_output_handle() = handle;
-  *request.mutable_output_shape() = shape;
-  *request.mutable_request()->mutable_recv_request() = recv_request;
-
-  VLOG(1) << "AddRecvInstruction (" << GetVersionedHandleInternal()
-          << "), data handle " << handle.handle() << ": "
-          << recv_request.ShortDebugString();
-  return handle;
-}
-
-StatusOr<ComputationDataHandle> UserComputation::AddPadInstruction(
-    const PadRequest& pad_request) {
-  tensorflow::mutex_lock lock(mutex_);
-
-  TF_ASSIGN_OR_RETURN(const OperationRequest* operand,
-                      LookUpRequest(pad_request.operand()));
-
-  TF_ASSIGN_OR_RETURN(const OperationRequest* padding_value,
-                      LookUpRequest(pad_request.padding_value()));
-
-  TF_ASSIGN_OR_RETURN(Shape inferred_shape, ShapeInference::InferPadShape(
-                                                operand->output_shape(),
-                                                padding_value->output_shape(),
-                                                pad_request.padding_config()));
-
-  ComputationDataHandle handle = CreateComputationDataHandle();
-  OperationRequest& request =
-      (*session_computation_.mutable_requests())[handle.handle()];
-  *request.mutable_output_handle() = handle;
-  *request.mutable_output_shape() = inferred_shape;
-  *request.mutable_request()->mutable_pad_request() = pad_request;
-
-  VLOG(1) << "AddPadInstruction (" << GetVersionedHandleInternal()
-          << "), data handle " << handle.handle() << ": "
-          << pad_request.ShortDebugString();
-  return handle;
-}
-
-StatusOr<ComputationDataHandle> UserComputation::AddConstantInstruction(
-    const ConstantRequest& constant_request) {
-  const Shape& validated_shape = constant_request.literal().shape();
-  TF_RETURN_IF_ERROR(
-      ShapeUtil::ValidateShapeWithOptionalLayout(validated_shape));
-
-  tensorflow::mutex_lock lock(mutex_);
-
-  ComputationDataHandle handle = CreateComputationDataHandle();
-
-  OperationRequest& request =
-      (*session_computation_.mutable_requests())[handle.handle()];
-  *request.mutable_output_handle() = handle;
-  *request.mutable_output_shape() = validated_shape;
-  *request.mutable_request()->mutable_constant_request() = constant_request;
-
-  VLOG(1) << "AddConstantInstruction (" << GetVersionedHandleInternal()
-          << "), data handle " << handle.handle();
-  return handle;
-}
-
-StatusOr<ComputationDataHandle> UserComputation::AddGatherInstruction(
-    const GatherRequest& gather_request) {
-  tensorflow::mutex_lock lock(mutex_);
-
-  TF_ASSIGN_OR_RETURN(const OperationRequest* input_request,
-                      LookUpRequest(gather_request.input()));
-  TF_ASSIGN_OR_RETURN(const OperationRequest* gather_indices_request,
-                      LookUpRequest(gather_request.gather_indices()));
-
-  TF_ASSIGN_OR_RETURN(
-      Shape shape,
-      ShapeInference::InferGatherShape(
-          input_request->output_shape(), gather_indices_request->output_shape(),
-          gather_request.dimension_numbers(),
-          AsInt64Slice(gather_request.window_bounds())));
-
-  const ComputationDataHandle handle = CreateComputationDataHandle();
-
-  OperationRequest& request =
-      (*session_computation_.mutable_requests())[handle.handle()];
-  *request.mutable_output_handle() = handle;
-  *request.mutable_output_shape() = shape;
-  *request.mutable_request()->mutable_gather_request() = gather_request;
-
-  VLOG(1) << "AddGatherInstruction (" << GetVersionedHandleInternal()
-          << "), data handle " << handle.handle() << ": "
-          << gather_request.ShortDebugString();
-  return handle;
-}
-
-StatusOr<ComputationDataHandle> UserComputation::AddGetTupleElementInstruction(
-    const GetTupleElementRequest& get_tuple_element_request) {
-  tensorflow::mutex_lock lock(mutex_);
-
-  TF_ASSIGN_OR_RETURN(const OperationRequest* operand,
-                      LookUpRequest(get_tuple_element_request.operand()));
-  if (!ShapeUtil::IsTuple(operand->output_shape())) {
-    return InvalidArgument(
-        "Operand to GetTupleElement() is not a tuple; got %s",
-        ShapeUtil::HumanString(operand->output_shape()).c_str());
-  }
-  Shape element_shape = ShapeUtil::GetTupleElementShape(
-      operand->output_shape(), get_tuple_element_request.index());
-
-  ComputationDataHandle handle = CreateComputationDataHandle();
-
-  OperationRequest& request =
-      (*session_computation_.mutable_requests())[handle.handle()];
-  *request.mutable_output_handle() = handle;
-  *request.mutable_output_shape() = element_shape;
-  *request.mutable_request()->mutable_get_tuple_element_request() =
-      get_tuple_element_request;
-
-  VLOG(1) << "AddGetTupleElementInstruction (" << GetVersionedHandleInternal()
-          << "), data handle " << handle.handle() << ": "
-          << get_tuple_element_request.ShortDebugString();
-  return handle;
-}
-
-Status UserComputation::AddTraceInstruction(const TraceRequest& trace_request) {
-  tensorflow::mutex_lock lock(mutex_);
-
-  // Verify that the operand index is valid.
-  TF_RETURN_IF_ERROR(LookUpRequest(trace_request.operand()).status());
-
-  ComputationDataHandle handle = CreateComputationDataHandle();
-  OperationRequest& request =
-      (*session_computation_.mutable_requests())[handle.handle()];
-  *request.mutable_output_handle() = handle;
-  *request.mutable_output_shape() = ShapeUtil::MakeNil();
-  *request.mutable_request()->mutable_trace_request() = trace_request;
-
-  VLOG(1) << "AddTraceInstruction (" << GetVersionedHandleInternal()
-          << "), data handle " << handle.handle() << ": "
-          << trace_request.ShortDebugString();
-  return Status::OK();
-}
-
-StatusOr<ComputationDataHandle> UserComputation::AddRngInstruction(
-    const RngRequest& rng_request) {
-  tensorflow::mutex_lock lock(mutex_);
-
-  // Check the number of parameters per RNG distribution.
-  switch (rng_request.distribution()) {
-    case RandomDistribution::RNG_NORMAL:
-    case RandomDistribution::RNG_UNIFORM:
-      if (rng_request.parameter_size() != 2) {
-        return InvalidArgument(
-            "RNG distribution (%s) expects 2 parameters, but got %d",
-            RandomDistribution_Name(rng_request.distribution()).c_str(),
-            rng_request.parameter_size());
-      }
-      break;
-    default:
-      LOG(FATAL) << "unhandled distribution " << rng_request.distribution();
-  }
-
-  // Verify that the parameter indices are valid;
-  for (const ComputationDataHandle& param : rng_request.parameter()) {
-    TF_RETURN_IF_ERROR(LookUpRequest(param).status());
-  }
-  const Shape& validated_shape = rng_request.shape();
-  TF_RETURN_IF_ERROR(
-      ShapeUtil::ValidateShapeWithOptionalLayout(validated_shape));
-
-  ComputationDataHandle handle = CreateComputationDataHandle();
-
-  OperationRequest& request =
-      (*session_computation_.mutable_requests())[handle.handle()];
-  *request.mutable_output_handle() = handle;
-  *request.mutable_output_shape() = validated_shape;
-  *request.mutable_request()->mutable_rng_request() = rng_request;
-
-  VLOG(1) << "AddRngInstruction (" << GetVersionedHandleInternal()
-          << "), data handle " << handle.handle() << ": "
-          << rng_request.ShortDebugString();
-  return handle;
-}
-
-StatusOr<ComputationDataHandle> UserComputation::AddMapInstruction(
-    const MapRequest& map_request,
-    const UserComputation& to_apply_computation) {
-  tensorflow::mutex_lock lock(mutex_);
-
-  std::vector<const Shape*> operand_shapes;
-  for (const ComputationDataHandle& handle : map_request.operands()) {
-    TF_ASSIGN_OR_RETURN(const OperationRequest* operand, LookUpRequest(handle));
-    operand_shapes.push_back(&operand->output_shape());
-  }
-
-  VersionedComputationHandle::Version to_apply_version =
-      to_apply_computation.version();
-  TF_ASSIGN_OR_RETURN(
-      std::shared_ptr<const ProgramShape> to_apply_program_shape,
-      to_apply_computation.ComputeProgramShape(to_apply_version));
-  TF_ASSIGN_OR_RETURN(
-      Shape inferred_shape,
-      ShapeInference::InferMapShape(operand_shapes, *to_apply_program_shape,
-                                    AsInt64Slice(map_request.dimensions())));
-
-  ComputationDataHandle handle = CreateComputationDataHandle();
-
-  OperationRequest& request =
-      (*session_computation_.mutable_requests())[handle.handle()];
-  *request.mutable_output_handle() = handle;
-  *request.mutable_output_shape() = inferred_shape;
-  request.add_embedded_computation_versions(to_apply_version);
-  *request.mutable_request()->mutable_map_request() = map_request;
-
-  VLOG(1) << "AddMapInstruction (" << GetVersionedHandleInternal()
-          << "), data handle " << handle.handle() << ": "
-          << map_request.ShortDebugString();
-  return handle;
-}
-
-StatusOr<ComputationDataHandle> UserComputation::AddReduceInstruction(
-    const ReduceRequest& reduce_request,
-    const UserComputation& to_apply_computation) {
-  tensorflow::mutex_lock lock(mutex_);
-
-  TF_ASSIGN_OR_RETURN(const OperationRequest* operand,
-                      LookUpRequest(reduce_request.operand()));
-  TF_ASSIGN_OR_RETURN(const OperationRequest* init_value,
-                      LookUpRequest(reduce_request.init_value()));
-
-  VersionedComputationHandle::Version to_apply_version =
-      to_apply_computation.version();
-  TF_ASSIGN_OR_RETURN(
-      std::shared_ptr<const ProgramShape> to_apply_program_shape,
-      to_apply_computation.ComputeProgramShape(to_apply_version));
-
-  TF_ASSIGN_OR_RETURN(
-      Shape inferred_shape,
-      ShapeInference::InferReduceShape(
-          operand->output_shape(), init_value->output_shape(),
-          AsInt64Slice(reduce_request.dimensions()), *to_apply_program_shape));
-
-  ComputationDataHandle handle = CreateComputationDataHandle();
-
-  OperationRequest& request =
-      (*session_computation_.mutable_requests())[handle.handle()];
-  *request.mutable_output_handle() = handle;
-  *request.mutable_output_shape() = inferred_shape;
-  request.add_embedded_computation_versions(to_apply_version);
-  *request.mutable_request()->mutable_reduce_request() = reduce_request;
-
-  VLOG(1) << "AddReduceInstruction (" << GetVersionedHandleInternal()
-          << "), data handle " << handle.handle() << ": "
-          << reduce_request.ShortDebugString();
-  return handle;
-}
-
-StatusOr<ComputationDataHandle>
-UserComputation::AddBatchNormTrainingInstruction(
-    const BatchNormTrainingRequest& batch_norm_training_request) {
-  tensorflow::mutex_lock lock(mutex_);
-
-  TF_ASSIGN_OR_RETURN(const OperationRequest* operand,
-                      LookUpRequest(batch_norm_training_request.operand()));
-
-  TF_ASSIGN_OR_RETURN(const OperationRequest* scale,
-                      LookUpRequest(batch_norm_training_request.scale()));
-
-  TF_ASSIGN_OR_RETURN(const OperationRequest* offset,
-                      LookUpRequest(batch_norm_training_request.offset()));
-
-  ComputationDataHandle handle = CreateComputationDataHandle();
-
-  OperationRequest& request =
-      (*session_computation_.mutable_requests())[handle.handle()];
-
-  TF_ASSIGN_OR_RETURN(
-      Shape inferred_shape,
-      ShapeInference::InferBatchNormTrainingShape(
-          operand->output_shape(), scale->output_shape(),
-          offset->output_shape(), batch_norm_training_request.feature_index()));
-
-  *request.mutable_output_shape() = inferred_shape;
-
-  *request.mutable_output_handle() = handle;
-
-  *request.mutable_request()->mutable_batch_norm_training_request() =
-      batch_norm_training_request;
-
-  VLOG(1) << "AddBatchNormTrainingInstruction (" << GetVersionedHandleInternal()
-          << "), data handle " << handle.handle() << ": "
-          << batch_norm_training_request.ShortDebugString();
-
-  return handle;
-}
-
-StatusOr<ComputationDataHandle>
-UserComputation::AddBatchNormInferenceInstruction(
-    const BatchNormInferenceRequest& batch_norm_inference_request) {
-  tensorflow::mutex_lock lock(mutex_);
-
-  TF_ASSIGN_OR_RETURN(const OperationRequest* operand,
-                      LookUpRequest(batch_norm_inference_request.operand()));
-
-  TF_ASSIGN_OR_RETURN(const OperationRequest* scale,
-                      LookUpRequest(batch_norm_inference_request.scale()));
-
-  TF_ASSIGN_OR_RETURN(const OperationRequest* offset,
-                      LookUpRequest(batch_norm_inference_request.offset()));
-
-  TF_ASSIGN_OR_RETURN(const OperationRequest* mean,
-                      LookUpRequest(batch_norm_inference_request.mean()));
-
-  TF_ASSIGN_OR_RETURN(const OperationRequest* variance,
-                      LookUpRequest(batch_norm_inference_request.variance()));
-
-  ComputationDataHandle handle = CreateComputationDataHandle();
-
-  OperationRequest& request =
-      (*session_computation_.mutable_requests())[handle.handle()];
-
-  TF_ASSIGN_OR_RETURN(Shape inferred_shape,
-                      ShapeInference::InferBatchNormInferenceShape(
-                          operand->output_shape(), scale->output_shape(),
-                          offset->output_shape(), mean->output_shape(),
-                          variance->output_shape(),
-                          batch_norm_inference_request.feature_index()));
-
-  *request.mutable_output_shape() = inferred_shape;
-
-  *request.mutable_output_handle() = handle;
-
-  *request.mutable_request()->mutable_batch_norm_inference_request() =
-      batch_norm_inference_request;
-
-  VLOG(1) << "AddBatchNormInferenceInstruction ("
-          << GetVersionedHandleInternal() << "), data handle "
-          << handle.handle() << ": "
-          << batch_norm_inference_request.ShortDebugString();
-
-  return handle;
-}
-
-StatusOr<ComputationDataHandle> UserComputation::AddBatchNormGradInstruction(
-    const BatchNormGradRequest& batch_norm_grad_request) {
-  tensorflow::mutex_lock lock(mutex_);
-
-  TF_ASSIGN_OR_RETURN(const OperationRequest* operand,
-                      LookUpRequest(batch_norm_grad_request.operand()));
-
-  TF_ASSIGN_OR_RETURN(const OperationRequest* scale,
-                      LookUpRequest(batch_norm_grad_request.scale()));
-
-  TF_ASSIGN_OR_RETURN(const OperationRequest* mean,
-                      LookUpRequest(batch_norm_grad_request.mean()));
-
-  TF_ASSIGN_OR_RETURN(const OperationRequest* variance,
-                      LookUpRequest(batch_norm_grad_request.variance()));
-
-  TF_ASSIGN_OR_RETURN(const OperationRequest* grad_output,
-                      LookUpRequest(batch_norm_grad_request.grad_output()));
-
-  ComputationDataHandle handle = CreateComputationDataHandle();
-
-  OperationRequest& request =
-      (*session_computation_.mutable_requests())[handle.handle()];
-
-  TF_ASSIGN_OR_RETURN(
-      Shape inferred_shape,
-      ShapeInference::InferBatchNormGradShape(
-          operand->output_shape(), scale->output_shape(), mean->output_shape(),
-          variance->output_shape(), grad_output->output_shape(),
-          batch_norm_grad_request.feature_index()));
-
-  *request.mutable_output_shape() = inferred_shape;
-
-  *request.mutable_output_handle() = handle;
-
-  *request.mutable_request()->mutable_batch_norm_grad_request() =
-      batch_norm_grad_request;
-
-  VLOG(1) << "AddBatchNormGradInstruction (" << GetVersionedHandleInternal()
-          << "), data handle " << handle.handle() << ": "
-          << batch_norm_grad_request.ShortDebugString();
-
-  return handle;
-}
-
-StatusOr<ComputationDataHandle> UserComputation::AddReduceWindowInstruction(
-    const ReduceWindowRequest& reduce_window_request,
-    const UserComputation& to_apply_computation) {
-  tensorflow::mutex_lock lock(mutex_);
-
-  TF_ASSIGN_OR_RETURN(const OperationRequest* operand,
-                      LookUpRequest(reduce_window_request.operand()));
-  TF_ASSIGN_OR_RETURN(const OperationRequest* init_value,
-                      LookUpRequest(reduce_window_request.init_value()));
-
-  VersionedComputationHandle::Version to_apply_version =
-      to_apply_computation.version();
-  TF_ASSIGN_OR_RETURN(
-      std::shared_ptr<const ProgramShape> to_apply_program_shape,
-      to_apply_computation.ComputeProgramShape(to_apply_version));
-
-  TF_ASSIGN_OR_RETURN(
-      Shape inferred_shape,
-      ShapeInference::InferReduceWindowShape(
-          operand->output_shape(), init_value->output_shape(),
-          reduce_window_request.window(), *to_apply_program_shape));
-
-  ComputationDataHandle handle = CreateComputationDataHandle();
-
-  OperationRequest& request =
-      (*session_computation_.mutable_requests())[handle.handle()];
-  *request.mutable_output_handle() = handle;
-  *request.mutable_output_shape() = inferred_shape;
-  request.add_embedded_computation_versions(to_apply_version);
-  *request.mutable_request()->mutable_reduce_window_request() =
-      reduce_window_request;
-
-  VLOG(1) << "AddReduceWindowInstruction (" << GetVersionedHandleInternal()
-          << "), data handle " << handle.handle() << ": "
-          << reduce_window_request.ShortDebugString();
-  return handle;
-}
-
-StatusOr<ComputationDataHandle> UserComputation::AddSelectAndScatterInstruction(
-    const SelectAndScatterRequest& select_and_scatter_request,
-    const UserComputation& select_computation,
-    const UserComputation& scatter_computation) {
-  tensorflow::mutex_lock lock(mutex_);
-
-  TF_ASSIGN_OR_RETURN(const OperationRequest* operand,
-                      LookUpRequest(select_and_scatter_request.operand()));
-  TF_ASSIGN_OR_RETURN(const OperationRequest* source,
-                      LookUpRequest(select_and_scatter_request.source()));
-  TF_ASSIGN_OR_RETURN(const OperationRequest* init_value,
-                      LookUpRequest(select_and_scatter_request.init_value()));
-
-  VersionedComputationHandle::Version select_version =
-      select_computation.version();
-  TF_ASSIGN_OR_RETURN(std::shared_ptr<const ProgramShape> select_program_shape,
-                      select_computation.ComputeProgramShape(select_version));
-  VersionedComputationHandle::Version scatter_version =
-      scatter_computation.version();
-  TF_ASSIGN_OR_RETURN(std::shared_ptr<const ProgramShape> scatter_program_shape,
-                      scatter_computation.ComputeProgramShape(scatter_version));
-
-  TF_ASSIGN_OR_RETURN(
-      Shape inferred_shape,
-      ShapeInference::InferSelectAndScatterShape(
-          operand->output_shape(), *select_program_shape,
-          select_and_scatter_request.window(), source->output_shape(),
-          init_value->output_shape(), *scatter_program_shape));
-
-  ComputationDataHandle handle = CreateComputationDataHandle();
-
-  OperationRequest& request =
-      (*session_computation_.mutable_requests())[handle.handle()];
-  *request.mutable_output_handle() = handle;
-  *request.mutable_output_shape() = inferred_shape;
-  request.add_embedded_computation_versions(select_version);
-  request.add_embedded_computation_versions(scatter_version);
-  *request.mutable_request()->mutable_select_and_scatter_request() =
-      select_and_scatter_request;
-
-  VLOG(1) << "AddSelectAndScatterInstruction (" << GetVersionedHandleInternal()
-          << "), data handle " << handle.handle() << ": "
-          << select_and_scatter_request.ShortDebugString();
-  return handle;
-}
-
-StatusOr<ComputationDataHandle> UserComputation::AddReverseInstruction(
-    const ReverseRequest& reverse_request) {
-  tensorflow::mutex_lock lock(mutex_);
-
-  TF_ASSIGN_OR_RETURN(const OperationRequest* operand,
-                      LookUpRequest(reverse_request.operand()));
-  TF_ASSIGN_OR_RETURN(
-      Shape inferred_shape,
-      ShapeInference::InferReverseShape(
-          operand->output_shape(), AsInt64Slice(reverse_request.dimensions())));
-
-  ComputationDataHandle handle = CreateComputationDataHandle();
-  OperationRequest& request =
-      (*session_computation_.mutable_requests())[handle.handle()];
-  *request.mutable_output_handle() = handle;
-  *request.mutable_output_shape() = inferred_shape;
-  *request.mutable_request()->mutable_reverse_request() = reverse_request;
-  VLOG(1) << "AddReverseInstruction (" << GetVersionedHandleInternal()
-          << "), data handle " << handle.handle() << ": "
-          << reverse_request.ShortDebugString();
-  return handle;
-}
-
-StatusOr<ComputationDataHandle> UserComputation::AddWhileInstruction(
-    const WhileRequest& while_request,
-    const UserComputation& condition_computation,
-    const UserComputation& body_computation) {
-  tensorflow::mutex_lock lock(mutex_);
-
-  TF_ASSIGN_OR_RETURN(const OperationRequest* init,
-                      LookUpRequest(while_request.init()));
-
-  VersionedComputationHandle::Version condition_version =
-      condition_computation.version();
-  TF_ASSIGN_OR_RETURN(
-      std::shared_ptr<const ProgramShape> condition_program_shape,
-      condition_computation.ComputeProgramShape(condition_version));
-
-  VersionedComputationHandle::Version body_version = body_computation.version();
-  TF_ASSIGN_OR_RETURN(std::shared_ptr<const ProgramShape> body_program_shape,
-                      body_computation.ComputeProgramShape(body_version));
-
-  TF_ASSIGN_OR_RETURN(
-      Shape inferred_shape,
-      ShapeInference::InferWhileShape(
-          *condition_program_shape, *body_program_shape, init->output_shape()));
-
-  ComputationDataHandle handle = CreateComputationDataHandle();
-
-  OperationRequest& request =
-      (*session_computation_.mutable_requests())[handle.handle()];
-  *request.mutable_output_handle() = handle;
-  *request.mutable_output_shape() = inferred_shape;
-  request.add_embedded_computation_versions(condition_version);
-  request.add_embedded_computation_versions(body_version);
-  *request.mutable_request()->mutable_while_request() = while_request;
-
-  VLOG(1) << "AddWhileInstruction (" << GetVersionedHandleInternal()
-          << "), data handle " << handle.handle() << ": "
-          << while_request.ShortDebugString();
-  return handle;
-}
-
-StatusOr<ComputationDataHandle> UserComputation::AddConditionalInstruction(
-    const ConditionalRequest& conditional_request,
-    const UserComputation& true_computation,
-    const UserComputation& false_computation) {
-  tensorflow::mutex_lock lock(mutex_);
-
-  TF_ASSIGN_OR_RETURN(const OperationRequest* pred,
-                      LookUpRequest(conditional_request.predicate()));
-  TF_ASSIGN_OR_RETURN(const OperationRequest* true_operand,
-                      LookUpRequest(conditional_request.true_operand()));
-  TF_ASSIGN_OR_RETURN(const OperationRequest* false_operand,
-                      LookUpRequest(conditional_request.false_operand()));
-
-  VersionedComputationHandle::Version true_computation_version =
-      true_computation.version();
-  TF_ASSIGN_OR_RETURN(
-      std::shared_ptr<const ProgramShape> true_computation_shape,
-      true_computation.ComputeProgramShape(true_computation_version));
-
-  VersionedComputationHandle::Version false_computation_version =
-      false_computation.version();
-  TF_ASSIGN_OR_RETURN(
-      std::shared_ptr<const ProgramShape> false_computation_shape,
-      false_computation.ComputeProgramShape(false_computation_version));
-
-  TF_ASSIGN_OR_RETURN(Shape inferred_shape,
-                      ShapeInference::InferConditionalShape(
-                          pred->output_shape(), true_operand->output_shape(),
-                          false_operand->output_shape(),
-                          *true_computation_shape, *false_computation_shape));
-
-  ComputationDataHandle handle = CreateComputationDataHandle();
-
-  OperationRequest& request =
-      (*session_computation_.mutable_requests())[handle.handle()];
-  *request.mutable_output_handle() = handle;
-  *request.mutable_output_shape() = inferred_shape;
-  request.add_embedded_computation_versions(true_computation_version);
-  request.add_embedded_computation_versions(false_computation_version);
-  *request.mutable_request()->mutable_conditional_request() =
-      conditional_request;
-
-  VLOG(1) << "AddConditionalInstruction (" << GetVersionedHandleInternal()
-          << "), data handle " << handle.handle() << ": "
-          << conditional_request.ShortDebugString();
-  return handle;
-}
-
-StatusOr<ComputationDataHandle> UserComputation::AddBroadcastInstruction(
-    const BroadcastRequest& broadcast_request) {
-  tensorflow::mutex_lock lock(mutex_);
-
-  // Fetches and validates the operand.
-  TF_ASSIGN_OR_RETURN(const OperationRequest* operand,
-                      LookUpRequest(broadcast_request.operand()));
-  TF_ASSIGN_OR_RETURN(Shape inferred_shape,
-                      ShapeInference::InferBroadcastShape(
-                          operand->output_shape(),
-                          AsInt64Slice(broadcast_request.broadcast_sizes())));
-
-  ComputationDataHandle handle = CreateComputationDataHandle();
-  OperationRequest& request =
-      (*session_computation_.mutable_requests())[handle.handle()];
-  *request.mutable_output_handle() = handle;
-  *request.mutable_output_shape() = inferred_shape;
-  *request.mutable_request()->mutable_broadcast_request() = broadcast_request;
-
-  VLOG(1) << "AddBroadcastInstruction (" << GetVersionedHandleInternal()
-          << "), data handle " << handle.handle() << ": "
-          << broadcast_request.ShortDebugString();
-  return handle;
-}
-
-StatusOr<ComputationDataHandle> UserComputation::AddReshapeInstruction(
-    const ReshapeRequest& reshape_request) {
-  tensorflow::mutex_lock lock(mutex_);
-
-  // Fetches and validates the operand.
-  TF_ASSIGN_OR_RETURN(const OperationRequest* operand,
-                      LookUpRequest(reshape_request.operand()));
-
-  TF_ASSIGN_OR_RETURN(
-      Shape inferred_shape,
-      ShapeInference::InferReshapeShape(
-          operand->output_shape(), AsInt64Slice(reshape_request.dimensions()),
-          AsInt64Slice(reshape_request.new_sizes())));
-
-  ComputationDataHandle handle = CreateComputationDataHandle();
-
-  OperationRequest& request =
-      (*session_computation_.mutable_requests())[handle.handle()];
-  *request.mutable_output_handle() = handle;
-  *request.mutable_output_shape() = inferred_shape;
-  *request.mutable_request()->mutable_reshape_request() = reshape_request;
-
-  VLOG(1) << "AddReshapeInstruction (" << GetVersionedHandleInternal()
-          << "), data handle " << handle.handle() << ": "
-          << reshape_request.ShortDebugString();
-  return handle;
-}
-
-StatusOr<ComputationDataHandle> UserComputation::AddTransposeInstruction(
-    const TransposeRequest& transpose_request) {
-  tensorflow::mutex_lock lock(mutex_);
-
-  // Fetches and validates the operand.
-  TF_ASSIGN_OR_RETURN(const OperationRequest* operand,
-                      LookUpRequest(transpose_request.operand()));
-
-  TF_ASSIGN_OR_RETURN(Shape inferred_shape,
-                      ShapeInference::InferTransposeShape(
-                          operand->output_shape(),
-                          AsInt64Slice(transpose_request.dimensions())));
-
-  ComputationDataHandle handle = CreateComputationDataHandle();
-
-  OperationRequest& request =
-      (*session_computation_.mutable_requests())[handle.handle()];
-  *request.mutable_output_handle() = handle;
-  *request.mutable_output_shape() = inferred_shape;
-  *request.mutable_request()->mutable_transpose_request() = transpose_request;
-
-  VLOG(1) << "AddTransposeInstruction (" << GetVersionedHandleInternal()
-          << "), data handle " << handle.handle() << ": "
-          << transpose_request.ShortDebugString();
-  return handle;
-}
-
-StatusOr<ComputationDataHandle> UserComputation::AddSliceInstruction(
-    const SliceRequest& slice_request) {
-  tensorflow::mutex_lock lock(mutex_);
-
-  TF_ASSIGN_OR_RETURN(const OperationRequest* operand,
-                      LookUpRequest(slice_request.operand()));
-
-  TF_ASSIGN_OR_RETURN(
-      Shape new_shape,
-      ShapeInference::InferSliceShape(
-          operand->output_shape(), AsInt64Slice(slice_request.start_indices()),
-          AsInt64Slice(slice_request.limit_indices()),
-          AsInt64Slice(slice_request.strides())));
-
-  ComputationDataHandle handle = CreateComputationDataHandle();
-
-  OperationRequest& request =
-      (*session_computation_.mutable_requests())[handle.handle()];
-  *request.mutable_output_handle() = handle;
-  *request.mutable_output_shape() = new_shape;
-  *request.mutable_request()->mutable_slice_request() = slice_request;
-
-  VLOG(1) << "AddSliceInstruction (" << GetVersionedHandleInternal()
-          << "), data handle " << handle.handle() << ": "
-          << slice_request.ShortDebugString();
-  return handle;
-}
-
-StatusOr<ComputationDataHandle> UserComputation::AddDynamicSliceInstruction(
-    const DynamicSliceRequest& dynamic_slice_request) {
-  tensorflow::mutex_lock lock(mutex_);
-
-  TF_ASSIGN_OR_RETURN(const OperationRequest* operand,
-                      LookUpRequest(dynamic_slice_request.operand()));
-
-  TF_ASSIGN_OR_RETURN(const OperationRequest* start_indices,
-                      LookUpRequest(dynamic_slice_request.start_indices()));
-
-  TF_ASSIGN_OR_RETURN(
-      Shape new_shape,
-      ShapeInference::InferDynamicSliceShape(
-          operand->output_shape(), start_indices->output_shape(),
-          AsInt64Slice(dynamic_slice_request.slice_sizes())));
-
-  ComputationDataHandle handle = CreateComputationDataHandle();
-
-  OperationRequest& request =
-      (*session_computation_.mutable_requests())[handle.handle()];
-  *request.mutable_output_handle() = handle;
-  *request.mutable_output_shape() = new_shape;
-  *request.mutable_request()->mutable_dynamic_slice_request() =
-      dynamic_slice_request;
-
-  VLOG(1) << "AddDynamicSliceInstruction (" << GetVersionedHandleInternal()
-          << "), data handle " << handle.handle() << ": "
-          << dynamic_slice_request.ShortDebugString();
-  return handle;
-}
-
-StatusOr<ComputationDataHandle>
-UserComputation::AddDynamicUpdateSliceInstruction(
-    const DynamicUpdateSliceRequest& dynamic_update_slice_request) {
-  tensorflow::mutex_lock lock(mutex_);
-
-  TF_ASSIGN_OR_RETURN(const OperationRequest* operand,
-                      LookUpRequest(dynamic_update_slice_request.operand()));
-
-  TF_ASSIGN_OR_RETURN(const OperationRequest* update,
-                      LookUpRequest(dynamic_update_slice_request.update()));
-
-  TF_ASSIGN_OR_RETURN(
-      const OperationRequest* start_indices,
-      LookUpRequest(dynamic_update_slice_request.start_indices()));
-
-  TF_ASSIGN_OR_RETURN(Shape new_shape,
-                      ShapeInference::InferDynamicUpdateSliceShape(
-                          operand->output_shape(), update->output_shape(),
-                          start_indices->output_shape()));
-
-  ComputationDataHandle handle = CreateComputationDataHandle();
-
-  OperationRequest& request =
-      (*session_computation_.mutable_requests())[handle.handle()];
-  *request.mutable_output_handle() = handle;
-  *request.mutable_output_shape() = new_shape;
-  *request.mutable_request()->mutable_dynamic_update_slice_request() =
-      dynamic_update_slice_request;
-
-  VLOG(1) << "AddDynamicUpdateSliceInstruction ("
-          << GetVersionedHandleInternal() << "), data handle "
-          << handle.handle() << ": "
-          << dynamic_update_slice_request.ShortDebugString();
-  return handle;
-}
-
-StatusOr<ComputationDataHandle> UserComputation::AddConcatenateInstruction(
-    const ConcatenateRequest& concatenate_request) {
-  tensorflow::mutex_lock lock(mutex_);
-
-  std::vector<const Shape*> operand_shapes;
-  for (const ComputationDataHandle& handle : concatenate_request.operands()) {
-    TF_ASSIGN_OR_RETURN(const OperationRequest* operand, LookUpRequest(handle));
-    operand_shapes.push_back(&operand->output_shape());
-  }
-
-  TF_ASSIGN_OR_RETURN(Shape new_shape,
-                      ShapeInference::InferConcatOpShape(
-                          operand_shapes, concatenate_request.dimension()));
-
-  ComputationDataHandle handle = CreateComputationDataHandle();
-
-  OperationRequest& request =
-      (*session_computation_.mutable_requests())[handle.handle()];
-  *request.mutable_output_handle() = handle;
-  *request.mutable_output_shape() = new_shape;
-  *request.mutable_request()->mutable_concatenate_request() =
-      concatenate_request;
-
-  VLOG(1) << "AddConcatenateInstruction (" << GetVersionedHandleInternal()
-          << "), data handle " << handle.handle() << ": "
-          << concatenate_request.ShortDebugString();
-  return handle;
-}
-
-StatusOr<ComputationDataHandle> UserComputation::AddConvertInstruction(
-    const ConvertRequest& convert_request) {
-  tensorflow::mutex_lock lock(mutex_);
-
-  TF_ASSIGN_OR_RETURN(const OperationRequest* operand,
-                      LookUpRequest(convert_request.operand()));
-
-  TF_ASSIGN_OR_RETURN(Shape new_shape, ShapeInference::InferConvertShape(
-                                           operand->output_shape(),
-                                           convert_request.new_element_type()));
-
-  ComputationDataHandle handle = CreateComputationDataHandle();
-
-  OperationRequest& request =
-      (*session_computation_.mutable_requests())[handle.handle()];
-  *request.mutable_output_handle() = handle;
-  *request.mutable_output_shape() = new_shape;
-  *request.mutable_request()->mutable_convert_request() = convert_request;
-
-  VLOG(1) << "AddConvertInstruction (" << GetVersionedHandleInternal()
-          << "), data handle " << handle.handle() << ": "
-          << convert_request.ShortDebugString();
-  return handle;
-}
-
-StatusOr<ComputationDataHandle> UserComputation::AddBitcastConvertInstruction(
-    const ConvertRequest& convert_request) {
-  tensorflow::mutex_lock lock(mutex_);
-
-  TF_ASSIGN_OR_RETURN(const OperationRequest* operand,
-                      LookUpRequest(convert_request.operand()));
-
-  TF_ASSIGN_OR_RETURN(Shape new_shape, ShapeInference::InferConvertShape(
-                                           operand->output_shape(),
-                                           convert_request.new_element_type()));
-
-  ComputationDataHandle handle = CreateComputationDataHandle();
-
-  OperationRequest& request =
-      (*session_computation_.mutable_requests())[handle.handle()];
-  *request.mutable_output_handle() = handle;
-  *request.mutable_output_shape() = new_shape;
-  *request.mutable_request()->mutable_bitcast_convert_request() =
-      convert_request;
-
-  VLOG(1) << "AddBitcastConvertInstruction (" << GetVersionedHandleInternal()
-          << "), data handle " << handle.handle() << ": "
-          << convert_request.ShortDebugString();
-  return handle;
-}
-
-StatusOr<ComputationDataHandle> UserComputation::AddReducePrecisionInstruction(
-    const ReducePrecisionRequest& reduce_precision_request) {
-  tensorflow::mutex_lock lock(mutex_);
-
-  TF_ASSIGN_OR_RETURN(const OperationRequest* operand,
-                      LookUpRequest(reduce_precision_request.operand()));
-
-  TF_ASSIGN_OR_RETURN(
-      Shape new_shape,
-      ShapeInference::InferReducePrecisionShape(
-          operand->output_shape(), reduce_precision_request.exponent_bits(),
-          reduce_precision_request.mantissa_bits()));
-
-  ComputationDataHandle handle = CreateComputationDataHandle();
-
-  OperationRequest& request =
-      (*session_computation_.mutable_requests())[handle.handle()];
-  *request.mutable_output_handle() = handle;
-  *request.mutable_output_shape() = new_shape;
-  *request.mutable_request()->mutable_reduce_precision_request() =
-      reduce_precision_request;
-
-  VLOG(1) << "AddReducePrecisionInstruction (" << GetVersionedHandleInternal()
-          << "), data handle " << handle.handle() << ": "
-          << reduce_precision_request.ShortDebugString();
-  return handle;
-}
-
-StatusOr<ComputationDataHandle> UserComputation::AddConvolveInstruction(
-    const ConvolveRequest& convolve_request) {
-  tensorflow::mutex_lock lock(mutex_);
-
-  TF_ASSIGN_OR_RETURN(const OperationRequest* lhs,
-                      LookUpRequest(convolve_request.lhs()));
-  TF_ASSIGN_OR_RETURN(const OperationRequest* rhs,
-                      LookUpRequest(convolve_request.rhs()));
-  TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferConvolveShape(
-                                       lhs->output_shape(), rhs->output_shape(),
-                                       convolve_request.window(),
-                                       convolve_request.dimension_numbers()));
-
-  const ComputationDataHandle handle = CreateComputationDataHandle();
-
-  OperationRequest& request =
-      (*session_computation_.mutable_requests())[handle.handle()];
-  *request.mutable_output_handle() = handle;
-  *request.mutable_output_shape() = shape;
-  *request.mutable_request()->mutable_convolve_request() = convolve_request;
-
-  VLOG(1) << "AddConvolveInstruction (" << GetVersionedHandleInternal()
-          << "), data handle " << handle.handle() << ": "
-          << convolve_request.ShortDebugString();
-  return handle;
-}
-
-StatusOr<ComputationDataHandle> UserComputation::AddFftInstruction(
-    const FftRequest& fft_request) {
-  tensorflow::mutex_lock lock(mutex_);
-
-  TF_ASSIGN_OR_RETURN(const OperationRequest* operand,
-                      LookUpRequest(fft_request.operand()));
-  TF_ASSIGN_OR_RETURN(Shape shape,
-                      ShapeInference::InferFftShape(
-                          operand->output_shape(), fft_request.fft_type(),
-                          AsInt64Slice(fft_request.fft_length())));
-
-  const ComputationDataHandle handle = CreateComputationDataHandle();
-
-  OperationRequest& request =
-      (*session_computation_.mutable_requests())[handle.handle()];
-  *request.mutable_output_handle() = handle;
-  *request.mutable_output_shape() = shape;
-  *request.mutable_request()->mutable_fft_request() = fft_request;
-
-  VLOG(1) << "AddFftInstruction (" << GetVersionedHandleInternal()
-          << "), data handle " << handle.handle() << ": "
-          << fft_request.ShortDebugString();
-  return handle;
-}
-
-StatusOr<ComputationDataHandle> UserComputation::AddCrossReplicaSumInstruction(
-    const CrossReplicaSumRequest& cross_replica_sum_request) {
-  tensorflow::mutex_lock lock(mutex_);
-
-  TF_ASSIGN_OR_RETURN(const OperationRequest* operand,
-                      LookUpRequest(cross_replica_sum_request.operand()));
-  TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferCrossReplicaSumShape(
-                                       {&operand->output_shape()}));
-
-  ComputationDataHandle handle = CreateComputationDataHandle();
-
-  OperationRequest& request =
-      (*session_computation_.mutable_requests())[handle.handle()];
-  *request.mutable_output_handle() = handle;
-  *request.mutable_output_shape() = shape;
-  *request.mutable_request()->mutable_cross_replica_sum_request() =
-      cross_replica_sum_request;
-
-  VLOG(1) << "AddCrossreplicaSumInstruction (" << GetVersionedHandleInternal()
-          << "), data handle " << handle.handle() << ": "
-          << cross_replica_sum_request.ShortDebugString();
-  return handle;
-}
-
-StatusOr<ComputationDataHandle> UserComputation::AddInfeedInstruction(
-    const InfeedRequest& infeed_request) {
-  tensorflow::mutex_lock lock(mutex_);
-
-  const Shape& shape = infeed_request.shape();
-  if (!LayoutUtil::HasLayout(shape)) {
-    return InvalidArgument("Given shape to Infeed must have a layout");
-  }
-
-  const ComputationDataHandle handle = CreateComputationDataHandle();
-
-  OperationRequest& request =
-      (*session_computation_.mutable_requests())[handle.handle()];
-  *request.mutable_output_handle() = handle;
-  *request.mutable_output_shape() = shape;
-  *request.mutable_request()->mutable_infeed_request() = infeed_request;
-
-  VLOG(1) << "AddInfeedInstruction (" << GetVersionedHandleInternal()
-          << "), data handle " << handle.handle() << ": "
-          << infeed_request.ShortDebugString();
-  return handle;
-}
-
-StatusOr<ComputationDataHandle> UserComputation::AddOutfeedInstruction(
-    const OutfeedRequest& outfeed_request) {
-  tensorflow::mutex_lock lock(mutex_);
-
-  const Shape& shape = outfeed_request.shape();
-  if (!LayoutUtil::HasLayout(shape)) {
-    return InvalidArgument("Given shape to Outfeed must have a layout");
-  }
-
-  // Verify that operand is valid.
-  TF_RETURN_IF_ERROR(LookUpRequest(outfeed_request.operand()).status());
-
-  ComputationDataHandle handle = CreateComputationDataHandle();
-  OperationRequest& request =
-      (*session_computation_.mutable_requests())[handle.handle()];
-  *request.mutable_output_handle() = handle;
-  *request.mutable_output_shape() = shape;
-  *request.mutable_request()->mutable_outfeed_request() = outfeed_request;
-
-  VLOG(1) << "AddOutfeedInstruction (" << GetVersionedHandleInternal()
-          << "), data handle " << handle.handle() << ": "
-          << outfeed_request.ShortDebugString();
-  return handle;
-}
-
-StatusOr<ComputationDataHandle> UserComputation::AddCallInstruction(
-    const CallRequest& call_request,
-    const UserComputation& to_apply_computation) {
-  tensorflow::mutex_lock lock(mutex_);
-
-  std::vector<const Shape*> operand_shapes;
-  for (const ComputationDataHandle& handle : call_request.operands()) {
-    TF_ASSIGN_OR_RETURN(const OperationRequest* operand, LookUpRequest(handle));
-    operand_shapes.push_back(&operand->output_shape());
-  }
-
-  VersionedComputationHandle::Version to_apply_version =
-      to_apply_computation.version();
-  TF_ASSIGN_OR_RETURN(
-      std::shared_ptr<const ProgramShape> to_apply_program_shape,
-      to_apply_computation.ComputeProgramShape(to_apply_version));
-  TF_ASSIGN_OR_RETURN(
-      Shape inferred_shape,
-      ShapeInference::InferCallShape(operand_shapes, *to_apply_program_shape));
-
-  ComputationDataHandle handle = CreateComputationDataHandle();
-
-  OperationRequest& request =
-      (*session_computation_.mutable_requests())[handle.handle()];
-  *request.mutable_output_handle() = handle;
-  *request.mutable_output_shape() = inferred_shape;
-  request.add_embedded_computation_versions(to_apply_version);
-  *request.mutable_request()->mutable_call_request() = call_request;
-
-  VLOG(1) << "AddCallInstruction (" << GetVersionedHandleInternal()
-          << "), data handle " << handle.handle() << ": "
-          << call_request.ShortDebugString();
-  return handle;
-}
-
-StatusOr<ComputationDataHandle> UserComputation::AddCustomCallInstruction(
-    const CustomCallRequest& custom_call_request) {
-  tensorflow::mutex_lock lock(mutex_);
-
-  for (const ComputationDataHandle& handle : custom_call_request.operands()) {
-    TF_RETURN_IF_ERROR(LookUpRequest(handle).status());
-  }
-
-  if (tensorflow::str_util::StartsWith(custom_call_request.call_target_name(),
-                                       "$")) {
-    return InvalidArgument(
-        "Invalid custom_call_target \"%s\": Call targets that start with '$' "
-        "are reserved for internal use.",
-        custom_call_request.call_target_name().c_str());
-  }
-
-  const ComputationDataHandle handle = CreateComputationDataHandle();
-
-  OperationRequest& request =
-      (*session_computation_.mutable_requests())[handle.handle()];
-  *request.mutable_output_handle() = handle;
-  *request.mutable_output_shape() = custom_call_request.shape();
-  *request.mutable_request()->mutable_custom_call_request() =
-      custom_call_request;
-
-  VLOG(1) << "AddCustomCallInstruction (" << GetVersionedHandleInternal()
-          << "), data handle " << handle.handle() << ": "
-          << custom_call_request.ShortDebugString();
-  return handle;
-}
-
-StatusOr<ComputationDataHandle> UserComputation::AddHostComputeInstruction(
-    const HostComputeRequest& host_compute_request) {
-  tensorflow::mutex_lock lock(mutex_);
-
-  for (const ComputationDataHandle& handle : host_compute_request.operands()) {
-    TF_RETURN_IF_ERROR(LookUpRequest(handle).status());
-  }
-
-  ComputationDataHandle handle = CreateComputationDataHandle();
-  OperationRequest& request =
-      (*session_computation_.mutable_requests())[handle.handle()];
-  *request.mutable_output_handle() = handle;
-  *request.mutable_output_shape() = host_compute_request.shape();
-  *request.mutable_request()->mutable_host_compute_request() =
-      host_compute_request;
-
-  VLOG(1) << "AddHostComputeInstruction (" << GetVersionedHandleInternal()
-          << "), data handle " << handle.handle() << ": "
-          << host_compute_request.ShortDebugString();
-  return handle;
-}
-
-StatusOr<ComputationDataHandle> UserComputation::AddDotInstruction(
-    const DotRequest& dot_request) {
-  tensorflow::mutex_lock lock(mutex_);
-
-  TF_ASSIGN_OR_RETURN(const OperationRequest* lhs,
-                      LookUpRequest(dot_request.lhs()));
-  TF_ASSIGN_OR_RETURN(const OperationRequest* rhs,
-                      LookUpRequest(dot_request.rhs()));
-
-  TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferDotOpShape(
-                                       lhs->output_shape(), rhs->output_shape(),
-                                       dot_request.dimension_numbers()));
-
-  const ComputationDataHandle handle = CreateComputationDataHandle();
-
-  OperationRequest& request =
-      (*session_computation_.mutable_requests())[handle.handle()];
-  *request.mutable_output_handle() = handle;
-  *request.mutable_output_shape() = shape;
-  *request.mutable_request()->mutable_dot_request() = dot_request;
-
-  VLOG(1) << "AddDotInstruction (" << GetVersionedHandleInternal()
-          << "), data handle " << handle.handle() << ": "
-          << dot_request.ShortDebugString();
-  return handle;
-}
-
-StatusOr<ComputationDataHandle> UserComputation::AddUnaryInstruction(
-    const UnaryOpRequest& unary_request) {
-  tensorflow::mutex_lock lock(mutex_);
-
-  TF_ASSIGN_OR_RETURN(const OperationRequest* operand,
-                      LookUpRequest(unary_request.operand()));
-  TF_ASSIGN_OR_RETURN(
-      Shape shape, ShapeInference::InferUnaryOpShape(unary_request.unop(),
-                                                     operand->output_shape()));
-
-  ComputationDataHandle handle = CreateComputationDataHandle();
-
-  OperationRequest& request =
-      (*session_computation_.mutable_requests())[handle.handle()];
-  *request.mutable_output_handle() = handle;
-  *request.mutable_output_shape() = shape;
-  *request.mutable_request()->mutable_unary_op_request() = unary_request;
-
-  VLOG(1) << "AddUnaryInstruction (" << GetVersionedHandleInternal()
-          << "), data handle " << handle.handle() << ": "
-          << unary_request.ShortDebugString();
-  return handle;
-}
-
-StatusOr<ComputationDataHandle> UserComputation::AddBinaryInstruction(
-    const BinaryOpRequest& binary_request) {
-  tensorflow::mutex_lock lock(mutex_);
-
-  TF_ASSIGN_OR_RETURN(const OperationRequest* lhs,
-                      LookUpRequest(binary_request.lhs()));
-  TF_ASSIGN_OR_RETURN(const OperationRequest* rhs,
-                      LookUpRequest(binary_request.rhs()));
-  TF_ASSIGN_OR_RETURN(
-      Shape shape,
-      ShapeInference::InferBinaryOpShape(
-          binary_request.binop(), lhs->output_shape(), rhs->output_shape(),
-          AsInt64Slice(binary_request.broadcast_dimensions())));
-
-  ComputationDataHandle handle = CreateComputationDataHandle();
-
-  OperationRequest& request =
-      (*session_computation_.mutable_requests())[handle.handle()];
-  *request.mutable_output_handle() = handle;
-  *request.mutable_output_shape() = shape;
-  *request.mutable_request()->mutable_binary_op_request() = binary_request;
-
-  VLOG(1) << "AddBinaryInstruction (" << GetVersionedHandleInternal()
-          << "), data handle " << handle.handle() << ": "
-          << binary_request.ShortDebugString();
-  return handle;
-}
-
-StatusOr<ComputationDataHandle> UserComputation::AddTernaryInstruction(
-    const TernaryOpRequest& ternary_request) {
-  tensorflow::mutex_lock lock(mutex_);
-
-  TF_ASSIGN_OR_RETURN(const OperationRequest* lhs,
-                      LookUpRequest(ternary_request.lhs()));
-  TF_ASSIGN_OR_RETURN(const OperationRequest* rhs,
-                      LookUpRequest(ternary_request.rhs()));
-  TF_ASSIGN_OR_RETURN(const OperationRequest* ehs,
-                      LookUpRequest(ternary_request.ehs()));
-  TF_ASSIGN_OR_RETURN(Shape shape,
-                      ShapeInference::InferTernaryOpShape(
-                          ternary_request.triop(), lhs->output_shape(),
-                          rhs->output_shape(), ehs->output_shape()));
-
-  ComputationDataHandle handle = CreateComputationDataHandle();
-
-  OperationRequest& request =
-      (*session_computation_.mutable_requests())[handle.handle()];
-  *request.mutable_output_handle() = handle;
-  *request.mutable_output_shape() = shape;
-  *request.mutable_request()->mutable_ternary_op_request() = ternary_request;
-
-  VLOG(1) << "AddTernaryInstruction (" << GetVersionedHandleInternal()
-          << "), data handle " << handle.handle() << ": "
-          << ternary_request.ShortDebugString();
-  return handle;
-}
-
-StatusOr<ComputationDataHandle> UserComputation::AddVariadicInstruction(
-    const VariadicOpRequest& variadic_request) {
-  tensorflow::mutex_lock lock(mutex_);
-
-  std::vector<const Shape*> operand_shapes;
-  for (const ComputationDataHandle& handle : variadic_request.operands()) {
-    TF_ASSIGN_OR_RETURN(const OperationRequest* operand, LookUpRequest(handle));
-    operand_shapes.push_back(&operand->output_shape());
-  }
-
-  TF_ASSIGN_OR_RETURN(Shape shape,
-                      ShapeInference::InferVariadicOpShape(
-                          variadic_request.varop(), operand_shapes));
-
-  ComputationDataHandle handle = CreateComputationDataHandle();
-
-  OperationRequest& request =
-      (*session_computation_.mutable_requests())[handle.handle()];
-  *request.mutable_output_handle() = handle;
-  *request.mutable_output_shape() = shape;
-  *request.mutable_request()->mutable_variadic_op_request() = variadic_request;
-
-  VLOG(1) << "AddVariadicInstruction (" << GetVersionedHandleInternal()
-          << "), data handle " << handle.handle() << ": "
-          << variadic_request.ShortDebugString();
-  return handle;
-}
-
-StatusOr<Shape> UserComputation::GetShape(const ComputationDataHandle& handle) {
-  tensorflow::mutex_lock lock(mutex_);
-
-  TF_ASSIGN_OR_RETURN(const OperationRequest* operand, LookUpRequest(handle));
-  return operand->output_shape();
-}
-
-Status UserComputation::SetOpMetadata(const ComputationDataHandle& handle,
-                                      const OpMetadata& metadata) {
-  tensorflow::mutex_lock lock(mutex_);
-
-  int64 handle_value = handle.handle();
-  if (session_computation_.requests().count(handle_value) == 0) {
-    return InvalidArgument("Invalid handle in SetOpMetadata (%lld)",
-                           handle_value);
-  }
-  *session_computation_.mutable_requests()
-       ->at(handle_value)
-       .mutable_request()
-       ->mutable_metadata() = metadata;
-  return Status::OK();
-}
-
-Status UserComputation::SetOpSharding(const ComputationDataHandle& handle,
-                                      const OpSharding& sharding) {
-  tensorflow::mutex_lock lock(mutex_);
-
-  int64 handle_value = handle.handle();
-  if (session_computation_.requests().count(handle_value) == 0) {
-    return InvalidArgument("Invalid handle in SetOpSharding (%lld)",
-                           handle_value);
-  }
-  *session_computation_.mutable_requests()
-       ->at(handle_value)
-       .mutable_request()
-       ->mutable_sharding() = sharding;
-  return Status::OK();
-}
-
-Status UserComputation::SetReturnValue(const ComputationDataHandle& handle) {
-  tensorflow::mutex_lock lock(mutex_);
-
-  if (!(handle.handle() > 0 && handle.handle() < next_handle_value_)) {
-    return InvalidArgument("Invalid handle in SetReturnValue");
-  }
-
-  handle_to_return_ = handle;
-
-  VLOG(1) << "SetReturnValue of computation \"" << name() << "\" fixed to "
-          << GetVersionedHandleInternal();
-
-  return Status::OK();
-}
-
-VersionedComputationHandle UserComputation::GetVersionedHandle() const {
-  tensorflow::mutex_lock lock(mutex_);
-  return GetVersionedHandleInternal();
-}
-
-VersionedComputationHandle UserComputation::GetVersionedHandleInternal() const {
-  VersionedComputationHandle versioned_handle;
-  versioned_handle.handle = session_computation_.computation_handle();
-
-  if (handle_to_return_.handle() > 0) {
-    // A specific handle has been requested for the result of the computation.
-    versioned_handle.version = handle_to_return_.handle();
-  } else {
-    // A version value is simply the most recently assigned
-    // ComputationDataHandle value, ie the handle value of the root of the
-    // computation.
-    versioned_handle.version = next_handle_value_ - 1;
-  }
-
-  return versioned_handle;
-}
-
-VersionedComputationHandle UserComputation::GetVersionedHandleAtOperation(
-    const ComputationDataHandle& operation) const {
-  tensorflow::mutex_lock lock(mutex_);
-
-  // The version at which an operation was added is simply the handle value of
-  // the ComputationDataHandle.
-  VersionedComputationHandle versioned_handle;
-  versioned_handle.handle = session_computation_.computation_handle();
-  versioned_handle.version = operation.handle();
-  return versioned_handle;
-}
-
-VersionedComputationHandle::Version UserComputation::version() const {
-  return GetVersionedHandle().version;
-}
-
-namespace {
-
-// Returns true if the operation type corresponding to the given opcase can be
-// the root of the computation.
-bool CanBeRoot(const OpRequest::OpCase& op_case) {
-  switch (op_case) {
-    case OpRequest::kTraceRequest:
-    case OpRequest::kSendRequest:
-    case OpRequest::kOutfeedRequest:
-      return false;
-    default:
-      return true;
-  }
-}
-
-// Returns a pointer to the operation with the given data handle value in the
-// given SessionComputation.
-StatusOr<const OperationRequest*> LookUpRequest(
-    int64 handle_value, const SessionComputation& session_computation) {
-  if (session_computation.requests().count(handle_value) == 0) {
-    return InvalidArgument("no ComputationDataHandle value %lld", handle_value);
-  }
-  return &session_computation.requests().at(handle_value);
-}
-
-// Returns the OperationRequest corresponding to the root (result) of the
-// session computation.
-StatusOr<const OperationRequest*> GetRoot(
-    VersionedComputationHandle::Version version,
-    const SessionComputation& session_computation) {
-  TF_RET_CHECK(version > 0);
-  // Not all instructions can be roots. Walk backwards from the operation
-  // indicated by this version until a valid root is found.
-  const OperationRequest* root_request = nullptr;
-  while (version > 0) {
-    TF_ASSIGN_OR_RETURN(root_request,
-                        LookUpRequest(version, session_computation));
-    if (CanBeRoot(root_request->request().op_case())) {
-      break;
-    }
-    version--;
-  }
-  if (version == 0) {
-    return InternalError("Computation contains no root operation");
-  }
-  return root_request;
-}
-
-}  // namespace
-
-StatusOr<std::shared_ptr<const ProgramShape>>
-UserComputation::ComputeProgramShape(
-    VersionedComputationHandle::Version version) const {
-  tensorflow::mutex_lock lock(mutex_);
-
-  TF_RET_CHECK(version > 0 && version < next_handle_value_);
-
-  if (program_shape_ == nullptr || program_shape_version_ != version) {
-    // ProgramShape has not been computed yet, or is for different
-    // version. Compute it now.
-    TF_RETURN_IF_ERROR(CheckParametersAreContiguous(version));
-
-    auto program_shape = MakeUnique<ProgramShape>();
-    for (int64 request_num = 1; request_num <= version; ++request_num) {
-      const OperationRequest& request =
-          session_computation_.requests().at(request_num);
-      if (request.request().op_case() == OpRequest::kParameterRequest) {
-        const ParameterRequest& parameter_request =
-            request.request().parameter_request();
-        int64 param_no = parameter_request.parameter();
-        // Parameters may be out of order so expand ProgramShape parameters
-        // until it is at least large enough to hold the current parameter
-        // number.
-        while (program_shape->parameters_size() <= param_no) {
-          program_shape->add_parameters();
-          program_shape->add_parameter_names();
-        }
-        *program_shape->mutable_parameters(param_no) = request.output_shape();
-        *program_shape->mutable_parameter_names(param_no) =
-            parameter_request.name();
-      }
-    }
-
-    // The root determines the output shape.
-    TF_ASSIGN_OR_RETURN(const OperationRequest* root_request,
-                        GetRoot(version, session_computation_));
-    *program_shape->mutable_result() = root_request->output_shape();
-    if (ShapeUtil::IsOpaque(program_shape->result())) {
-      return Unimplemented("Computation results cannot be opaque");
-    }
-
-    program_shape_ = std::move(program_shape);
-    program_shape_version_ = version;
-  }
-
-  return program_shape_;
-}
-
-namespace {
-
-// A visitor which checks whether an operation is pure functional meaning that
-// it doesn't depend on any parameter with an index higher then num_parameters.
-// The visitor walks the computation starting at a given operation and sets
-// is_functional to false iff a parameter or RNG operation is encountered.
-void PureFunctionalVisitor(const SessionComputation& session_computation,
-                           const ComputationDataHandle& handle,
-                           int64 num_parameters, std::set<int64>* visited,
-                           bool* is_functional) {
-  if (visited->count(handle.handle()) != 0 || !*is_functional) {
-    return;
-  }
-
-  const OperationRequest& request =
-      session_computation.requests().at(handle.handle());
-  switch (request.request().op_case()) {
-    case OpRequest::kRngRequest:
-      *is_functional = false;
-      break;
-
-    case OpRequest::kConstantRequest:
-      break;
-
-    case OpRequest::kGetTupleElementRequest: {
-      const GetTupleElementRequest& get_tuple_element_request =
-          request.request().get_tuple_element_request();
-      PureFunctionalVisitor(session_computation,
-                            get_tuple_element_request.operand(), num_parameters,
-                            visited, is_functional);
-      break;
-    }
-
-    case OpRequest::kSliceRequest: {
-      const SliceRequest& slice_request = request.request().slice_request();
-      PureFunctionalVisitor(session_computation, slice_request.operand(),
-                            num_parameters, visited, is_functional);
-      break;
-    }
-
-    case OpRequest::kDynamicSliceRequest: {
-      const DynamicSliceRequest& dynamic_slice_request =
-          request.request().dynamic_slice_request();
-      PureFunctionalVisitor(session_computation,
-                            dynamic_slice_request.operand(), num_parameters,
-                            visited, is_functional);
-      PureFunctionalVisitor(session_computation,
-                            dynamic_slice_request.start_indices(),
-                            num_parameters, visited, is_functional);
-      break;
-    }
-
-    case OpRequest::kDynamicUpdateSliceRequest: {
-      const DynamicUpdateSliceRequest& dynamic_update_slice_request =
-          request.request().dynamic_update_slice_request();
-      PureFunctionalVisitor(session_computation,
-                            dynamic_update_slice_request.operand(),
-                            num_parameters, visited, is_functional);
-      PureFunctionalVisitor(session_computation,
-                            dynamic_update_slice_request.update(),
-                            num_parameters, visited, is_functional);
-      PureFunctionalVisitor(session_computation,
-                            dynamic_update_slice_request.start_indices(),
-                            num_parameters, visited, is_functional);
-      break;
-    }
-
-    case OpRequest::kConcatenateRequest: {
-      const ConcatenateRequest& concatenate_request =
-          request.request().concatenate_request();
-      for (const ComputationDataHandle& handle :
-           concatenate_request.operands()) {
-        PureFunctionalVisitor(session_computation, handle, num_parameters,
-                              visited, is_functional);
-      }
-      break;
-    }
-
-    case OpRequest::kConvolveRequest: {
-      const ConvolveRequest& convolve_request =
-          request.request().convolve_request();
-      PureFunctionalVisitor(session_computation, convolve_request.lhs(),
-                            num_parameters, visited, is_functional);
-      PureFunctionalVisitor(session_computation, convolve_request.rhs(),
-                            num_parameters, visited, is_functional);
-      break;
-    }
-
-    case OpRequest::kFftRequest: {
-      const FftRequest& fft_request = request.request().fft_request();
-      PureFunctionalVisitor(session_computation, fft_request.operand(),
-                            num_parameters, visited, is_functional);
-      break;
-    }
-
-    case OpRequest::kCrossReplicaSumRequest: {
-      // TODO(b/33009255): Implmement constant folding for cross replica sum.
-      *is_functional = false;
-      break;
-    }
-
-    case OpRequest::kInfeedRequest: {
-      *is_functional = false;
-      break;
-    }
-
-    case OpRequest::kOutfeedRequest: {
-      *is_functional = false;
-      break;
-    }
-
-    case OpRequest::kHostComputeRequest: {
-      *is_functional = false;
-      break;
-    }
-
-    case OpRequest::kCallRequest: {
-      const CallRequest& call_request = request.request().call_request();
-      for (const ComputationDataHandle& handle : call_request.operands()) {
-        PureFunctionalVisitor(session_computation, handle, num_parameters,
-                              visited, is_functional);
-      }
-      // TODO(b/32495713): We aren't checking the to_apply computation itself,
-      // so we conservatively say that computations containing the Call op
-      // cannot be constant.  We cannot set is_functional=false in other similar
-      // cases since we're already relying on IsConstant to return true.
-      *is_functional = false;
-      break;
-    }
-
-    case OpRequest::kCustomCallRequest: {
-      *is_functional = false;
-      break;
-    }
-
-    case OpRequest::kDotRequest: {
-      const DotRequest& dot_request = request.request().dot_request();
-      PureFunctionalVisitor(session_computation, dot_request.lhs(),
-                            num_parameters, visited, is_functional);
-      PureFunctionalVisitor(session_computation, dot_request.rhs(),
-                            num_parameters, visited, is_functional);
-      break;
-    }
-
-    case OpRequest::kSendRequest: {
-      *is_functional = false;
-      break;
-    }
-
-    case OpRequest::kRecvRequest: {
-      *is_functional = false;
-      break;
-    }
-
-    case OpRequest::kMapRequest: {
-      const MapRequest& map_request = request.request().map_request();
-      for (const ComputationDataHandle& handle : map_request.operands()) {
-        PureFunctionalVisitor(session_computation, handle, num_parameters,
-                              visited, is_functional);
-      }
-      // TODO(b/32495713): We aren't checking the to_apply computation itself.
-      break;
-    }
-
-    case OpRequest::kReduceRequest: {
-      const ReduceRequest& reduce_request = request.request().reduce_request();
-      PureFunctionalVisitor(session_computation, reduce_request.operand(),
-                            num_parameters, visited, is_functional);
-      PureFunctionalVisitor(session_computation, reduce_request.init_value(),
-                            num_parameters, visited, is_functional);
-      // TODO(b/32495713): We aren't checking the to_apply computation itself.
-      break;
-    }
-
-    case OpRequest::kReduceWindowRequest: {
-      const ReduceWindowRequest& reduce_window_request =
-          request.request().reduce_window_request();
-      PureFunctionalVisitor(session_computation,
-                            reduce_window_request.operand(), num_parameters,
-                            visited, is_functional);
-      PureFunctionalVisitor(session_computation,
-                            reduce_window_request.init_value(), num_parameters,
-                            visited, is_functional);
-      // TODO(b/32495713): We aren't checking the to_apply computation itself.
-      break;
-    }
-
-    case OpRequest::kSelectAndScatterRequest: {
-      const SelectAndScatterRequest& select_and_scatter_request =
-          request.request().select_and_scatter_request();
-      PureFunctionalVisitor(session_computation,
-                            select_and_scatter_request.operand(),
-                            num_parameters, visited, is_functional);
-      PureFunctionalVisitor(session_computation,
-                            select_and_scatter_request.source(), num_parameters,
-                            visited, is_functional);
-      PureFunctionalVisitor(session_computation,
-                            select_and_scatter_request.init_value(),
-                            num_parameters, visited, is_functional);
-      // TODO(b/32495713): We aren't checking the select and scatter
-      // computations themselves.
-      break;
-    }
-
-    case OpRequest::kBroadcastRequest: {
-      const BroadcastRequest& broadcast_request =
-          request.request().broadcast_request();
-      PureFunctionalVisitor(session_computation, broadcast_request.operand(),
-                            num_parameters, visited, is_functional);
-      break;
-    }
-
-    case OpRequest::kReshapeRequest: {
-      const ReshapeRequest& reshape_request =
-          request.request().reshape_request();
-      PureFunctionalVisitor(session_computation, reshape_request.operand(),
-                            num_parameters, visited, is_functional);
-      break;
-    }
-
-    case OpRequest::kReverseRequest: {
-      const ReverseRequest& reverse_request =
-          request.request().reverse_request();
-      PureFunctionalVisitor(session_computation, reverse_request.operand(),
-                            num_parameters, visited, is_functional);
-      break;
-    }
-
-    case OpRequest::kPadRequest: {
-      const PadRequest& pad_request = request.request().pad_request();
-      PureFunctionalVisitor(session_computation, pad_request.operand(),
-                            num_parameters, visited, is_functional);
-      PureFunctionalVisitor(session_computation, pad_request.padding_value(),
-                            num_parameters, visited, is_functional);
-      break;
-    }
-
-    case OpRequest::kParameterRequest: {
-      const ParameterRequest& parameter_request =
-          request.request().parameter_request();
-      if (parameter_request.parameter() >= num_parameters) {
-        *is_functional = false;
-      }
-      break;
-    }
-
-    case OpRequest::kConvertRequest: {
-      const ConvertRequest& convert_request =
-          request.request().convert_request();
-      PureFunctionalVisitor(session_computation, convert_request.operand(),
-                            num_parameters, visited, is_functional);
-      break;
-    }
-
-    case OpRequest::kBitcastConvertRequest: {
-      const ConvertRequest& convert_request =
-          request.request().bitcast_convert_request();
-      PureFunctionalVisitor(session_computation, convert_request.operand(),
-                            num_parameters, visited, is_functional);
-      break;
-    }
-
-    case OpRequest::kWhileRequest: {
-      const WhileRequest& while_request = request.request().while_request();
-      PureFunctionalVisitor(session_computation, while_request.init(),
-                            num_parameters, visited, is_functional);
-      // TODO(b/32495713): We aren't checking the condition and body
-      // computations themselves.
-      *is_functional = false;
-      break;
-    }
-
-    case OpRequest::kConditionalRequest: {
-      const ConditionalRequest& conditional_request =
-          request.request().conditional_request();
-      PureFunctionalVisitor(session_computation,
-                            conditional_request.predicate(), num_parameters,
-                            visited, is_functional);
-      PureFunctionalVisitor(session_computation,
-                            conditional_request.true_operand(), num_parameters,
-                            visited, is_functional);
-      PureFunctionalVisitor(session_computation,
-                            conditional_request.false_operand(), num_parameters,
-                            visited, is_functional);
-      // TODO(b/32495713): We aren't checking the true and false computations
-      // themselves.
-      break;
-    }
-
-    case OpRequest::kTernaryOpRequest: {
-      const TernaryOpRequest& ternary_op_request =
-          request.request().ternary_op_request();
-      PureFunctionalVisitor(session_computation, ternary_op_request.lhs(),
-                            num_parameters, visited, is_functional);
-      PureFunctionalVisitor(session_computation, ternary_op_request.rhs(),
-                            num_parameters, visited, is_functional);
-      PureFunctionalVisitor(session_computation, ternary_op_request.ehs(),
-                            num_parameters, visited, is_functional);
-      break;
-    }
-
-    case OpRequest::kTransposeRequest: {
-      const TransposeRequest& transpose_request =
-          request.request().transpose_request();
-      PureFunctionalVisitor(session_computation, transpose_request.operand(),
-                            num_parameters, visited, is_functional);
-      break;
-    }
-
-    case OpRequest::kVariadicOpRequest: {
-      const VariadicOpRequest& variadic_op_request =
-          request.request().variadic_op_request();
-      for (const ComputationDataHandle& handle :
-           variadic_op_request.operands()) {
-        PureFunctionalVisitor(session_computation, handle, num_parameters,
-                              visited, is_functional);
-      }
-      break;
-    }
-
-    case OpRequest::kUnaryOpRequest: {
-      const UnaryOpRequest& unary_op_request =
-          request.request().unary_op_request();
-      PureFunctionalVisitor(session_computation, unary_op_request.operand(),
-                            num_parameters, visited, is_functional);
-      break;
-    }
-
-    case OpRequest::kBatchNormTrainingRequest: {
-      const BatchNormTrainingRequest& batch_norm_training_request =
-          request.request().batch_norm_training_request();
-      PureFunctionalVisitor(session_computation,
-                            batch_norm_training_request.operand(),
-                            num_parameters, visited, is_functional);
-      PureFunctionalVisitor(session_computation,
-                            batch_norm_training_request.scale(), num_parameters,
-                            visited, is_functional);
-      PureFunctionalVisitor(session_computation,
-                            batch_norm_training_request.offset(),
-                            num_parameters, visited, is_functional);
-      break;
-    }
-
-    case OpRequest::kBatchNormInferenceRequest: {
-      const BatchNormInferenceRequest& batch_norm_inference_request =
-          request.request().batch_norm_inference_request();
-      PureFunctionalVisitor(session_computation,
-                            batch_norm_inference_request.operand(),
-                            num_parameters, visited, is_functional);
-      PureFunctionalVisitor(session_computation,
-                            batch_norm_inference_request.scale(),
-                            num_parameters, visited, is_functional);
-      PureFunctionalVisitor(session_computation,
-                            batch_norm_inference_request.offset(),
-                            num_parameters, visited, is_functional);
-      PureFunctionalVisitor(session_computation,
-                            batch_norm_inference_request.mean(), num_parameters,
-                            visited, is_functional);
-      PureFunctionalVisitor(session_computation,
-                            batch_norm_inference_request.variance(),
-                            num_parameters, visited, is_functional);
-      break;
-    }
-
-    case OpRequest::kBatchNormGradRequest: {
-      const BatchNormGradRequest& batch_norm_grad_request =
-          request.request().batch_norm_grad_request();
-      PureFunctionalVisitor(session_computation,
-                            batch_norm_grad_request.operand(), num_parameters,
-                            visited, is_functional);
-      PureFunctionalVisitor(session_computation,
-                            batch_norm_grad_request.scale(), num_parameters,
-                            visited, is_functional);
-      PureFunctionalVisitor(session_computation, batch_norm_grad_request.mean(),
-                            num_parameters, visited, is_functional);
-      PureFunctionalVisitor(session_computation,
-                            batch_norm_grad_request.variance(), num_parameters,
-                            visited, is_functional);
-      PureFunctionalVisitor(session_computation,
-                            batch_norm_grad_request.grad_output(),
-                            num_parameters, visited, is_functional);
-      break;
-    }
-
-    case OpRequest::kBinaryOpRequest: {
-      const BinaryOpRequest& binary_op_request =
-          request.request().binary_op_request();
-      PureFunctionalVisitor(session_computation, binary_op_request.lhs(),
-                            num_parameters, visited, is_functional);
-      PureFunctionalVisitor(session_computation, binary_op_request.rhs(),
-                            num_parameters, visited, is_functional);
-      break;
-    }
-
-    case OpRequest::kGatherRequest: {
-      PureFunctionalVisitor(session_computation,
-                            request.request().gather_request().input(),
-                            num_parameters, visited, is_functional);
-      PureFunctionalVisitor(session_computation,
-                            request.request().gather_request().gather_indices(),
-                            num_parameters, visited, is_functional);
-      break;
-    }
-
-    case OpRequest::OP_NOT_SET:
-      LOG(FATAL) << "OperationRequest doesn't contain a request";
-
-    default:
-      LOG(FATAL) << "Unexpected request type: " << request.request().op_case();
-  }
-  if (!*is_functional) {
-    VLOG(1) << "Non-functional: " << request.request().DebugString();
-  }
-  visited->insert(handle.handle());
-}
-
-}  // namespace
-
-StatusOr<bool> UserComputation::IsConstant(const ComputationDataHandle& handle,
-                                           int64 num_parameters) {
-  tensorflow::mutex_lock lock(mutex_);
-
-  // Verify that the handle is valid.
-  auto operation_status = LookUpRequest(handle);
-  if (!operation_status.ok()) {
-    return operation_status.status();
-  }
-
-  bool is_constant = true;
-  std::set<int64> visited;
-  PureFunctionalVisitor(session_computation_, handle, num_parameters, &visited,
-                        &is_constant);
-
-  return is_constant;
-}
-
-std::vector<VersionedComputationHandle>
-UserComputation::GetEmbeddedComputations(
-    VersionedComputationHandle::Version version) const {
-  tensorflow::mutex_lock lock(mutex_);
-
-  VLOG(1)
-      << "GetEmbeddedComputations(" << name() << " "
-      << VersionedComputationHandle{session_computation_.computation_handle(),
-                                    version}
-      << ")";
-  XLA_VLOG_LINES(3, session_computation_.DebugString());
-
-  std::vector<VersionedComputationHandle> computations;
-  std::vector<int64> sorted_handles;
-  for (const auto& handle_request : session_computation_.requests()) {
-    sorted_handles.push_back(handle_request.first);
-  }
-  std::sort(sorted_handles.begin(), sorted_handles.end());
-  for (int64 handle : sorted_handles) {
-    const auto& handle_request = session_computation_.requests().find(handle);
-    CHECK(handle_request != session_computation_.requests().end());
-    int64 handle_value = handle_request->first;
-    if (handle_value <= version) {
-      const OperationRequest& request = handle_request->second;
-      switch (request.request().op_case()) {
-        case OpRequest::kCallRequest: {
-          CHECK_EQ(1, request.embedded_computation_versions_size());
-          const CallRequest& call_request = request.request().call_request();
-          const VersionedComputationHandle versioned_handle = {
-              call_request.to_apply(),
-              request.embedded_computation_versions(0)};
-          computations.push_back(versioned_handle);
-          break;
-        }
-
-        case OpRequest::kMapRequest: {
-          CHECK_EQ(1, request.embedded_computation_versions_size());
-          const MapRequest& map_request = request.request().map_request();
-          const VersionedComputationHandle versioned_handle = {
-              map_request.to_apply(), request.embedded_computation_versions(0)};
-          computations.push_back(versioned_handle);
-          break;
-        }
-
-        case OpRequest::kReduceRequest: {
-          CHECK_EQ(1, request.embedded_computation_versions_size());
-          const ReduceRequest& reduce_request =
-              request.request().reduce_request();
-          const VersionedComputationHandle versioned_handle = {
-              reduce_request.to_apply(),
-              request.embedded_computation_versions(0)};
-          computations.push_back(versioned_handle);
-          break;
-        }
-
-        case OpRequest::kReduceWindowRequest: {
-          CHECK_EQ(1, request.embedded_computation_versions_size());
-          const ReduceWindowRequest& reduce_window_request =
-              request.request().reduce_window_request();
-          const VersionedComputationHandle versioned_handle = {
-              reduce_window_request.to_apply(),
-              request.embedded_computation_versions(0)};
-          computations.push_back(versioned_handle);
-          break;
-        }
-
-        case OpRequest::kSelectAndScatterRequest: {
-          CHECK_EQ(2, request.embedded_computation_versions_size());
-          const SelectAndScatterRequest& select_and_scatter_request =
-              request.request().select_and_scatter_request();
-          const VersionedComputationHandle select_versioned_handle = {
-              select_and_scatter_request.select(),
-              request.embedded_computation_versions(0)};
-          computations.push_back(select_versioned_handle);
-          const VersionedComputationHandle scatter_versioned_handle = {
-              select_and_scatter_request.scatter(),
-              request.embedded_computation_versions(1)};
-          computations.push_back(scatter_versioned_handle);
-          break;
-        }
-
-        case OpRequest::kWhileRequest: {
-          CHECK_EQ(2, request.embedded_computation_versions_size());
-          const WhileRequest& while_request = request.request().while_request();
-          const VersionedComputationHandle condition_versioned_handle = {
-              while_request.condition(),
-              request.embedded_computation_versions(0)};
-          computations.push_back(condition_versioned_handle);
-          const VersionedComputationHandle body_versioned_handle = {
-              while_request.body(), request.embedded_computation_versions(1)};
-          computations.push_back(body_versioned_handle);
-          break;
-        }
-
-        case OpRequest::kConditionalRequest: {
-          CHECK_EQ(2, request.embedded_computation_versions_size());
-          const ConditionalRequest& conditional_request =
-              request.request().conditional_request();
-          const VersionedComputationHandle true_computation_versioned_handle = {
-              conditional_request.true_computation(),
-              request.embedded_computation_versions(0)};
-          computations.push_back(true_computation_versioned_handle);
-          const VersionedComputationHandle false_computation_versioned_handle =
-              {conditional_request.false_computation(),
-               request.embedded_computation_versions(1)};
-          computations.push_back(false_computation_versioned_handle);
-          break;
-        }
-
-        default:
-          // No embedded computation.
-          break;
-      }
-    }
-  }
-  VLOG(2) << "Embedded computations: "
-          << tensorflow::str_util::Join(
-                 computations, ", ",
-                 [](string* out, const VersionedComputationHandle& h) {
-                   out->append(h.ToString());
-                 });
-  return computations;
-}
-
-StatusOr<const OperationRequest*>
-UserComputation::LookUpRequestForErrorReporting(
-    const ComputationDataHandle& handle) const {
-  tensorflow::mutex_lock lock(mutex_);
-  return LookUpRequest(handle);
-}
-
-tensorflow::gtl::optional<const OpMetadata*> UserComputation::ParameterMetadata(
-    int parameter_number) const {
-  tensorflow::mutex_lock lock(mutex_);
-  auto it = parameters_.find(parameter_number);
-  if (it == parameters_.end()) {
-    return tensorflow::gtl::nullopt;
-  }
-  OperationRequest* op = it->second;
-  return &op->request().metadata();
-}
-
-Status UserComputation::RemapEmbeddedComputations(
-    const std::map<int64, ComputationHandle>& old_to_new) {
-  auto update = [&old_to_new](ComputationHandle* to_update) -> Status {
-    int64 old = to_update->handle();
-    auto it = old_to_new.find(old);
-    if (it == old_to_new.end()) {
-      string mapping = tensorflow::str_util::Join(
-          old_to_new, ", ",
-          [](string* out, std::pair<int64, ComputationHandle> element) {
-            tensorflow::strings::Appendf(out, "%lld:%lld", element.first,
-                                         element.second.handle());
-          });
-      return NotFound(
-          "could not find referenced (old) computation handle in mapping: "
-          "%lld; mapping: {%s}",
-          old, mapping.c_str());
-    }
-    VLOG(2) << "remapping " << old << " to " << it->second.handle();
-    *to_update = it->second;
-    return Status::OK();
-  };
-  TF_RETURN_IF_ERROR(update(session_computation_.mutable_computation_handle()));
-  for (auto& handle_request : *session_computation_.mutable_requests()) {
-    OperationRequest& request = handle_request.second;
-    switch (request.request().op_case()) {
-      case OpRequest::kCallRequest: {
-        TF_RET_CHECK(1 == request.embedded_computation_versions_size());
-        CallRequest* call_request =
-            request.mutable_request()->mutable_call_request();
-        TF_RETURN_IF_ERROR(update(call_request->mutable_to_apply()));
-        break;
-      }
-      case OpRequest::kMapRequest: {
-        TF_RET_CHECK(1 == request.embedded_computation_versions_size());
-        MapRequest* map_request =
-            request.mutable_request()->mutable_map_request();
-        TF_RETURN_IF_ERROR(update(map_request->mutable_to_apply()));
-        break;
-      }
-      case OpRequest::kReduceRequest: {
-        TF_RET_CHECK(1 == request.embedded_computation_versions_size());
-        ReduceRequest* reduce_request =
-            request.mutable_request()->mutable_reduce_request();
-        TF_RETURN_IF_ERROR(update(reduce_request->mutable_to_apply()));
-        break;
-      }
-      case OpRequest::kReduceWindowRequest: {
-        TF_RET_CHECK(1 == request.embedded_computation_versions_size());
-        ReduceWindowRequest* reduce_window_request =
-            request.mutable_request()->mutable_reduce_window_request();
-        TF_RETURN_IF_ERROR(update(reduce_window_request->mutable_to_apply()));
-        break;
-      }
-      case OpRequest::kSelectAndScatterRequest: {
-        TF_RET_CHECK(2 == request.embedded_computation_versions_size());
-        SelectAndScatterRequest* select_and_scatter_request =
-            request.mutable_request()->mutable_select_and_scatter_request();
-        TF_RETURN_IF_ERROR(
-            update(select_and_scatter_request->mutable_select()));
-        TF_RETURN_IF_ERROR(
-            update(select_and_scatter_request->mutable_scatter()));
-        break;
-      }
-      case OpRequest::kWhileRequest: {
-        TF_RET_CHECK(2 == request.embedded_computation_versions_size());
-        WhileRequest* while_request =
-            request.mutable_request()->mutable_while_request();
-        TF_RETURN_IF_ERROR(update(while_request->mutable_condition()));
-        TF_RETURN_IF_ERROR(update(while_request->mutable_body()));
-        break;
-      }
-      case OpRequest::kConditionalRequest: {
-        TF_RET_CHECK(2 == request.embedded_computation_versions_size());
-        ConditionalRequest* conditional_request =
-            request.mutable_request()->mutable_conditional_request();
-        TF_RETURN_IF_ERROR(
-            update(conditional_request->mutable_true_computation()));
-        TF_RETURN_IF_ERROR(
-            update(conditional_request->mutable_false_computation()));
-        break;
-      }
-      default:
-        // No embedded computation.
-        TF_RET_CHECK(0 == request.embedded_computation_versions_size());
-        break;
-    }
-  }
-  return Status::OK();
-}
-
-SessionComputation UserComputation::CloneSessionComputation(
-    VersionedComputationHandle::Version version) const {
-  tensorflow::mutex_lock lock(mutex_);
-  SessionComputation result = session_computation_;
-  // Erase all the requests that exceed the version specified.
-  // There's no lower_bound method on tensorflow::protobuf::Map so we iterate
-  // all the elements.
-  auto it = result.mutable_requests()->begin();
-  while (it != result.mutable_requests()->end()) {
-    if (it->first > version) {
-      it = result.mutable_requests()->erase(it);
-    } else {
-      ++it;
-    }
-  }
-  return result;
-}
-
-StatusOr<const OperationRequest*> UserComputation::LookUpRequest(
-    const ComputationDataHandle& handle) const {
-  int64 handle_value = handle.handle();
-  if (session_computation_.requests().count(handle_value) == 0) {
-    return InvalidArgument("no ComputationDataHandle value %lld", handle_value);
-  }
-  return &session_computation_.requests().at(handle_value);
-}
-
-Status UserComputation::CheckParametersAreContiguous(
-    VersionedComputationHandle::Version version) const {
-  TF_RET_CHECK(version > 0 && version < next_handle_value_);
-
-  // Determine number of parameter inputs at the given version.
-  std::map<int64, const ParameterRequest*> parameter_requests;
-  for (int64 request_num = 1; request_num <= version; ++request_num) {
-    const OperationRequest& request =
-        session_computation_.requests().at(request_num);
-
-    if (request.request().op_case() == OpRequest::kParameterRequest) {
-      const ParameterRequest& parameter_request =
-          request.request().parameter_request();
-      // Duplicate parameters should be checked when parameter requests are
-      // added.
-      TF_RET_CHECK(0 ==
-                   parameter_requests.count(parameter_request.parameter()));
-      parameter_requests[parameter_request.parameter()] = &parameter_request;
-    }
-  }
-
-  for (int64 i = 0; i < parameter_requests.size(); ++i) {
-    auto it = parameter_requests.find(i);
-    if (it == parameter_requests.end()) {
-      return FailedPrecondition(
-          "computation %s does not have all its parameters populated "
-          "sequentially, missing parameter %lld",
-          name_.c_str(), i);
-    }
-  }
-
-  return Status::OK();
-}
-
-namespace {
-
-// Helper class which builds an HLO computation from a SessionComputation. To
-// construct the HLO computation, the SessionComputation graph is walked in
-// DFS order lowering each OperationRequest to an HLO instruction.
-class ComputationLowerer {
- public:
-  static StatusOr<std::unique_ptr<HloComputation>> Lower(
-      const string& computation_name,
-      const SessionComputation& session_computation,
-      VersionedComputationHandle::Version version,
-      UserComputation::HloComputationResolver hlo_resolver,
-      const DebugOptions& debug_options,
-      bool include_unreachable_instructions) {
-    ComputationLowerer lowerer(computation_name, session_computation, version,
-                               std::move(hlo_resolver), debug_options,
-                               include_unreachable_instructions);
-    return lowerer.Lower();
-  }
-
- private:
-  ComputationLowerer(const string& computation_name,
-                     const SessionComputation& session_computation,
-                     VersionedComputationHandle::Version version,
-                     UserComputation::HloComputationResolver hlo_resolver,
-                     const DebugOptions& debug_options,
-                     bool include_unreachable_instructions)
-      : hlo_builder_(computation_name),
-        session_computation_(session_computation),
-        version_(version),
-        hlo_resolver_(std::move(hlo_resolver)),
-        debug_options_(debug_options),
-        include_unreachable_instructions_(include_unreachable_instructions) {}
-
-  // Build an HLO computation from the SessionComputation at the given
-  // version.
-  StatusOr<std::unique_ptr<HloComputation>> Lower();
-
- private:
-  // Traverses the computation 'root' using a DFS, calling 'visit' in postorder.
-  void TraversePostorder(
-      const ComputationDataHandle& root,
-      std::unordered_map<int64, HloInstruction*>* visited,
-      const std::function<void(const ComputationDataHandle&)>& visit);
-
-  // DFS visitor of the UserComputation operations which lowers the operations
-  // to HLO instructions.
-  void Visit(const ComputationDataHandle& handle,
-             std::unordered_map<int64, HloInstruction*>* instructions);
-
-  // Resolves a ComputationHandle and Version to a previously lowered
-  // HloComputation using the hlo_resolver_ function.
-  HloComputation* ResolveComputation(
-      const ComputationHandle& handle,
-      VersionedComputationHandle::Version version);
-
-  // This function takes an input value which is being implicitly broadcast into
-  // an output shape and figures out the right kBroadcast instruction(s)
-  // necessary to replicate the implicit broadcast semantics explicitly.
-  HloInstruction* ImplicitBroadcastToExplicitBroadcast(
-      HloInstruction* operand, const Shape& output_shape);
-
-  HloComputation::Builder hlo_builder_;
-  const SessionComputation& session_computation_;
-  const VersionedComputationHandle::Version version_;
-  const UserComputation::HloComputationResolver hlo_resolver_;
-  const DebugOptions& debug_options_;
-  const bool include_unreachable_instructions_;
-};
-
-// Calls 'apply' on each operand of 'request'.
-static void ForEachOperand(
-    const OperationRequest& request,
-    const std::function<void(const ComputationDataHandle& param)>& apply) {
-  switch (request.request().op_case()) {
-    case OpRequest::kRngRequest: {
-      const RngRequest& rng_request = request.request().rng_request();
-      for (const ComputationDataHandle& param : rng_request.parameter()) {
-        apply(param);
-      }
-      break;
-    }
-
-    case OpRequest::kConstantRequest:
-      break;
-    case OpRequest::kGetTupleElementRequest: {
-      const GetTupleElementRequest& get_tuple_element_request =
-          request.request().get_tuple_element_request();
-      apply(get_tuple_element_request.operand());
-      break;
-    }
-
-    case OpRequest::kSliceRequest: {
-      const SliceRequest& slice_request = request.request().slice_request();
-      apply(slice_request.operand());
-      break;
-    }
-
-    case OpRequest::kDynamicSliceRequest: {
-      const DynamicSliceRequest& dynamic_slice_request =
-          request.request().dynamic_slice_request();
-      apply(dynamic_slice_request.operand());
-      apply(dynamic_slice_request.start_indices());
-      break;
-    }
-
-    case OpRequest::kDynamicUpdateSliceRequest: {
-      const DynamicUpdateSliceRequest& dynamic_update_slice_request =
-          request.request().dynamic_update_slice_request();
-      apply(dynamic_update_slice_request.operand());
-      apply(dynamic_update_slice_request.update());
-      apply(dynamic_update_slice_request.start_indices());
-      break;
-    }
-
-    case OpRequest::kConcatenateRequest: {
-      const ConcatenateRequest& concatenate_request =
-          request.request().concatenate_request();
-      for (const ComputationDataHandle& handle :
-           concatenate_request.operands()) {
-        apply(handle);
-      }
-      break;
-    }
-
-    case OpRequest::kConvolveRequest: {
-      const ConvolveRequest& convolve_request =
-          request.request().convolve_request();
-      apply(convolve_request.lhs());
-      apply(convolve_request.rhs());
-      break;
-    }
-
-    case OpRequest::kFftRequest: {
-      const FftRequest& fft_request = request.request().fft_request();
-      apply(fft_request.operand());
-      break;
-    }
-
-    case OpRequest::kBatchNormTrainingRequest: {
-      const BatchNormTrainingRequest& batch_norm_training_request =
-          request.request().batch_norm_training_request();
-
-      apply(batch_norm_training_request.operand());
-      apply(batch_norm_training_request.scale());
-      apply(batch_norm_training_request.offset());
-      break;
-    }
-
-    case OpRequest::kBatchNormInferenceRequest: {
-      const BatchNormInferenceRequest& batch_norm_inference_request =
-          request.request().batch_norm_inference_request();
-
-      apply(batch_norm_inference_request.operand());
-      apply(batch_norm_inference_request.scale());
-      apply(batch_norm_inference_request.offset());
-      apply(batch_norm_inference_request.mean());
-      apply(batch_norm_inference_request.variance());
-      break;
-    }
-
-    case OpRequest::kBatchNormGradRequest: {
-      const BatchNormGradRequest& batch_norm_grad_request =
-          request.request().batch_norm_grad_request();
-
-      apply(batch_norm_grad_request.operand());
-      apply(batch_norm_grad_request.scale());
-      apply(batch_norm_grad_request.mean());
-      apply(batch_norm_grad_request.variance());
-      apply(batch_norm_grad_request.grad_output());
-      break;
-    }
-
-    case OpRequest::kCrossReplicaSumRequest: {
-      const CrossReplicaSumRequest& cross_replica_sum_request =
-          request.request().cross_replica_sum_request();
-      apply(cross_replica_sum_request.operand());
-      break;
-    }
-
-    case OpRequest::kInfeedRequest:
-      break;
-
-    case OpRequest::kOutfeedRequest: {
-      const OutfeedRequest& outfeed_request =
-          request.request().outfeed_request();
-      apply(outfeed_request.operand());
-      break;
-    }
-
-    case OpRequest::kMapRequest: {
-      const MapRequest& map_request = request.request().map_request();
-      for (const ComputationDataHandle& handle : map_request.operands()) {
-        apply(handle);
-      }
-      break;
-    }
-
-    case OpRequest::kReduceRequest: {
-      const ReduceRequest& reduce_request = request.request().reduce_request();
-      apply(reduce_request.operand());
-      apply(reduce_request.init_value());
-      break;
-    }
-
-    case OpRequest::kReduceWindowRequest: {
-      const ReduceWindowRequest& reduce_window_request =
-          request.request().reduce_window_request();
-      apply(reduce_window_request.operand());
-      apply(reduce_window_request.init_value());
-      break;
-    }
-
-    case OpRequest::kSelectAndScatterRequest: {
-      const SelectAndScatterRequest& select_and_scatter_request =
-          request.request().select_and_scatter_request();
-      apply(select_and_scatter_request.operand());
-      apply(select_and_scatter_request.source());
-      apply(select_and_scatter_request.init_value());
-
-      break;
-    }
-
-    case OpRequest::kBroadcastRequest: {
-      const BroadcastRequest& broadcast_request =
-          request.request().broadcast_request();
-      apply(broadcast_request.operand());
-      break;
-    }
-
-    case OpRequest::kReshapeRequest: {
-      const ReshapeRequest& reshape_request =
-          request.request().reshape_request();
-      apply(reshape_request.operand());
-      break;
-    }
-
-    case OpRequest::kTransposeRequest: {
-      const TransposeRequest& transpose_request =
-          request.request().transpose_request();
-      apply(transpose_request.operand());
-      break;
-    }
-
-    case OpRequest::kReverseRequest: {
-      const ReverseRequest& reverse_request =
-          request.request().reverse_request();
-      apply(reverse_request.operand());
-      break;
-    }
-
-    case OpRequest::kPadRequest: {
-      const PadRequest& pad_request = request.request().pad_request();
-      apply(pad_request.operand());
-      apply(pad_request.padding_value());
-      break;
-    }
-
-    case OpRequest::kRecvRequest:
-    case OpRequest::kParameterRequest:
-      break;
-
-    case OpRequest::kConvertRequest: {
-      const ConvertRequest& convert_request =
-          request.request().convert_request();
-      apply(convert_request.operand());
-      break;
-    }
-
-    case OpRequest::kBitcastConvertRequest: {
-      const ConvertRequest& convert_request =
-          request.request().bitcast_convert_request();
-      apply(convert_request.operand());
-      break;
-    }
-
-    case OpRequest::kWhileRequest: {
-      const WhileRequest& while_request = request.request().while_request();
-      apply(while_request.init());
-      break;
-    }
-
-    case OpRequest::kConditionalRequest: {
-      const ConditionalRequest& conditional_request =
-          request.request().conditional_request();
-      apply(conditional_request.predicate());
-      apply(conditional_request.true_operand());
-      apply(conditional_request.false_operand());
-      break;
-    }
-
-    case OpRequest::kTernaryOpRequest: {
-      const TernaryOpRequest& ternary_op_request =
-          request.request().ternary_op_request();
-      apply(ternary_op_request.lhs());
-      apply(ternary_op_request.rhs());
-      apply(ternary_op_request.ehs());
-      break;
-    }
-
-    case OpRequest::kVariadicOpRequest: {
-      const VariadicOpRequest& variadic_op_request =
-          request.request().variadic_op_request();
-      for (const ComputationDataHandle& handle :
-           variadic_op_request.operands()) {
-        apply(handle);
-      }
-      break;
-    }
-
-    case OpRequest::kCallRequest: {
-      const CallRequest& call_request = request.request().call_request();
-      for (const ComputationDataHandle& handle : call_request.operands()) {
-        apply(handle);
-      }
-      break;
-    }
-
-    case OpRequest::kCustomCallRequest: {
-      const CustomCallRequest& cc_request =
-          request.request().custom_call_request();
-      for (const ComputationDataHandle& operand : cc_request.operands()) {
-        apply(operand);
-      }
-      break;
-    }
-
-    case OpRequest::kHostComputeRequest: {
-      const HostComputeRequest& hc_request =
-          request.request().host_compute_request();
-      for (const ComputationDataHandle& operand : hc_request.operands()) {
-        apply(operand);
-      }
-      break;
-    }
-
-    case OpRequest::kDotRequest: {
-      const DotRequest& dot_request = request.request().dot_request();
-      apply(dot_request.rhs());
-      apply(dot_request.lhs());
-      break;
-    }
-
-    case OpRequest::kUnaryOpRequest: {
-      const UnaryOpRequest& unary_op_request =
-          request.request().unary_op_request();
-      apply(unary_op_request.operand());
-      break;
-    }
-
-    case OpRequest::kBinaryOpRequest: {
-      const BinaryOpRequest& binary_op_request =
-          request.request().binary_op_request();
-      apply(binary_op_request.rhs());
-      apply(binary_op_request.lhs());
-      break;
-    }
-
-    case OpRequest::kReducePrecisionRequest: {
-      const ReducePrecisionRequest& reduce_precision_request =
-          request.request().reduce_precision_request();
-      apply(reduce_precision_request.operand());
-      break;
-    }
-
-    case OpRequest::kTraceRequest: {
-      const TraceRequest& trace_request = request.request().trace_request();
-      apply(trace_request.operand());
-      break;
-    }
-
-    case OpRequest::kSendRequest: {
-      const SendRequest& send_request = request.request().send_request();
-      apply(send_request.operand());
-      break;
-    }
-
-    case OpRequest::kGatherRequest: {
-      const GatherRequest& gather_request = request.request().gather_request();
-      apply(gather_request.input());
-      apply(gather_request.gather_indices());
-      break;
-    }
-
-    case OpRequest::OP_NOT_SET:
-      LOG(FATAL) << "OperationRequest doesn't contain a request";
-
-    default:
-      LOG(FATAL) << "Unexpected request type: " << request.request().op_case();
-  }
-}
-
-void ComputationLowerer::TraversePostorder(
-    const ComputationDataHandle& root,
-    std::unordered_map<int64, HloInstruction*>* visited,
-    const std::function<void(const ComputationDataHandle&)>& visit) {
-  // Stack containing {handle, enter} pairs. The 'enter' value describes whether
-  // we are entering or leaving 'handle'.
-  std::stack<std::pair<ComputationDataHandle, bool>> work;
-  work.push({root, true});
-  while (!work.empty()) {
-    ComputationDataHandle handle;
-    bool enter;
-    std::tie(handle, enter) = work.top();
-    work.pop();
-
-    if (enter) {
-      // We are entering 'handle'. The first time we enter 'handle', we add it
-      // to 'visited' with a nullptr value. If 'handle' is already in 'visited',
-      // we do not visit it again. This algorithm only uses the presence of
-      // a handle in 'visited', but we use a map so we can use the same data
-      // structure to store the HloInstruction outputs.
-      if (visited->emplace(handle.handle(), nullptr).second) {
-        const OperationRequest& request =
-            session_computation_.requests().at(handle.handle());
-        // Push the corresponding 'leave' action onto the stack, followed by
-        // the operands.
-        work.push({handle, false});
-        ForEachOperand(request, [&work](const ComputationDataHandle& child) {
-          work.push({child, true});
-        });
-      }
-    } else {
-      // We are leaving 'handle'. We have visited the operands of 'handle', and
-      // now can visit the 'handle' itself.
-      visit(handle);
-    }
-  }
-}
-
-StatusOr<std::unique_ptr<HloComputation>> ComputationLowerer::Lower() {
-  // Map from ComputationDataHandle to HLO instruction. Serves as a record of
-  // which operations have been visited as well as a cache for looking up
-  // ComputationDataHandles as HloInstructions.
-  std::unordered_map<int64, HloInstruction*> instructions;
-
-  TF_ASSIGN_OR_RETURN(const OperationRequest* root_request,
-                      GetRoot(version_, session_computation_));
-
-  auto visit = [&](const ComputationDataHandle& handle) {
-    Visit(handle, &instructions);
-  };
-  TraversePostorder(root_request->output_handle(), &instructions, visit);
-  HloInstruction* hlo_root =
-      instructions.at(root_request->output_handle().handle());
-
-  if (include_unreachable_instructions_) {
-    // Iterate through all computation data handles, and visit any unvisited
-    // operations.
-    for (int64 request_num = 1; request_num <= version_; ++request_num) {
-      TF_ASSIGN_OR_RETURN(const OperationRequest* request,
-                          LookUpRequest(request_num, session_computation_));
-      TraversePostorder(request->output_handle(), &instructions, visit);
-    }
-  }
-
-  return hlo_builder_.Build(hlo_root);
-}
-
-HloComputation* ComputationLowerer::ResolveComputation(
-    const ComputationHandle& handle,
-    VersionedComputationHandle::Version version) {
-  const VersionedComputationHandle checked_handle = {handle, version};
-  return hlo_resolver_(checked_handle);
-}
-
-HloInstruction* ComputationLowerer::ImplicitBroadcastToExplicitBroadcast(
-    HloInstruction* operand, const Shape& output_shape) {
-  auto fadd = [this](std::unique_ptr<HloInstruction> x) {
-    return hlo_builder_.AddInstruction(std::move(x));
-  };
-  return fadd(
-      HloInstruction::CreateBroadcastSequence(output_shape, operand, fadd));
-}
-
-void ComputationLowerer::Visit(
-    const ComputationDataHandle& handle,
-    std::unordered_map<int64, HloInstruction*>* instructions) {
-  CHECK_LE(handle.handle(), version_);
-  CHECK(instructions->at(handle.handle()) == nullptr);
-  const OperationRequest& request =
-      session_computation_.requests().at(handle.handle());
-  auto add_instruction = [&](std::unique_ptr<HloInstruction> instruction) {
-    HloInstruction* hlo_instruction =
-        hlo_builder_.AddInstruction(std::move(instruction));
-    hlo_instruction->set_metadata(request.request().metadata());
-    if (request.request().has_sharding()) {
-      OpSharding op_sharding = request.request().sharding();
-      hlo_instruction->set_sharding(
-          HloSharding::FromProto(op_sharding).ValueOrDie());
-    }
-    return hlo_instruction;
-  };
-  auto lookup_instruction = [&](const ComputationDataHandle& handle) {
-    return instructions->at(handle.handle());
-  };
-  HloInstruction* hlo_instruction;
-  switch (request.request().op_case()) {
-    case OpRequest::kRngRequest: {
-      const RngRequest& rng_request = request.request().rng_request();
-      std::vector<HloInstruction*> parameters;
-      for (const ComputationDataHandle& param : rng_request.parameter()) {
-        parameters.push_back(lookup_instruction(param));
-      }
-      hlo_instruction = add_instruction(HloInstruction::CreateRng(
-          request.output_shape(), rng_request.distribution(), parameters));
-      break;
-    }
-
-    case OpRequest::kConstantRequest: {
-      const ConstantRequest& constant_request =
-          request.request().constant_request();
-      hlo_instruction = add_instruction(HloInstruction::CreateConstant(
-          Literal::CreateFromProto(constant_request.literal())
-              .ConsumeValueOrDie()));
-      break;
-    }
-
-    case OpRequest::kGetTupleElementRequest: {
-      const GetTupleElementRequest& get_tuple_element_request =
-          request.request().get_tuple_element_request();
-      HloInstruction* operand =
-          lookup_instruction(get_tuple_element_request.operand());
-      hlo_instruction = add_instruction(HloInstruction::CreateGetTupleElement(
-          request.output_shape(), operand, get_tuple_element_request.index()));
-      break;
-    }
-
-    case OpRequest::kSliceRequest: {
-      const SliceRequest& slice_request = request.request().slice_request();
-      HloInstruction* operand = lookup_instruction(slice_request.operand());
-      hlo_instruction = add_instruction(HloInstruction::CreateSlice(
-          request.output_shape(), operand,
-          AsInt64Slice(slice_request.start_indices()),
-          AsInt64Slice(slice_request.limit_indices()),
-          AsInt64Slice(slice_request.strides())));
-      break;
-    }
-
-    case OpRequest::kDynamicSliceRequest: {
-      const DynamicSliceRequest& dynamic_slice_request =
-          request.request().dynamic_slice_request();
-      HloInstruction* operand =
-          lookup_instruction(dynamic_slice_request.operand());
-      HloInstruction* start_indices =
-          lookup_instruction(dynamic_slice_request.start_indices());
-
-      hlo_instruction = add_instruction(HloInstruction::CreateDynamicSlice(
-          request.output_shape(), operand, start_indices,
-          AsInt64Slice(dynamic_slice_request.slice_sizes())));
-      break;
-    }
-
-    case OpRequest::kDynamicUpdateSliceRequest: {
-      const DynamicUpdateSliceRequest& dynamic_update_slice_request =
-          request.request().dynamic_update_slice_request();
-      HloInstruction* operand =
-          lookup_instruction(dynamic_update_slice_request.operand());
-      HloInstruction* update =
-          lookup_instruction(dynamic_update_slice_request.update());
-      HloInstruction* start_indices =
-          lookup_instruction(dynamic_update_slice_request.start_indices());
-      hlo_instruction =
-          add_instruction(HloInstruction::CreateDynamicUpdateSlice(
-              request.output_shape(), operand, update, start_indices));
-      break;
-    }
-
-    case OpRequest::kConcatenateRequest: {
-      const ConcatenateRequest& concatenate_request =
-          request.request().concatenate_request();
-      std::vector<HloInstruction*> operands;
-      for (const ComputationDataHandle& handle :
-           concatenate_request.operands()) {
-        HloInstruction* operand = lookup_instruction(handle);
-        operands.push_back(operand);
-      }
-      hlo_instruction = add_instruction(HloInstruction::CreateConcatenate(
-          request.output_shape(), operands, concatenate_request.dimension()));
-      break;
-    }
-
-    case OpRequest::kConvolveRequest: {
-      const ConvolveRequest& convolve_request =
-          request.request().convolve_request();
-      HloInstruction* lhs = lookup_instruction(convolve_request.lhs());
-      HloInstruction* rhs = lookup_instruction(convolve_request.rhs());
-      hlo_instruction = add_instruction(HloInstruction::CreateConvolve(
-          request.output_shape(), lhs, rhs, convolve_request.window(),
-          convolve_request.dimension_numbers()));
-      break;
-    }
-
-    case OpRequest::kFftRequest: {
-      const FftRequest& fft_request = request.request().fft_request();
-      HloInstruction* operand = lookup_instruction(fft_request.operand());
-      hlo_instruction = add_instruction(HloInstruction::CreateFft(
-          request.output_shape(), operand, fft_request.fft_type(),
-          AsInt64Slice(fft_request.fft_length())));
-      break;
-    }
-
-    case OpRequest::kDotRequest: {
-      const DotRequest& dot_request = request.request().dot_request();
-      HloInstruction* lhs = lookup_instruction(dot_request.lhs());
-      HloInstruction* rhs = lookup_instruction(dot_request.rhs());
-      hlo_instruction = add_instruction(HloInstruction::CreateDot(
-          request.output_shape(), lhs, rhs, dot_request.dimension_numbers()));
-      break;
-    }
-
-    case OpRequest::kCrossReplicaSumRequest: {
-      const CrossReplicaSumRequest& cross_replica_sum_request =
-          request.request().cross_replica_sum_request();
-      HloInstruction* operand =
-          lookup_instruction(cross_replica_sum_request.operand());
-      hlo_instruction = add_instruction(HloInstruction::CreateCrossReplicaSum(
-          request.output_shape(), {operand}));
-      break;
-    }
-
-    case OpRequest::kInfeedRequest: {
-      const InfeedRequest& infeed_request = request.request().infeed_request();
-      hlo_instruction = add_instruction(HloInstruction::CreateInfeed(
-          request.output_shape(), infeed_request.config()));
-      break;
-    }
-
-    case OpRequest::kOutfeedRequest: {
-      const OutfeedRequest& outfeed_request =
-          request.request().outfeed_request();
-      HloInstruction* operand = lookup_instruction(outfeed_request.operand());
-      hlo_instruction = add_instruction(HloInstruction::CreateOutfeed(
-          outfeed_request.shape(), operand, outfeed_request.outfeed_config()));
-      break;
-    }
-
-    case OpRequest::kMapRequest: {
-      const MapRequest& map_request = request.request().map_request();
-      std::vector<HloInstruction*> operands;
-      for (const ComputationDataHandle& handle : map_request.operands()) {
-        HloInstruction* operand = lookup_instruction(handle);
-        operands.push_back(operand);
-      }
-      CHECK_EQ(1, request.embedded_computation_versions_size());
-      VersionedComputationHandle::Version map_version =
-          request.embedded_computation_versions(0);
-      HloComputation* map_computation =
-          ResolveComputation(map_request.to_apply(), map_version);
-      hlo_instruction = add_instruction(HloInstruction::CreateMap(
-          request.output_shape(), operands, map_computation));
-      break;
-    }
-
-    case OpRequest::kReduceRequest: {
-      const ReduceRequest& reduce_request = request.request().reduce_request();
-      HloInstruction* operand = lookup_instruction(reduce_request.operand());
-      HloInstruction* init_value =
-          lookup_instruction(reduce_request.init_value());
-      CHECK_EQ(1, request.embedded_computation_versions_size());
-      VersionedComputationHandle::Version reduce_version =
-          request.embedded_computation_versions(0);
-      HloComputation* reduce_computation =
-          ResolveComputation(reduce_request.to_apply(), reduce_version);
-      hlo_instruction = add_instruction(HloInstruction::CreateReduce(
-          request.output_shape(), operand, init_value,
-          AsInt64Slice(reduce_request.dimensions()), reduce_computation));
-      break;
-    }
-
-    case OpRequest::kReduceWindowRequest: {
-      const ReduceWindowRequest& reduce_window_request =
-          request.request().reduce_window_request();
-      HloInstruction* operand =
-          lookup_instruction(reduce_window_request.operand());
-      HloInstruction* init_value =
-          lookup_instruction(reduce_window_request.init_value());
-      CHECK_EQ(1, request.embedded_computation_versions_size());
-      VersionedComputationHandle::Version reduce_window_version =
-          request.embedded_computation_versions(0);
-      HloComputation* reduce_window_computation = ResolveComputation(
-          reduce_window_request.to_apply(), reduce_window_version);
-      hlo_instruction = add_instruction(HloInstruction::CreateReduceWindow(
-          request.output_shape(), operand, init_value,
-          reduce_window_request.window(), reduce_window_computation));
-      break;
-    }
-
-    case OpRequest::kSelectAndScatterRequest: {
-      const SelectAndScatterRequest& select_and_scatter_request =
-          request.request().select_and_scatter_request();
-      HloInstruction* operand =
-          lookup_instruction(select_and_scatter_request.operand());
-      HloInstruction* source =
-          lookup_instruction(select_and_scatter_request.source());
-      HloInstruction* init_value =
-          lookup_instruction(select_and_scatter_request.init_value());
-      CHECK_EQ(2, request.embedded_computation_versions_size());
-      VersionedComputationHandle::Version select_version =
-          request.embedded_computation_versions(0);
-      VersionedComputationHandle::Version scatter_version =
-          request.embedded_computation_versions(1);
-      HloComputation* select_computation = ResolveComputation(
-          select_and_scatter_request.select(), select_version);
-      HloComputation* scatter_computation = ResolveComputation(
-          select_and_scatter_request.scatter(), scatter_version);
-      hlo_instruction = add_instruction(HloInstruction::CreateSelectAndScatter(
-          request.output_shape(), operand, select_computation,
-          select_and_scatter_request.window(), source, init_value,
-          scatter_computation));
-      break;
-    }
-
-    case OpRequest::kBatchNormTrainingRequest: {
-      const BatchNormTrainingRequest& batch_norm_training_request =
-          request.request().batch_norm_training_request();
-      HloInstruction* operand =
-          lookup_instruction(batch_norm_training_request.operand());
-      HloInstruction* scale =
-          lookup_instruction(batch_norm_training_request.scale());
-      HloInstruction* offset =
-          lookup_instruction(batch_norm_training_request.offset());
-
-      hlo_instruction = add_instruction(HloInstruction::CreateBatchNormTraining(
-          request.output_shape(), operand, scale, offset,
-          batch_norm_training_request.epsilon(),
-          batch_norm_training_request.feature_index()));
-      break;
-    }
-
-    case OpRequest::kBatchNormInferenceRequest: {
-      const BatchNormInferenceRequest& batch_norm_inference_request =
-          request.request().batch_norm_inference_request();
-      HloInstruction* operand =
-          lookup_instruction(batch_norm_inference_request.operand());
-      HloInstruction* scale =
-          lookup_instruction(batch_norm_inference_request.scale());
-      HloInstruction* offset =
-          lookup_instruction(batch_norm_inference_request.offset());
-      HloInstruction* mean =
-          lookup_instruction(batch_norm_inference_request.mean());
-      HloInstruction* variance =
-          lookup_instruction(batch_norm_inference_request.variance());
-
-      hlo_instruction =
-          add_instruction(HloInstruction::CreateBatchNormInference(
-              request.output_shape(), operand, scale, offset, mean, variance,
-              batch_norm_inference_request.epsilon(),
-              batch_norm_inference_request.feature_index()));
-      break;
-    }
-
-    case OpRequest::kBatchNormGradRequest: {
-      const BatchNormGradRequest& batch_norm_grad_request =
-          request.request().batch_norm_grad_request();
-
-      HloInstruction* operand =
-          lookup_instruction(batch_norm_grad_request.operand());
-      HloInstruction* scale =
-          lookup_instruction(batch_norm_grad_request.scale());
-      HloInstruction* mean = lookup_instruction(batch_norm_grad_request.mean());
-      HloInstruction* variance =
-          lookup_instruction(batch_norm_grad_request.variance());
-      HloInstruction* grad_output =
-          lookup_instruction(batch_norm_grad_request.grad_output());
-
-      hlo_instruction = add_instruction(HloInstruction::CreateBatchNormGrad(
-          request.output_shape(), operand, scale, mean, variance, grad_output,
-          batch_norm_grad_request.epsilon(),
-          batch_norm_grad_request.feature_index()));
-      break;
-    }
-
-    case OpRequest::kBroadcastRequest: {
-      const BroadcastRequest& broadcast_request =
-          request.request().broadcast_request();
-      HloInstruction* operand = lookup_instruction(broadcast_request.operand());
-      std::vector<int64> broadcast_dimensions;
-      // The client-level broadcast instruction just appends dimensions on the
-      // left (adds lowest numbered dimensions). The HLO broadcast op is more
-      // flexible and can add new dimensions anywhere. The broadcast_dimensions
-      // maps operand dimensions to dimensions in the broadcast output, so
-      // to append dimensions on the left the broadcast_dimensions should just
-      // be the n highest dimension numbers of the output shape where n is
-      // the number of input dimensions.
-      broadcast_dimensions.reserve(ShapeUtil::Rank(operand->shape()));
-      for (int i = 0; i < ShapeUtil::Rank(operand->shape()); ++i) {
-        broadcast_dimensions.push_back(i +
-                                       ShapeUtil::Rank(request.output_shape()) -
-                                       ShapeUtil::Rank(operand->shape()));
-      }
-      hlo_instruction = add_instruction(HloInstruction::CreateBroadcast(
-          request.output_shape(), operand, broadcast_dimensions));
-      break;
-    }
-
-    case OpRequest::kReshapeRequest: {
-      const ReshapeRequest& reshape_request =
-          request.request().reshape_request();
-      HloInstruction* operand = lookup_instruction(reshape_request.operand());
-      HloInstruction* transposed;
-      if (IsIdentityPermutation(AsInt64Slice(reshape_request.dimensions()))) {
-        transposed = operand;
-      } else {
-        transposed = add_instruction(HloInstruction::CreateTranspose(
-            ShapeUtil::PermuteDimensions(
-                InversePermutation(AsInt64Slice(reshape_request.dimensions())),
-                operand->shape()),
-            operand, AsInt64Slice(reshape_request.dimensions())));
-      }
-      hlo_instruction = add_instruction(
-          HloInstruction::CreateReshape(request.output_shape(), transposed));
-      break;
-    }
-
-    case OpRequest::kTransposeRequest: {
-      const TransposeRequest& transpose_request =
-          request.request().transpose_request();
-      HloInstruction* operand = lookup_instruction(transpose_request.operand());
-      hlo_instruction = add_instruction(HloInstruction::CreateTranspose(
-          ShapeUtil::PermuteDimensions(
-              InversePermutation(AsInt64Slice(transpose_request.dimensions())),
-              operand->shape()),
-          operand, AsInt64Slice(transpose_request.dimensions())));
-      break;
-    }
-
-    case OpRequest::kReverseRequest: {
-      const ReverseRequest& reverse_request =
-          request.request().reverse_request();
-      HloInstruction* operand = lookup_instruction(reverse_request.operand());
-      hlo_instruction = add_instruction(HloInstruction::CreateReverse(
-          request.output_shape(), operand,
-          AsInt64Slice(reverse_request.dimensions())));
-      break;
-    }
-
-    case OpRequest::kPadRequest: {
-      const PadRequest& pad_request = request.request().pad_request();
-      HloInstruction* operand = lookup_instruction(pad_request.operand());
-      HloInstruction* padding_value =
-          lookup_instruction(pad_request.padding_value());
-      hlo_instruction = add_instruction(HloInstruction::CreatePad(
-          request.output_shape(), operand, padding_value,
-          pad_request.padding_config()));
-      break;
-    }
-
-    case OpRequest::kRecvRequest: {
-      const RecvRequest& recv_request = request.request().recv_request();
-      HloInstruction* recv = add_instruction(HloInstruction::CreateRecv(
-          request.output_shape(), recv_request.channel_handle().handle()));
-      hlo_instruction = add_instruction(HloInstruction::CreateRecvDone(recv));
-      break;
-    }
-
-    case OpRequest::kParameterRequest: {
-      const ParameterRequest& parameter_request =
-          request.request().parameter_request();
-      hlo_instruction = add_instruction(HloInstruction::CreateParameter(
-          parameter_request.parameter(), request.output_shape(),
-          parameter_request.name()));
-      break;
-    }
-
-    case OpRequest::kConvertRequest: {
-      const ConvertRequest& convert_request =
-          request.request().convert_request();
-      HloInstruction* operand = lookup_instruction(convert_request.operand());
-      hlo_instruction = add_instruction(
-          HloInstruction::CreateConvert(request.output_shape(), operand));
-      break;
-    }
-
-    case OpRequest::kBitcastConvertRequest: {
-      const ConvertRequest& convert_request =
-          request.request().bitcast_convert_request();
-      HloInstruction* operand = lookup_instruction(convert_request.operand());
-      hlo_instruction = add_instruction(HloInstruction::CreateBitcastConvert(
-          request.output_shape(), operand));
-      break;
-    }
-
-    case OpRequest::kWhileRequest: {
-      const WhileRequest& while_request = request.request().while_request();
-      CHECK_EQ(2, request.embedded_computation_versions_size());
-      VersionedComputationHandle::Version condition_version =
-          request.embedded_computation_versions(0);
-      HloComputation* condition =
-          ResolveComputation(while_request.condition(), condition_version);
-      VersionedComputationHandle::Version body_version =
-          request.embedded_computation_versions(1);
-      HloComputation* body =
-          ResolveComputation(while_request.body(), body_version);
-      HloInstruction* init = lookup_instruction(while_request.init());
-      hlo_instruction = add_instruction(HloInstruction::CreateWhile(
-          request.output_shape(), condition, body, init));
-      break;
-    }
-
-    case OpRequest::kConditionalRequest: {
-      const ConditionalRequest& conditional_request =
-          request.request().conditional_request();
-      CHECK_EQ(2, request.embedded_computation_versions_size());
-      VersionedComputationHandle::Version true_computation_version =
-          request.embedded_computation_versions(0);
-      HloComputation* true_computation = ResolveComputation(
-          conditional_request.true_computation(), true_computation_version);
-      VersionedComputationHandle::Version false_computation_version =
-          request.embedded_computation_versions(1);
-      HloComputation* false_computation = ResolveComputation(
-          conditional_request.false_computation(), false_computation_version);
-      HloInstruction* predicate =
-          lookup_instruction(conditional_request.predicate());
-      HloInstruction* true_operand =
-          lookup_instruction(conditional_request.true_operand());
-      HloInstruction* false_operand =
-          lookup_instruction(conditional_request.false_operand());
-      hlo_instruction = add_instruction(HloInstruction::CreateConditional(
-          request.output_shape(), predicate, true_operand, true_computation,
-          false_operand, false_computation));
-      break;
-    }
-
-    case OpRequest::kTernaryOpRequest: {
-      const TernaryOpRequest& ternary_op_request =
-          request.request().ternary_op_request();
-      HloInstruction* lhs = lookup_instruction(ternary_op_request.lhs());
-      HloInstruction* rhs = lookup_instruction(ternary_op_request.rhs());
-      HloInstruction* ehs = lookup_instruction(ternary_op_request.ehs());
-      auto hlo_opcode = TernaryOperationToHloOpcode(ternary_op_request.triop());
-      if (debug_options_.xla_eliminate_hlo_implicit_broadcast() &&
-          !ShapeUtil::IsTuple(request.output_shape())) {
-        if (!ShapeUtil::IsTuple(lhs->shape()) &&
-            !ShapeUtil::SameDimensions(request.output_shape(), lhs->shape())) {
-          // lhs side is being implicitly broadcast. Change to explicit.
-          lhs =
-              ImplicitBroadcastToExplicitBroadcast(lhs, request.output_shape());
-        }
-
-        if (!ShapeUtil::IsTuple(rhs->shape()) &&
-            !ShapeUtil::SameDimensions(request.output_shape(), rhs->shape())) {
-          rhs =
-              ImplicitBroadcastToExplicitBroadcast(rhs, request.output_shape());
-        }
-
-        if (!ShapeUtil::IsTuple(ehs->shape()) &&
-            !ShapeUtil::SameDimensions(request.output_shape(), ehs->shape())) {
-          ehs =
-              ImplicitBroadcastToExplicitBroadcast(ehs, request.output_shape());
-        }
-      }
-
-      hlo_instruction = add_instruction(HloInstruction::CreateTernary(
-          request.output_shape(), hlo_opcode, lhs, rhs, ehs));
-      break;
-    }
-
-    case OpRequest::kVariadicOpRequest: {
-      const VariadicOpRequest& variadic_op_request =
-          request.request().variadic_op_request();
-      std::vector<HloInstruction*> operands;
-      for (const ComputationDataHandle& handle :
-           variadic_op_request.operands()) {
-        HloInstruction* operand = lookup_instruction(handle);
-        operands.push_back(operand);
-      }
-      auto hlo_opcode =
-          VariadicOperationToHloOpcode(variadic_op_request.varop());
-      hlo_instruction = add_instruction(HloInstruction::CreateVariadic(
-          request.output_shape(), hlo_opcode, operands));
-      break;
-    }
-
-    case OpRequest::kCallRequest: {
-      const CallRequest& call_request = request.request().call_request();
-      std::vector<HloInstruction*> operands;
-      for (const ComputationDataHandle& handle : call_request.operands()) {
-        operands.push_back(lookup_instruction(handle));
-      }
-      CHECK_EQ(1, request.embedded_computation_versions_size());
-      VersionedComputationHandle::Version call_version =
-          request.embedded_computation_versions(0);
-      HloComputation* call_computation =
-          ResolveComputation(call_request.to_apply(), call_version);
-      hlo_instruction = add_instruction(HloInstruction::CreateCall(
-          request.output_shape(), operands, call_computation));
-      break;
-    }
-
-    case OpRequest::kCustomCallRequest: {
-      const CustomCallRequest& cc_request =
-          request.request().custom_call_request();
-      std::vector<HloInstruction*> operands;
-      for (const ComputationDataHandle& operand : cc_request.operands()) {
-        operands.push_back(lookup_instruction(operand));
-      }
-      hlo_instruction = add_instruction(HloInstruction::CreateCustomCall(
-          cc_request.shape(), operands, cc_request.call_target_name()));
-      break;
-    }
-
-    case OpRequest::kHostComputeRequest: {
-      const HostComputeRequest& host_compute_request =
-          request.request().host_compute_request();
-      std::vector<HloInstruction*> operands;
-      for (const ComputationDataHandle& operand :
-           host_compute_request.operands()) {
-        operands.push_back(lookup_instruction(operand));
-      }
-      auto output_shape = host_compute_request.shape();
-      auto channel_name = host_compute_request.channel_name();
-      auto cost_estimate_ns = host_compute_request.cost_estimate_ns();
-      hlo_instruction = add_instruction(HloInstruction::CreateHostCompute(
-          output_shape, operands, channel_name, cost_estimate_ns));
-      break;
-    }
-
-    case OpRequest::kUnaryOpRequest: {
-      const UnaryOpRequest& unary_op_request =
-          request.request().unary_op_request();
-      HloInstruction* operand = lookup_instruction(unary_op_request.operand());
-      auto hlo_opcode = UnaryOperationToHloOpcode(unary_op_request.unop());
-      hlo_instruction = add_instruction(HloInstruction::CreateUnary(
-          request.output_shape(), hlo_opcode, operand));
-      break;
-    }
-
-    case OpRequest::kBinaryOpRequest: {
-      const BinaryOpRequest& binary_op_request =
-          request.request().binary_op_request();
-      HloInstruction* lhs = lookup_instruction(binary_op_request.lhs());
-      HloInstruction* rhs = lookup_instruction(binary_op_request.rhs());
-      auto hlo_opcode = BinaryOperationToHloOpcode(binary_op_request.binop());
-      if (binary_op_request.broadcast_dimensions_size() > 0 &&
-          ShapeUtil::Rank(lhs->shape()) != ShapeUtil::Rank(rhs->shape())) {
-        // Emit a broadcast instruction to perform the "broadcast in dimension"
-        // operation.
-        HloInstruction* operand_to_broadcast =
-            ShapeUtil::Rank(lhs->shape()) < ShapeUtil::Rank(rhs->shape()) ? lhs
-                                                                          : rhs;
-        CHECK_EQ(ShapeUtil::Rank(operand_to_broadcast->shape()),
-                 binary_op_request.broadcast_dimensions().size());
-
-        // Construct the bounds of the shape of the kBroadcast instruction
-        // responsible for the in-dimension broadcast.
-        std::vector<int64> output_dimensions;
-        for (int64 size : request.output_shape().dimensions()) {
-          output_dimensions.push_back(size);
-        }
-        for (int64 operand_dim = 0;
-             operand_dim < ShapeUtil::Rank(operand_to_broadcast->shape());
-             ++operand_dim) {
-          int64 output_dim =
-              binary_op_request.broadcast_dimensions()[operand_dim];
-          output_dimensions[output_dim] =
-              operand_to_broadcast->shape().dimensions(operand_dim);
-        }
-
-        Shape broadcast_shape = ShapeUtil::MakeShape(
-            operand_to_broadcast->shape().element_type(), output_dimensions);
-
-        // The broadcast semantics of a client-level binary op broadcast is
-        // identical to the HLO broadcast semantics so the broadcast_dimensions
-        // field can just be passed to the instruction builder.
-        HloInstruction* broadcasted_operand =
-            add_instruction(HloInstruction::CreateBroadcast(
-                broadcast_shape, operand_to_broadcast,
-                AsInt64Slice(binary_op_request.broadcast_dimensions())));
-
-        lhs = (lhs == operand_to_broadcast) ? broadcasted_operand : lhs;
-        rhs = (rhs == operand_to_broadcast) ? broadcasted_operand : rhs;
-      }
-      if (debug_options_.xla_eliminate_hlo_implicit_broadcast()) {
-        if (!ShapeUtil::SameDimensions(request.output_shape(), lhs->shape())) {
-          // lhs side is being implicitly broadcast. Change to explicit.
-          lhs =
-              ImplicitBroadcastToExplicitBroadcast(lhs, request.output_shape());
-        }
-
-        if (!ShapeUtil::SameDimensions(request.output_shape(), rhs->shape())) {
-          rhs =
-              ImplicitBroadcastToExplicitBroadcast(rhs, request.output_shape());
-        }
-      }
-      hlo_instruction = add_instruction(HloInstruction::CreateBinary(
-          request.output_shape(), hlo_opcode, lhs, rhs));
-      break;
-    }
-
-    case OpRequest::kReducePrecisionRequest: {
-      const ReducePrecisionRequest& reduce_precision_request =
-          request.request().reduce_precision_request();
-      HloInstruction* operand =
-          lookup_instruction(reduce_precision_request.operand());
-      auto exponent_bits = reduce_precision_request.exponent_bits();
-      auto mantissa_bits = reduce_precision_request.mantissa_bits();
-      hlo_instruction = add_instruction(HloInstruction::CreateReducePrecision(
-          request.output_shape(), operand, exponent_bits, mantissa_bits));
-      break;
-    }
-
-    case OpRequest::kTraceRequest: {
-      const TraceRequest& trace_request = request.request().trace_request();
-      HloInstruction* operand = lookup_instruction(trace_request.operand());
-      hlo_instruction = add_instruction(
-          HloInstruction::CreateTrace(trace_request.tag(), operand));
-      break;
-    }
-
-    case OpRequest::kSendRequest: {
-      const SendRequest& send_request = request.request().send_request();
-      HloInstruction* operand = lookup_instruction(send_request.operand());
-      HloInstruction* send = add_instruction(HloInstruction::CreateSend(
-          operand, send_request.channel_handle().handle()));
-      hlo_instruction = add_instruction(HloInstruction::CreateSendDone(send));
-      break;
-    }
-
-    case OpRequest::kGatherRequest: {
-      const GatherRequest& gather_request = request.request().gather_request();
-      HloInstruction* input_operand =
-          lookup_instruction(gather_request.input());
-      HloInstruction* gather_indices_operand =
-          lookup_instruction(gather_request.gather_indices());
-      std::vector<int64> window_bounds;
-      c_copy(gather_request.window_bounds(), std::back_inserter(window_bounds));
-      hlo_instruction = add_instruction(HloInstruction::CreateGather(
-          request.output_shape(), input_operand, gather_indices_operand,
-          gather_request.dimension_numbers(), window_bounds));
-      break;
-    }
-
-    case OpRequest::OP_NOT_SET:
-      LOG(FATAL) << "OperationRequest doesn't contain a request";
-
-    default:
-      LOG(FATAL) << "Unexpected request type: " << request.request().op_case();
-  }
-  (*instructions)[handle.handle()] = hlo_instruction;
-}  // NOLINT(readability/fn_size)
-
-}  // namespace
-
-StatusOr<std::unique_ptr<HloComputation>> UserComputation::BuildHloComputation(
-    VersionedComputationHandle::Version version,
-    HloComputationResolver hlo_resolver, const DebugOptions& debug_options,
-    bool include_unreachable_instructions) const {
-  tensorflow::mutex_lock lock(mutex_);
-
-  VLOG(2) << "Building HloComputation from UserComputation " << name_
-          << " at version " << version;
-  XLA_VLOG_LINES(3, session_computation_.DebugString());
-
-  TF_ASSIGN_OR_RETURN(
-      std::unique_ptr<HloComputation> hlo_computation,
-      ComputationLowerer::Lower(
-          tensorflow::strings::StrCat(name(), ".v", version),
-          session_computation_, version, std::move(hlo_resolver), debug_options,
-          include_unreachable_instructions));
-
-  return std::move(hlo_computation);
-}
-
-}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/user_computation.h b/tensorflow/compiler/xla/service/user_computation.h
deleted file mode 100644
index 5544c868fe..0000000000
--- a/tensorflow/compiler/xla/service/user_computation.h
+++ /dev/null
@@ -1,413 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_USER_COMPUTATION_H_
-#define TENSORFLOW_COMPILER_XLA_SERVICE_USER_COMPUTATION_H_
-
-#include <functional>
-#include <map>
-#include <memory>
-#include <string>
-#include <vector>
-
-#include "tensorflow/compiler/xla/service/hlo_computation.h"
-#include "tensorflow/compiler/xla/service/session.pb.h"
-#include "tensorflow/compiler/xla/service/versioned_computation_handle.h"
-#include "tensorflow/compiler/xla/statusor.h"
-#include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/compiler/xla/xla.pb.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
-#include "tensorflow/core/platform/macros.h"
-#include "tensorflow/core/platform/mutex.h"
-#include "tensorflow/core/platform/thread_annotations.h"
-#include "tensorflow/core/platform/types.h"
-
-namespace xla {
-
-// A UserComputation is the built-up computation that users create via the
-// XLA Service interface.
-//
-// The XLA service adds instructions to a user computation via this
-// interface. The state of the computation is stored as a SessionComputation
-// proto which holds a record of all operation-building requests received by the
-// XLA service.
-//
-// UserComputations are lowered to HloComputations which are passed to the high
-// level compiler interface.
-class UserComputation {
- public:
-  // Factory used when restoring a computation from serialized session
-  // computation (computation snapshot) data. Remaps any references to
-  // computation handle via the old_to_new mapping.
-  //
-  // An error will occur if the old_to_new mapping cannot resolve a reference to
-  // a computation that is present in session_computation.
-  static StatusOr<std::unique_ptr<UserComputation>> MakeWithRemapping(
-      const SessionComputation& session_computation,
-      const ComputationHandle& handle,
-      const std::map<int64, ComputationHandle>& old_to_new);
-
-  // Creates an empty computation with the given name and computation handle.
-  explicit UserComputation(const string& name, const ComputationHandle& handle);
-
-  // Enqueues a parameter-retrieving instruction onto this user computation.
-  // Returns an error status if the parameter number is already registered with
-  // different values.
-  StatusOr<ComputationDataHandle> AddParameterInstruction(
-      const ParameterRequest& parameter_request);
-
-  // Enqueues a pad instruction onto this user computation.
-  StatusOr<ComputationDataHandle> AddPadInstruction(
-      const PadRequest& pad_request);
-
-  // Enqueues a tracing instruction onto this user computation.
-  // Returns an error status if the operand cannot be resolved.
-  Status AddTraceInstruction(const TraceRequest& trace_request);
-
-  // Enqueues a random number generation instruction onto this user computation.
-  StatusOr<ComputationDataHandle> AddRngInstruction(
-      const RngRequest& rng_request);
-
-  // Enqueues a unary instruction onto this user computation.
-  // Returns an error status if the operand index is out of bounds.
-  StatusOr<ComputationDataHandle> AddUnaryInstruction(
-      const UnaryOpRequest& unary_request);
-
-  // Enqueues a batch norm training instruction onto this user computation.
-  StatusOr<ComputationDataHandle> AddBatchNormTrainingInstruction(
-      const BatchNormTrainingRequest& batch_norm_training_request);
-
-  // Enqueues a batch norm inference instruction onto this user computation.
-  StatusOr<ComputationDataHandle> AddBatchNormInferenceInstruction(
-      const BatchNormInferenceRequest& batch_norm_inference_request);
-
-  // Enqueues a batch norm grad instruction onto this user computation.
-  StatusOr<ComputationDataHandle> AddBatchNormGradInstruction(
-      const BatchNormGradRequest& batch_norm_grad_request);
-
-  // Enqueues a binary instruction onto this user computation.
-  // Returns an error status if the operand indices are out of bounds.
-  StatusOr<ComputationDataHandle> AddBinaryInstruction(
-      const BinaryOpRequest& binary_request);
-
-  // Enqueues a ternary instruction onto this user computation.
-  // Returns an error status if the operand indices are out of bounds.
-  StatusOr<ComputationDataHandle> AddTernaryInstruction(
-      const TernaryOpRequest& ternary_request);
-
-  // Enqueues a variadic instruction onto this user computation.
-  // Returns an error status if the operand indices are out of bounds.
-  StatusOr<ComputationDataHandle> AddVariadicInstruction(
-      const VariadicOpRequest& variadic_request);
-
-  // Enqueues a constant instruction onto this user computation.
-  StatusOr<ComputationDataHandle> AddConstantInstruction(
-      const ConstantRequest& constant_request);
-
-  // Enqueues a get tuple element instruction onto this user computation.
-  StatusOr<ComputationDataHandle> AddGetTupleElementInstruction(
-      const GetTupleElementRequest& get_tuple_element_request);
-
-  // Enqueues a map instruction onto this user computation.
-  StatusOr<ComputationDataHandle> AddMapInstruction(
-      const MapRequest& map_request,
-      const UserComputation& to_apply_computation);
-
-  // Enqueues a reduce-precision instruction onto this user computation.
-  StatusOr<ComputationDataHandle> AddReducePrecisionInstruction(
-      const ReducePrecisionRequest& reduce_precision_request);
-
-  // Enqueues a convolution instruction onto this user computation.
-  StatusOr<ComputationDataHandle> AddConvolveInstruction(
-      const ConvolveRequest& convolve_request);
-
-  // Enqueues an FFT instruction onto this user computation.
-  StatusOr<ComputationDataHandle> AddFftInstruction(
-      const FftRequest& fft_request);
-
-  // Enqueues a cross replica sum instruction onto this user computation.
-  StatusOr<ComputationDataHandle> AddCrossReplicaSumInstruction(
-      const CrossReplicaSumRequest& cross_replica_sum_request);
-
-  // Enqueues an infeed instruction onto this user computation.
-  StatusOr<ComputationDataHandle> AddInfeedInstruction(
-      const InfeedRequest& infeed_request);
-
-  // Enqueues an outfeed instruction onto this user computation.
-  StatusOr<ComputationDataHandle> AddOutfeedInstruction(
-      const OutfeedRequest& outfeed_request);
-
-  // Enqueues a host compute instruction onto this user computation.
-  StatusOr<ComputationDataHandle> AddHostComputeInstruction(
-      const HostComputeRequest& host_compute_request);
-
-  // Enqueues a call instruction onto this user computation.
-  StatusOr<ComputationDataHandle> AddCallInstruction(
-      const CallRequest& call_request,
-      const UserComputation& to_apply_computation);
-
-  // Enqueues a custom call instruction onto this user computation.
-  StatusOr<ComputationDataHandle> AddCustomCallInstruction(
-      const CustomCallRequest& custom_call_request);
-
-  // Enqueues a dot instruction onto this user computation.
-  StatusOr<ComputationDataHandle> AddDotInstruction(
-      const DotRequest& dot_request);
-
-  // Enqueues a broadcast instruction onto this user computation.
-  StatusOr<ComputationDataHandle> AddBroadcastInstruction(
-      const BroadcastRequest& broadcast_request);
-
-  // Enqueues a reshape instruction onto this user computation.
-  StatusOr<ComputationDataHandle> AddReshapeInstruction(
-      const ReshapeRequest& reshape_request);
-
-  // Enqueues a transpose instruction onto this user computation.
-  StatusOr<ComputationDataHandle> AddTransposeInstruction(
-      const TransposeRequest& transpose_request);
-
-  // Enqueues a slice instruction onto this user computation.
-  StatusOr<ComputationDataHandle> AddSliceInstruction(
-      const SliceRequest& slice_request);
-
-  // Enqueues a dynamic slice instruction onto this user computation.
-  StatusOr<ComputationDataHandle> AddDynamicSliceInstruction(
-      const DynamicSliceRequest& dynamic_slice_request);
-
-  // Enqueues a dynamic update slice instruction onto this user computation.
-  StatusOr<ComputationDataHandle> AddDynamicUpdateSliceInstruction(
-      const DynamicUpdateSliceRequest& dynamic_update_slice_request);
-
-  // Enqueues a concatenate instruction onto this user computation.
-  StatusOr<ComputationDataHandle> AddConcatenateInstruction(
-      const ConcatenateRequest& concatenate_request);
-
-  // Enqueues a convert instruction onto this user computation.
-  StatusOr<ComputationDataHandle> AddConvertInstruction(
-      const ConvertRequest& convert_request);
-
-  // Enqueues a bitcast element instruction onto this user computation.
-  StatusOr<ComputationDataHandle> AddBitcastConvertInstruction(
-      const ConvertRequest& convert_request);
-
-  // Enqueues a reduce instruction onto this user computation.
-  StatusOr<ComputationDataHandle> AddReduceInstruction(
-      const ReduceRequest& reduce_request,
-      const UserComputation& to_apply_computation);
-
-  // Enqueues a windowed reduce instruction onto this user computation.
-  StatusOr<ComputationDataHandle> AddReduceWindowInstruction(
-      const ReduceWindowRequest& reduce_window_request,
-      const UserComputation& to_apply_computation);
-
-  // Enqueues a select-and-scatter instruction onto this user
-  // computation.
-  StatusOr<ComputationDataHandle> AddSelectAndScatterInstruction(
-      const SelectAndScatterRequest& select_and_scatter_request,
-      const UserComputation& select_computation,
-      const UserComputation& scatter_computation);
-
-  // Enqueues a reverse instruction onto this user computation.
-  StatusOr<ComputationDataHandle> AddReverseInstruction(
-      const ReverseRequest& reverse_request);
-
-  // Enqueues a while instruction onto this user computation.
-  StatusOr<ComputationDataHandle> AddWhileInstruction(
-      const WhileRequest& while_request,
-      const UserComputation& condition_computation,
-      const UserComputation& body_computation);
-
-  // Enqueues a conditional instruction on this user computation.
-  StatusOr<ComputationDataHandle> AddConditionalInstruction(
-      const ConditionalRequest& conditional_request,
-      const UserComputation& true_computation,
-      const UserComputation& false_computation);
-
-  // Enqueues a Send instruction onto this user computation.
-  StatusOr<ComputationDataHandle> AddSendInstruction(
-      const SendRequest& send_request);
-
-  // Enqueues a Recv instruction onto this user computation.
-  StatusOr<ComputationDataHandle> AddRecvInstruction(
-      const RecvRequest& recv_request);
-
-  // Enqueues a Gather instruction onto this user computation.
-  StatusOr<ComputationDataHandle> AddGatherInstruction(
-      const GatherRequest& gather_request);
-
-  // Returns the user-provided name of this user computation, which is provided
-  // via the XLA computation-building API.
-  const string& name() const { return name_; }
-
-  // Subsequent executions of this computation will compute the value
-  // represented by handle, rather than the last expression enqueued
-  // on the computation.
-  Status SetReturnValue(const ComputationDataHandle& handle);
-
-  // Return a versioned handle for this computation.
-  VersionedComputationHandle GetVersionedHandle() const;
-
-  // Return a versioned handle for this computation with a version equal to the
-  // point at which given operation was added to the computation.
-  VersionedComputationHandle GetVersionedHandleAtOperation(
-      const ComputationDataHandle& operation) const;
-
-  // Return a version value representing the current state of the
-  // computation.
-  VersionedComputationHandle::Version version() const;
-
-  // Computes and returns the program shape for the user computation -- gathers
-  // parameters and result type into a single proto. A shared_ptr is used
-  // because the returned pointer refers to an internally cached value which may
-  // be discarded by the UserComputation object. This avoid unnecessary copies.
-  //
-  // If the parameter space is not dense (i.e. there are holes in the parameter
-  // numbers provided) then an error status is returned.
-  StatusOr<std::shared_ptr<const ProgramShape>> ComputeProgramShape(
-      VersionedComputationHandle::Version version) const;
-
-  // Returns true if the given data handle does not depend on any parameter with
-  // index higher then num_parameters. That is, the value can be computed at
-  // compile time if we know the first num_parameters arguments.
-  StatusOr<bool> IsConstant(const ComputationDataHandle& handle,
-                            int64 num_parameters);
-
-  // Returns the output shape of the operation indicated by the given handle.
-  StatusOr<Shape> GetShape(const ComputationDataHandle& handle);
-
-  // Sets metadata on the Hlo instruction referenced by the given handle.
-  Status SetOpMetadata(const ComputationDataHandle& handle,
-                       const OpMetadata& metadata);
-
-  // Sets the device assignment on the Hlo instruction referenced by 'handle'.
-  Status SetOpSharding(const ComputationDataHandle& handle,
-                       const OpSharding& sharding);
-
-  // Builds a HLO computation from the UserComputation. The parameter "resolver"
-  // is a function which returns a pointer to the HloComputation corresponding
-  // to the given ComputationHandle at the given version. The resolver is used
-  // for operations, such as map, which call other computations and need a
-  // pointer to the called HloComputation to construct the respective HLO
-  // instructions. If include_unreachable_instructions is true, then
-  // instructions which are not reachable from the root are lowered into
-  // HloInstructions.
-  using HloComputationResolver =
-      std::function<HloComputation*(const VersionedComputationHandle& handle)>;
-  StatusOr<std::unique_ptr<HloComputation>> BuildHloComputation(
-      VersionedComputationHandle::Version version,
-      HloComputationResolver hlo_resolver, const DebugOptions& debug_options,
-      bool include_unreachable_instructions = true) const;
-
-  // Return a vector containing the embedded computations used by this
-  // UserComputation. Only embedded computations which are called directly by
-  // this UserComputation are included. That is, the transitive closure of
-  // embedded computations is not included.
-  std::vector<VersionedComputationHandle> GetEmbeddedComputations(
-      VersionedComputationHandle::Version version) const;
-
-  // Returns the number of OperationRequest objects in this UserComputation.
-  // The 'version' of a computation is identical to the number of
-  // OperationRequests in the UserComputation.
-  int64 request_count(VersionedComputationHandle::Version version) const {
-    return version;
-  }
-
-  // Returns a copy of the internal session state for this computation -- this
-  // is useful for serializing the guts of a user computation, though references
-  // to other handles (e.g. referred-to computations) must be handled with care
-  // in the serialization / de-serialization process.
-  SessionComputation CloneSessionComputation(
-      VersionedComputationHandle::Version version) const;
-
-  // Warning: typically we don't want to look up computation data handles until
-  // the computation is finished being built, for consistency purposes. We
-  // expose this routine for error reporting purposes so that we can provide
-  // more meaningful error messages from the XLA service layer.
-  //
-  // Returns the operation request that the handle comes from.
-  StatusOr<const OperationRequest*> LookUpRequestForErrorReporting(
-      const ComputationDataHandle& handle) const;
-
-  // Retrieves the parameter metadata for the given parameter number.
-  //
-  // If the parameter number is invalid for this computation, nullopt is
-  // returned. When the return value has_value(), nullptr will never be
-  // the held value.
-  tensorflow::gtl::optional<const OpMetadata*> ParameterMetadata(
-      int parameter_number) const;
-
- private:
-  // Warning: dangerous mutating operation that doesn't respect versioning.
-  // This is only used at initialization time when constructing from a
-  // SessionComputation a la MakeWithRemapping.
-  //
-  // Remaps references to old computations (with handle values in the keys of
-  // old_to_new) to the computation handle given in the values. This is useful
-  // when loading computations from snapshots, to finish initialization, before
-  // the user computation is released into the wild.
-  Status RemapEmbeddedComputations(
-      const std::map<int64, ComputationHandle>& old_to_new)
-      EXCLUSIVE_LOCKS_REQUIRED(mutex_);
-
-  // Returns the OperationRequest corresponding to the given handle.
-  StatusOr<const OperationRequest*> LookUpRequest(
-      const ComputationDataHandle& handle) const
-      EXCLUSIVE_LOCKS_REQUIRED(mutex_);
-
-  // Creates a new ComputationDataHandle with the next available handle value.
-  ComputationDataHandle CreateComputationDataHandle()
-      EXCLUSIVE_LOCKS_REQUIRED(mutex_);
-
-  // Checks whether the parameter numbers of the parameter operations are
-  // contiguous starting from zero. Returns appropriate error status if not.
-  Status CheckParametersAreContiguous(
-      VersionedComputationHandle::Version version) const
-      EXCLUSIVE_LOCKS_REQUIRED(mutex_);
-
-  VersionedComputationHandle GetVersionedHandleInternal() const
-      EXCLUSIVE_LOCKS_REQUIRED(mutex_);
-
-  // Name of the computation.
-  string name_;
-
-  mutable tensorflow::mutex mutex_;
-
-  // State of the computation as a record of all operation-building requests.
-  SessionComputation session_computation_ GUARDED_BY(mutex_);
-
-  // Mapping from parameter number to operation request containing the
-  // respective ParameterRequest.
-  std::map<int64, OperationRequest*> parameters_ GUARDED_BY(mutex_);
-
-  // The next ComputationDataHandle value to assign. Handle values are assigned
-  // sequentially.
-  int64 next_handle_value_ GUARDED_BY(mutex_);
-
-  // If handle_to_return_.has_handle() then an Execution of this Computation
-  // will compute the value represented by handle_to_return_, otherwise it will
-  // compute the value of (next_handle_value_ - 1).
-  ComputationDataHandle handle_to_return_ GUARDED_BY(mutex_);
-
-  // Memoized ProgramShape and its version. A shared_ptr is used because
-  // references to this object are returned by ComputeProgramShape.
-  mutable int64 program_shape_version_ GUARDED_BY(mutex_) = 0;
-  mutable std::shared_ptr<const ProgramShape> program_shape_ GUARDED_BY(mutex_);
-
-  TF_DISALLOW_COPY_AND_ASSIGN(UserComputation);
-};
-
-}  // namespace xla
-
-#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_USER_COMPUTATION_H_
diff --git a/tensorflow/compiler/xla/service/user_computation_test.cc b/tensorflow/compiler/xla/service/user_computation_test.cc
deleted file mode 100644
index 2fa163953f..0000000000
--- a/tensorflow/compiler/xla/service/user_computation_test.cc
+++ /dev/null
@@ -1,340 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/compiler/xla/service/user_computation.h"
-
-#include "tensorflow/compiler/xla/literal_util.h"
-#include "tensorflow/compiler/xla/service/hlo_computation.h"
-#include "tensorflow/compiler/xla/service/hlo_matchers.h"
-#include "tensorflow/compiler/xla/shape_util.h"
-#include "tensorflow/compiler/xla/status_macros.h"
-#include "tensorflow/compiler/xla/test.h"
-#include "tensorflow/compiler/xla/test_helpers.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
-#include "tensorflow/core/lib/core/status_test_util.h"
-
-namespace op = xla::testing::opcode_matchers;
-
-namespace xla {
-namespace {
-
-using UserComputationTest = ::testing::Test;
-
-TEST_F(UserComputationTest, SimpleComputation) {
-  const Shape kScalarShape = ShapeUtil::MakeShape(F32, {});
-  const Shape kVectorShape = ShapeUtil::MakeShape(F32, {2});
-
-  // Build a simple three operation computatation:
-  //
-  //   %constant = Constant({123, 42})
-  //   %param = Param(0)
-  //   %outfeed = Outfeed(%constant)
-  //
-  // Build the computation at two different versions and check invariants.
-  ComputationHandle handle;
-  handle.set_handle(123);
-  UserComputation computation("TheComputation", handle);
-
-  ConstantRequest constant_request;
-  *constant_request.mutable_literal() =
-      Literal::CreateR1<float>({123.0f, 42.0f})->ToProto();
-  TF_ASSERT_OK_AND_ASSIGN(ComputationDataHandle constant_handle,
-                          computation.AddConstantInstruction(constant_request));
-
-  ParameterRequest param_request;
-  *param_request.mutable_shape() = kScalarShape;
-  param_request.set_parameter(0);
-  param_request.set_name("param0");
-  TF_ASSERT_OK_AND_ASSIGN(ComputationDataHandle param_handle,
-                          computation.AddParameterInstruction(param_request));
-  OpMetadata metadata;
-  metadata.set_op_name("meta");
-  TF_ASSERT_OK(computation.SetOpMetadata(param_handle, metadata));
-
-  OutfeedRequest outfeed_request;
-  *outfeed_request.mutable_operand() = constant_handle;
-  *outfeed_request.mutable_shape() = kVectorShape;
-  outfeed_request.set_outfeed_config("abc");
-  TF_ASSERT_OK_AND_ASSIGN(ComputationDataHandle outfeed_handle,
-                          computation.AddOutfeedInstruction(outfeed_request));
-
-  auto hlo_resolver = [](const VersionedComputationHandle& handle) {
-    return nullptr;
-  };
-  {
-    // Test the computation at the latest version. In this case, the most
-    // recently added operation is an outfeed. However, the outfeed is not the
-    // root because outfeeds cannot be the root of a computation.
-    VersionedComputationHandle latest_version =
-        computation.GetVersionedHandle();
-
-    // Program shape should have a single scalar parameter and scalar
-    // result. The outfeed instruction should not affect the program shape.
-    TF_ASSERT_OK_AND_ASSIGN(
-        std::shared_ptr<const ProgramShape> program_shape,
-        computation.ComputeProgramShape(latest_version.version));
-    ASSERT_EQ(1, program_shape->parameters_size());
-    EXPECT_TRUE(
-        ShapeUtil::Compatible(kScalarShape, program_shape->parameters(0)));
-    EXPECT_TRUE(ShapeUtil::Compatible(kScalarShape, program_shape->result()));
-
-    // Build the HLO computation.
-    TF_ASSERT_OK_AND_ASSIGN(
-        std::unique_ptr<HloComputation> hlo_computation,
-        computation.BuildHloComputation(latest_version.version, hlo_resolver,
-                                        DebugOptions()));
-    // There should be one HloInstruction per UserComputation operation.
-    EXPECT_EQ(3, hlo_computation->instruction_count());
-    // The root of the instruction should be the parameter instruction (not the
-    // outfeed).
-    EXPECT_THAT(hlo_computation->root_instruction(), op::Parameter());
-  }
-
-  {
-    // Test the computation at the version right after the parameter instruction
-    // is added.
-    VersionedComputationHandle version_at_param =
-        computation.GetVersionedHandleAtOperation(param_handle);
-
-    // Program shape should have a single scalar parameter, and scalar result.
-    TF_ASSERT_OK_AND_ASSIGN(
-        std::shared_ptr<const ProgramShape> program_shape,
-        computation.ComputeProgramShape(version_at_param.version));
-    ASSERT_EQ(1, program_shape->parameters_size());
-    EXPECT_TRUE(
-        ShapeUtil::Compatible(kScalarShape, program_shape->parameters(0)));
-    EXPECT_TRUE(ShapeUtil::Compatible(kScalarShape, program_shape->result()));
-
-    // There should be two instructions, one for the constant and one for the
-    // parameter. The outfeed instruction should not be included.
-    TF_ASSERT_OK_AND_ASSIGN(
-        std::unique_ptr<HloComputation> hlo_computation,
-        computation.BuildHloComputation(version_at_param.version, hlo_resolver,
-                                        DebugOptions()));
-    EXPECT_EQ(2, hlo_computation->instruction_count());
-    EXPECT_THAT(hlo_computation->root_instruction(), op::Parameter());
-  }
-  {
-    // Test the computation at the latest version, but lowered with
-    // include_unreachable_instructions set to false.
-    VersionedComputationHandle latest_version =
-        computation.GetVersionedHandle();
-
-    // Build the HLO computation.
-    TF_ASSERT_OK_AND_ASSIGN(
-        std::unique_ptr<HloComputation> hlo_computation,
-        computation.BuildHloComputation(
-            latest_version.version, hlo_resolver, DebugOptions(),
-            /*include_unreachable_instructions=*/false));
-    // There is only one reachable instruction, the parameter.
-    EXPECT_EQ(1, hlo_computation->instruction_count());
-    // The root of the instruction should be the parameter instruction (not the
-    // outfeed).
-    EXPECT_THAT(hlo_computation->root_instruction(), op::Parameter());
-    EXPECT_EQ(hlo_computation->root_instruction()->metadata().op_name(),
-              "meta");
-  }
-}
-
-TEST_F(UserComputationTest, EliminateScalarBroadcast) {
-  auto debug_options = DebugOptions();
-  debug_options.set_xla_eliminate_hlo_implicit_broadcast(true);
-
-  // Build a binary computation with scalar broadcast.
-  //
-  //  %a = Constant({123, 42})
-  //  %b = Constant(1)
-  //  %add = Add(%a, %b)
-  ComputationHandle handle;
-  handle.set_handle(123);
-  UserComputation computation("TheComputation", handle);
-
-  ConstantRequest a_request;
-  *a_request.mutable_literal() =
-      Literal::CreateR1<float>({123.0f, 42.0f})->ToProto();
-  TF_ASSERT_OK_AND_ASSIGN(ComputationDataHandle a_handle,
-                          computation.AddConstantInstruction(a_request));
-
-  ConstantRequest b_request;
-  *b_request.mutable_literal() = Literal::CreateR0<float>(1.0f)->ToProto();
-  TF_ASSERT_OK_AND_ASSIGN(ComputationDataHandle b_handle,
-                          computation.AddConstantInstruction(b_request));
-
-  BinaryOpRequest add;
-  add.set_binop(BINOP_ADD);
-  *add.mutable_lhs() = a_handle;
-  *add.mutable_rhs() = b_handle;
-  TF_ASSERT_OK(computation.AddBinaryInstruction(add).status());
-
-  auto hlo_resolver = [](const VersionedComputationHandle& handle) {
-    return nullptr;
-  };
-  VersionedComputationHandle latest_version = computation.GetVersionedHandle();
-
-  // Build the HLO computation.
-  TF_ASSERT_OK_AND_ASSIGN(
-      std::unique_ptr<HloComputation> hlo_computation,
-      computation.BuildHloComputation(latest_version.version, hlo_resolver,
-                                      debug_options));
-  // The binary operation has implicit scalar broadcast, should be converted
-  // to an explicit broadcast intruction and a binary instruction.
-  EXPECT_EQ(4, hlo_computation->instruction_count());
-  EXPECT_THAT(hlo_computation->root_instruction(), op::Add());
-  LOG(INFO) << hlo_computation->root_instruction()->ToString();
-  const auto& operands = hlo_computation->root_instruction()->operands();
-  ASSERT_EQ(2, operands.size());
-  EXPECT_TRUE(operands[0]->opcode() == HloOpcode::kBroadcast ||
-              operands[1]->opcode() == HloOpcode::kBroadcast);
-}
-
-TEST_F(UserComputationTest, CheckImplicitBroadcastToExplicitBroadcast) {
-  auto debug_options = DebugOptions();
-  debug_options.set_xla_eliminate_hlo_implicit_broadcast(true);
-
-  // Build a binary computation with degenerate broadcast.
-  //
-  //  %a = Param({1, 2, 3});
-  //  %b = Param({1, 2, 1});
-  //  %add = Add(%a, %b, {});
-  ComputationHandle handle;
-  handle.set_handle(123);
-  UserComputation computation("TheComputation", handle);
-
-  ParameterRequest a_request;
-  *a_request.mutable_shape() = ShapeUtil::MakeShape(F32, {1, 2, 3});
-  a_request.set_name("a");
-  a_request.set_parameter(0);
-  TF_ASSERT_OK_AND_ASSIGN(ComputationDataHandle a_handle,
-                          computation.AddParameterInstruction(a_request));
-
-  ParameterRequest b_request;
-  *b_request.mutable_shape() = ShapeUtil::MakeShape(F32, {1, 2, 1});
-  b_request.set_name("b");
-  b_request.set_parameter(1);
-  TF_ASSERT_OK_AND_ASSIGN(ComputationDataHandle b_handle,
-                          computation.AddParameterInstruction(b_request));
-
-  const int64 kDevice = 7;
-  OpSharding sharding;
-  sharding.set_type(OpSharding::Type::OpSharding_Type_MAXIMAL);
-  sharding.add_tile_assignment_dimensions(1);
-  sharding.add_tile_assignment_devices(kDevice);
-
-  TF_EXPECT_OK(computation.SetOpSharding(b_handle, sharding));
-
-  BinaryOpRequest add;
-  add.set_binop(BINOP_ADD);
-  *add.mutable_lhs() = a_handle;
-  *add.mutable_rhs() = b_handle;
-  TF_ASSERT_OK(computation.AddBinaryInstruction(add).status());
-
-  auto hlo_resolver = [](const VersionedComputationHandle& handle) {
-    return nullptr;
-  };
-  VersionedComputationHandle latest_version = computation.GetVersionedHandle();
-
-  // Build the HLO computation.
-  TF_ASSERT_OK_AND_ASSIGN(
-      std::unique_ptr<HloComputation> hlo_computation,
-      computation.BuildHloComputation(latest_version.version, hlo_resolver,
-                                      debug_options));
-
-  //    b         a
-  //    |         |
-  // reshape      |
-  //    |         |
-  // broadcast    |
-  //     \       /
-  //        add
-  EXPECT_EQ(5, hlo_computation->instruction_count());
-  ASSERT_THAT(
-      hlo_computation->root_instruction(),
-      op::Add(op::Parameter(), op::Broadcast(op::Reshape(op::Parameter()))));
-
-  const HloInstruction* broadcast =
-      hlo_computation->root_instruction()->operand(1);
-  EXPECT_TRUE(broadcast->has_sharding());
-
-  const HloInstruction* reshape = broadcast->operand(0);
-  EXPECT_TRUE(reshape->has_sharding());
-}
-
-TEST_F(UserComputationTest, EliminateDegenerateBroadcastAfterIndimBroadcast) {
-  auto debug_options = DebugOptions();
-  debug_options.set_xla_eliminate_hlo_implicit_broadcast(true);
-
-  // Build a binary computation with in-dim broadcast and degenerate broadcast.
-  //
-  //  %a = Param({2, 3});
-  //  %b = Param({2, 1, 4});
-  //  %add = Add(%a, %b, {0, 1});
-  ComputationHandle handle;
-  handle.set_handle(123);
-  UserComputation computation("TheComputation", handle);
-
-  ParameterRequest a_request;
-  *a_request.mutable_shape() = ShapeUtil::MakeShape(F32, {2, 3});
-  a_request.set_name("a");
-  a_request.set_parameter(0);
-  TF_ASSERT_OK_AND_ASSIGN(ComputationDataHandle a_handle,
-                          computation.AddParameterInstruction(a_request));
-
-  ParameterRequest b_request;
-  *b_request.mutable_shape() = ShapeUtil::MakeShape(F32, {2, 1, 4});
-  b_request.set_name("b");
-  b_request.set_parameter(1);
-  TF_ASSERT_OK_AND_ASSIGN(ComputationDataHandle b_handle,
-                          computation.AddParameterInstruction(b_request));
-
-  BinaryOpRequest add;
-  add.set_binop(BINOP_ADD);
-  *add.mutable_lhs() = a_handle;
-  *add.mutable_rhs() = b_handle;
-  add.add_broadcast_dimensions(0);
-  add.add_broadcast_dimensions(1);
-  TF_ASSERT_OK(computation.AddBinaryInstruction(add).status());
-
-  auto hlo_resolver = [](const VersionedComputationHandle& handle) {
-    return nullptr;
-  };
-  VersionedComputationHandle latest_version = computation.GetVersionedHandle();
-
-  // Build the HLO computation.
-  TF_ASSERT_OK_AND_ASSIGN(
-      std::unique_ptr<HloComputation> hlo_computation,
-      computation.BuildHloComputation(latest_version.version, hlo_resolver,
-                                      debug_options));
-
-  // The binary operation has in-dim broadcast and degenerate broadcast, should
-  // first do the in-dim broadcast then convert the degnerate broadcast into a
-  // reshape and a broadcast.
-  //
-  //    b         a
-  //    |         |
-  // broadcast reshape
-  //    |         |
-  //    |     broadcast
-  //     \        /
-  //        add
-  EXPECT_EQ(6, hlo_computation->instruction_count());
-  EXPECT_THAT(hlo_computation->root_instruction(), op::Add());
-  const auto& operands = hlo_computation->root_instruction()->operands();
-  ASSERT_EQ(2, operands.size());
-  EXPECT_TRUE(operands[0]->opcode() == HloOpcode::kBroadcast &&
-              operands[1]->opcode() == HloOpcode::kBroadcast);
-}
-
-}  // namespace
-}  // namespace xla
diff --git a/tensorflow/compiler/xla/tools/BUILD b/tensorflow/compiler/xla/tools/BUILD
index 15b9cd4265..d73bcdaf82 100644
--- a/tensorflow/compiler/xla/tools/BUILD
+++ b/tensorflow/compiler/xla/tools/BUILD
@@ -164,7 +164,6 @@ tf_cc_binary(
         "//tensorflow/compiler/xla/client:client_library",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/service",
-        "//tensorflow/compiler/xla/service:computation_tracker",
         "//tensorflow/compiler/xla/service:hlo_proto",
         "//tensorflow/compiler/xla/service:interpreter_plugin",
         "//tensorflow/core:lib",
diff --git a/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc b/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc
index b815bbf854..5dd5150be3 100644
--- a/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc
+++ b/tensorflow/compiler/xla/tools/dumped_computation_to_text.cc
@@ -20,7 +20,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/client.h"
 #include "tensorflow/compiler/xla/client/client_library.h"
 #include "tensorflow/compiler/xla/client/local_client.h"
-#include "tensorflow/compiler/xla/service/computation_tracker.h"
 #include "tensorflow/compiler/xla/service/hlo.pb.h"
 #include "tensorflow/compiler/xla/service/service.h"
 #include "tensorflow/compiler/xla/statusor.h"
-- 
GitLab


From 10fa513e15691681903a472d251fa8eadca1f239 Mon Sep 17 00:00:00 2001
From: Justin Lebar <jlebar@google.com>
Date: Thu, 31 May 2018 11:43:37 -0700
Subject: [PATCH 106/610] [XLA] Make HloInstruction::backend_config() a
 JSON-encoded protobuf.

PiperOrigin-RevId: 198754463
---
 tensorflow/compiler/xla/BUILD                 |  31 ---
 tensorflow/compiler/xla/scanner.cc            | 197 ------------------
 tensorflow/compiler/xla/scanner.h             | 102 ---------
 tensorflow/compiler/xla/scanner_test.cc       | 124 -----------
 tensorflow/compiler/xla/service/BUILD         |   1 +
 tensorflow/compiler/xla/service/compiler.cc   |   5 +-
 tensorflow/compiler/xla/service/compiler.h    |   6 +-
 .../compiler/xla/service/hlo_graph_dumper.cc  |   4 +-
 .../compiler/xla/service/hlo_instruction.cc   |  36 +++-
 .../compiler/xla/service/hlo_instruction.h    |  36 +++-
 .../compiler/xla/tools/parser/hlo_parser.cc   |   2 +-
 .../xla/tools/parser/hlo_parser_test.cc       |   2 +-
 tensorflow/core/BUILD                         |  52 +++--
 .../core/platform/default/build_config.bzl    |   3 +
 .../platform/default/human_readable_json.cc   |  54 +++++
 .../core/platform/human_readable_json.h       |  37 ++++
 16 files changed, 202 insertions(+), 490 deletions(-)
 delete mode 100644 tensorflow/compiler/xla/scanner.cc
 delete mode 100644 tensorflow/compiler/xla/scanner.h
 delete mode 100644 tensorflow/compiler/xla/scanner_test.cc
 create mode 100644 tensorflow/core/platform/default/human_readable_json.cc
 create mode 100644 tensorflow/core/platform/human_readable_json.h

diff --git a/tensorflow/compiler/xla/BUILD b/tensorflow/compiler/xla/BUILD
index c08db7e3fb..c6deb959a5 100644
--- a/tensorflow/compiler/xla/BUILD
+++ b/tensorflow/compiler/xla/BUILD
@@ -499,37 +499,6 @@ cc_library(
     ],
 )
 
-cc_library(
-    name = "scanner",
-    srcs = ["scanner.cc"],
-    hdrs = ["scanner.h"],
-    visibility = [":internal"],
-    deps = [
-        ":status",
-        ":status_macros",
-        ":types",
-        ":util",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
-tf_cc_test(
-    name = "scanner_test",
-    srcs = ["scanner_test.cc"],
-    deps = [
-        ":scanner",
-        ":status",
-        ":status_macros",
-        ":test",
-        ":types",
-        ":util",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-        "//tensorflow/core:test_main",
-    ],
-)
-
 cc_library(
     name = "text_literal_reader",
     srcs = ["text_literal_reader.cc"],
diff --git a/tensorflow/compiler/xla/scanner.cc b/tensorflow/compiler/xla/scanner.cc
deleted file mode 100644
index f23a1417fc..0000000000
--- a/tensorflow/compiler/xla/scanner.cc
+++ /dev/null
@@ -1,197 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/compiler/xla/scanner.h"
-
-#include "tensorflow/compiler/xla/util.h"
-#include "tensorflow/core/lib/strings/strcat.h"
-
-namespace xla {
-namespace {
-
-// Returns true if c can be the first character in an identifier.
-bool IsIdentifierFirst(int c) { return std::isalpha(c) || c == '_'; }
-
-// Returns true if c can be the non-first character in an identifier.
-bool IsIdentifierLater(int c) { return std::isalnum(c) || c == '_'; }
-
-// Returns true if str is an identifier.
-bool IsIdentifier(tensorflow::StringPiece str) {
-  if (str.empty() || !IsIdentifierFirst(str[0])) {
-    return false;
-  }
-  for (int64 i = 1; i < str.size(); ++i) {
-    if (!IsIdentifierLater(str[i])) {
-      return false;
-    }
-  }
-  return true;
-}
-
-}  // namespace
-
-Scanner::Scanner(tensorflow::StringPiece input) : input_(input), position_(0) {}
-
-bool Scanner::ok() const { return status().ok(); }
-
-const Status& Scanner::status() const { return status_; }
-
-bool Scanner::Match(tensorflow::StringPiece match) {
-  SkipWhitespace();
-  if (ok() && position_ + match.size() <= input_.size() &&
-      std::equal(match.begin(), match.end(), input_.begin() + position_)) {
-    SkipChars(match.size());
-
-    VLOG(10) << "Matched \"" << match << "\"";
-    return true;
-  } else {
-    return false;
-  }
-}
-
-void Scanner::Expect(tensorflow::StringPiece expect) {
-  if (!Match(expect)) {
-    SetError(tensorflow::strings::StrCat("Expected \"", expect, "\"."));
-  }
-}
-
-bool Scanner::MatchReadIdentifier(string* identifier) {
-  SkipWhitespace();
-  if (!IsIdentifierFirst(PeekChar())) {
-    return false;
-  }
-  identifier->clear();
-  do {
-    *identifier += ReadChar();
-  } while (IsIdentifierLater(PeekChar()));
-
-  VLOG(10) << "Read identifier " << identifier;
-  CHECK(IsIdentifier(*identifier));
-  return true;
-}
-
-string Scanner::ReadIdentifier() {
-  string identifier;
-  if (!MatchReadIdentifier(&identifier)) {
-    SetError("Expected identifier.");
-  }
-  return identifier;
-}
-
-void Scanner::ExpectIdentifier(tensorflow::StringPiece expect) {
-  CHECK(IsIdentifier(expect));
-
-  string identifier;
-  if (!MatchReadIdentifier(&identifier)) {
-    SetError(tensorflow::strings::StrCat("Expected identifier ", expect, "."));
-  }
-  if (identifier != expect) {
-    SetError(tensorflow::strings::StrCat("Expected identifier ", expect,
-                                         ", but got ", identifier, "."));
-  }
-}
-
-// Matches the end of the input, also known as End Of File (EOF).
-bool Scanner::MatchEof() {
-  SkipWhitespace();
-  return PeekChar() == EOF;
-}
-
-void Scanner::ExpectEof() {
-  if (!MatchEof()) {
-    SetError("Expected end of input.");
-  }
-}
-
-// Reads a vector of the format "(1, 2, 3)".
-std::vector<int64> Scanner::ReadIntVector() {
-  std::vector<int64> ints;
-  Expect("(");
-  if (!Match(")") && ok()) {
-    ints.push_back(ReadInt());
-    while (Match(",")) {
-      ints.push_back(ReadInt());
-    }
-    Expect(")");
-  }
-
-  VLOG(10) << "Read int vector with " << ints.size() << " elements.";
-  return ints;
-}
-
-int64 Scanner::ReadInt() {
-  bool negative = Match("-");
-  if (!PeekDigit()) {
-    SetError("Expected integer.");
-    return 0;
-  }
-
-  int64 integer = 0;
-  do {
-    integer = (ReadChar() - '0') + integer * 10;
-  } while (PeekDigit());
-  integer = negative ? -integer : integer;
-
-  VLOG(10) << "Read integer " << integer;
-  return integer;
-}
-
-void Scanner::SkipWhitespace() {
-  while (PeekWhitespace()) {
-    SkipChars(1);
-  }
-}
-
-int Scanner::ReadChar() {
-  int c = PeekChar();
-  SkipChars(1);
-
-  VLOG(20) << "Read char " << c;
-  return c;
-}
-
-int Scanner::PeekChar() const {
-  return ok() && position_ < input_.size() ? input_[position_] : EOF;
-}
-
-bool Scanner::PeekDigit() const {
-  // Do not use std::isdigit since it depends on the locale and we do not
-  // handle any digits beyond 0-9.
-  const char c = PeekChar();
-  return '0' <= c && c <= '9';
-}
-
-bool Scanner::PeekAlnum() const { return std::isalnum(PeekChar()); }
-
-bool Scanner::PeekWhitespace() const { return std::isspace(PeekChar()); }
-
-void Scanner::SkipChars(int64 count) {
-  CHECK_GE(count, 0);
-  position_ += count;
-}
-
-void Scanner::SetError(string error_message) {
-  // Only the first error is recorded since any later errors will likely be a
-  // consequence of the first error.
-  if (ok()) {
-    status_ = InvalidArgumentStrCat(std::move(error_message));
-    position_ = input_.size();
-    VLOG(10) << "Failed scanner with error " << status_.ToString();
-  } else {
-    VLOG(10) << "Error on already failed scanner is " << error_message;
-  }
-}
-
-}  // namespace xla
diff --git a/tensorflow/compiler/xla/scanner.h b/tensorflow/compiler/xla/scanner.h
deleted file mode 100644
index 86b04ae7f9..0000000000
--- a/tensorflow/compiler/xla/scanner.h
+++ /dev/null
@@ -1,102 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_COMPILER_XLA_SCANNER_H_
-#define TENSORFLOW_COMPILER_XLA_SCANNER_H_
-
-#include "tensorflow/compiler/xla/status.h"
-#include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/core/lib/core/stringpiece.h"
-
-namespace xla {
-
-// Simple class for parsing data. The concepts for the interface are:
-//
-//   Match(x): Returns true if x is next in the input and in that case skips
-//     past it. Otherwise returns false.
-//
-//   Expect(x): As Match(x), but requires x to be next in the input.
-//
-//   MatchReadX(x): Returns true if an X is next in the input and in that case
-//     skips past it and assigns it to x. Otherwise returns false.
-//
-//   ReadX(): As ReadMatchX(), but requires an X to be next in the input and
-//     returns it.
-//
-//   PeekX(): Returns true if an X is next in the input and does not skip
-//     past it either way.
-//
-// All of these, except those that work on individual characters, skip
-// whitespace.
-//
-// If a requirement is not met, the error is available in status(). A Scanner
-// with a failed status() will behave as though the rest of the input is EOF and
-// will not record further errors after that point.
-class Scanner {
- public:
-  Scanner(tensorflow::StringPiece input);
-
-  bool ok() const;
-  const Status& status() const;
-
-  bool Match(tensorflow::StringPiece match);
-  void Expect(tensorflow::StringPiece expect);
-
-  // Match-reads an identifier. An identifier starts with an alphabetic
-  // character or an underscore followed by any number of characters that are
-  // each alphanumeric or underscore.
-  bool MatchReadIdentifier(string* identifier);
-
-  string ReadIdentifier();
-
-  void ExpectIdentifier(tensorflow::StringPiece expect);
-
-  // Matches the end of the input, also known as End Of File (EOF).
-  bool MatchEof();
-  void ExpectEof();
-
-  // Reads a vector of the format "(1, 4, 5)".
-  std::vector<int64> ReadIntVector();
-
-  // Reads an integer. Can start with a minus but not a plus.
-  int64 ReadInt();
-
-  // Keeps skipping until encountering a non-whitespace character.
-  void SkipWhitespace();
-
-  // *** Below here are character-level methods that do not skip whitespace.
-
-  int ReadChar();
-  int PeekChar() const;
-  bool PeekDigit() const;
-  bool PeekAlnum() const;
-  bool PeekWhitespace() const;
-
-  // Skip past the next count characters.
-  void SkipChars(int64 count);
-
- private:
-  // Sets a failed status. The input is in effect replaced with EOF after
-  // this. Only the first error is recorded.
-  void SetError(string error_message);
-
-  const tensorflow::StringPiece input_;
-  int64 position_;
-  Status status_;
-};
-
-}  // namespace xla
-
-#endif  // TENSORFLOW_COMPILER_XLA_SCANNER_H_
diff --git a/tensorflow/compiler/xla/scanner_test.cc b/tensorflow/compiler/xla/scanner_test.cc
deleted file mode 100644
index 10cd0c6a04..0000000000
--- a/tensorflow/compiler/xla/scanner_test.cc
+++ /dev/null
@@ -1,124 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// TODO(b/80179519): Fix open source build for real.
-#if 0
-#include "tensorflow/compiler/xla/scanner.h"
-
-#include <string>
-
-#include "tensorflow/compiler/xla/test.h"
-#include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/core/platform/env.h"
-
-namespace xla {
-namespace {
-
-TEST(Scanner, Empty) {
-  Scanner scanner("");
-
-  EXPECT_EQ(scanner.PeekChar(), EOF);
-  EXPECT_TRUE(scanner.MatchEof());
-  EXPECT_TRUE(scanner.Match(""));
-  EXPECT_FALSE(scanner.Match("1"));
-  EXPECT_TRUE(scanner.ok());
-}
-
-TEST(Scanner, Prefix) {
-  Scanner scanner("1234 5");
-  EXPECT_FALSE(scanner.MatchEof());
-  EXPECT_TRUE(scanner.Match("12"));
-  EXPECT_TRUE(scanner.Match("34 "));
-  EXPECT_FALSE(scanner.MatchEof());
-  EXPECT_FALSE(scanner.Match("5 "));
-  EXPECT_TRUE(scanner.Match("5"));
-  EXPECT_TRUE(scanner.MatchEof());
-}
-
-TEST(Scanner, Whitespace) {
-  Scanner scanner(" \t\n\r 1\t2\n\n");
-
-  EXPECT_FALSE(scanner.Match(" "));
-  EXPECT_TRUE(scanner.Match("1"));
-  EXPECT_TRUE(scanner.Match("2"));
-  EXPECT_TRUE(scanner.MatchEof());
-  EXPECT_TRUE(scanner.ok());
-}
-
-TEST(Scanner, Fail) {
-  Scanner scanner("153 4q");
-
-  scanner.Expect("5");
-  EXPECT_FALSE(scanner.ok());
-  EXPECT_FALSE(scanner.status().ok());
-
-  EXPECT_TRUE(scanner.MatchEof());
-}
-
-TEST(Scanner, Identifier) {
-  Scanner scanner("1 q1  _1_ _1a= qqb");
-
-  string identifier = "foo";
-  EXPECT_FALSE(scanner.MatchReadIdentifier(&identifier));
-  EXPECT_EQ(identifier, "foo");
-  scanner.Match("1");
-
-  EXPECT_TRUE(scanner.MatchReadIdentifier(&identifier));
-  EXPECT_EQ(identifier, "q1");
-
-  scanner.ExpectIdentifier("_1_");
-  EXPECT_TRUE(scanner.ok());
-
-  scanner.ExpectIdentifier("_1a");
-  EXPECT_TRUE(scanner.ok());
-
-  // The = after _1a is not included in the identifier.
-  scanner.Expect("=");
-
-  // The expected identifier matches a prefix but is not the full identifier in
-  // the input.
-  EXPECT_TRUE(scanner.ok());
-  scanner.ExpectIdentifier("qq");
-  EXPECT_FALSE(scanner.ok());
-}
-
-TEST(Scanner, Int) {
-  Scanner scanner("1_2 3% -1 124345 -363 0 -0");
-  EXPECT_EQ(1, scanner.ReadInt());
-  EXPECT_TRUE(scanner.Match("_"));
-  EXPECT_EQ(2, scanner.ReadInt());
-  EXPECT_EQ(3, scanner.ReadInt());
-  EXPECT_TRUE(scanner.Match("%"));
-  EXPECT_EQ(-1, scanner.ReadInt());
-  EXPECT_EQ(124345, scanner.ReadInt());
-  EXPECT_EQ(-363, scanner.ReadInt());
-  EXPECT_EQ(0, scanner.ReadInt());
-  EXPECT_EQ(0, scanner.ReadInt());
-  EXPECT_TRUE(scanner.MatchEof());
-}
-
-TEST(Scanner, IntVector) {
-  Scanner scanner("()(0) (-1,2) ( 3 , 4 )");
-  EXPECT_THAT(scanner.ReadIntVector(), testing::IsEmpty());
-  EXPECT_THAT(scanner.ReadIntVector(), testing::ElementsAre(0));
-  EXPECT_THAT(scanner.ReadIntVector(), testing::ElementsAre(-1, 2));
-  EXPECT_THAT(scanner.ReadIntVector(), testing::ElementsAre(3, 4));
-  EXPECT_TRUE(scanner.MatchEof());
-  EXPECT_TRUE(scanner.ok());
-}
-
-}  // namespace
-}  // namespace xla
-#endif
diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index b954bbd20a..aa416312ad 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -309,6 +309,7 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:window_util",
         "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/core:human_readable_json",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
     ],
diff --git a/tensorflow/compiler/xla/service/compiler.cc b/tensorflow/compiler/xla/service/compiler.cc
index 31f84e88f8..6f06bba679 100644
--- a/tensorflow/compiler/xla/service/compiler.cc
+++ b/tensorflow/compiler/xla/service/compiler.cc
@@ -28,8 +28,9 @@ namespace xla {
 /* static */ tensorflow::mutex Compiler::platform_compiler_mutex_(
     tensorflow::LINKER_INITIALIZED);
 
-std::vector<string> Compiler::ComputeBackendConfigs(
-    const HloInstruction& hlo, se::StreamExecutor* executor) const {
+std::vector<std::unique_ptr<tensorflow::protobuf::Message>>
+Compiler::ComputeBackendConfigs(const HloInstruction& hlo,
+                                se::StreamExecutor* executor) const {
   CHECK(executor != nullptr);
   return {};
 }
diff --git a/tensorflow/compiler/xla/service/compiler.h b/tensorflow/compiler/xla/service/compiler.h
index c39db58b78..6c52ffd800 100644
--- a/tensorflow/compiler/xla/service/compiler.h
+++ b/tensorflow/compiler/xla/service/compiler.h
@@ -36,6 +36,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 #include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/stream_executor_no_cuda.h"
 #include "tensorflow/core/platform/thread_annotations.h"
 
@@ -161,8 +162,9 @@ class Compiler {
   //
   // The stream executor is passed in to provide information about the hardware
   // that the backend configurations would be targeting.
-  virtual std::vector<string> ComputeBackendConfigs(
-      const HloInstruction& hlo, se::StreamExecutor* executor) const;
+  virtual std::vector<std::unique_ptr<tensorflow::protobuf::Message>>
+  ComputeBackendConfigs(const HloInstruction& hlo,
+                        se::StreamExecutor* executor) const;
 
   // Compiles the HLO module for ahead-of-time execution.  This is intended for
   // use in static compilation.
diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
index 672b1c017a..05adb45713 100644
--- a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
+++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
@@ -1085,11 +1085,11 @@ string HloDotDumper::GetInstructionNodeMetadata(const HloInstruction* instr) {
 
 string HloDotDumper::GetInstructionNodeBackendConfig(
     const HloInstruction* instr) {
-  if (!show_backend_config_ || instr->backend_config().empty()) {
+  if (!show_backend_config_ || instr->raw_backend_config_string().empty()) {
     return "";
   }
 
-  return StrCat("backend_config=\"", instr->backend_config(), "\"");
+  return StrCat("backend_config=\"", instr->raw_backend_config_string(), "\"");
 }
 
 string HloDotDumper::GetInstructionNodeExtraInfo(const HloInstruction* instr) {
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc
index c55e5cf793..a68075ef20 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction.cc
@@ -41,6 +41,7 @@ limitations under the License.
 #include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/human_readable_json.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace xla {
@@ -110,7 +111,7 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
   instruction->name_ = proto.name();
 
   instruction->metadata_ = proto.metadata();
-  instruction->set_backend_config(proto.backend_config());
+  instruction->backend_config_ = proto.backend_config();
   if (proto.has_literal()) {
     TF_ASSIGN_OR_RETURN(instruction->literal_,
                         Literal::CreateFromProto(proto.literal()));
@@ -1521,7 +1522,7 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
   }
   SetupDerivedInstruction(clone.get());
   clone->set_parent(parent_);
-  clone->set_backend_config(backend_config());
+  clone->set_raw_backend_config_string(backend_config_);
   if (context != nullptr) {
     context->MapInstruction(this, clone.get());
     clone->ReplaceCalledComputations([&](HloComputation* callee) {
@@ -2182,8 +2183,8 @@ string HloInstruction::ToStringWithCanonicalNameMap(
        !metadata_.source_file().empty())) {
     StrAppend(&result, ", metadata={", xla::OpMetadataToString(metadata_), "}");
   }
-  if (options.print_backend_config() && !backend_config().empty()) {
-    StrAppend(&result, ", backend_config=\"", CEscape(backend_config()), "\"");
+  if (options.print_backend_config() && !backend_config_.empty()) {
+    StrAppend(&result, ", backend_config=\"", CEscape(backend_config_), "\"");
   }
   return result;
 }
@@ -2463,7 +2464,7 @@ HloInstructionProto HloInstruction::ToProto() const {
   }
 
   *proto.mutable_metadata() = metadata_;
-  proto.set_backend_config(backend_config());
+  proto.set_backend_config(backend_config_);
   if (literal_ != nullptr) {
     *proto.mutable_literal() = literal_->ToProto();
   }
@@ -3526,6 +3527,31 @@ bool HloInstruction::CouldBeBitcast() const {
   }
 }
 
+Status HloInstruction::GetBackendConfigInternal(
+    tensorflow::protobuf::Message* proto) const {
+  proto->Clear();
+
+  // Empty string does not parse as valid JSON, but it's a valid backend config,
+  // corresponding to the empty proto.
+  if (backend_config_.empty()) {
+    return Status::OK();
+  }
+  return tensorflow::HumanReadableJsonToProto(backend_config_, proto);
+}
+
+Status HloInstruction::set_backend_config(
+    const tensorflow::protobuf::Message& proto) {
+  TF_ASSIGN_OR_RETURN(backend_config_, BackendConfigToRawString(proto));
+  return Status::OK();
+}
+
+/* static */ StatusOr<string> HloInstruction::BackendConfigToRawString(
+    const tensorflow::protobuf::Message& proto) {
+  string ret;
+  TF_RETURN_IF_ERROR(tensorflow::ProtoToHumanReadableJson(proto, &ret));
+  return ret;
+}
+
 HloModule* HloInstruction::GetModule() const {
   if (parent_) {
     return parent_->parent();
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h
index 8119c35066..72b9d545ae 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.h
+++ b/tensorflow/compiler/xla/service/hlo_instruction.h
@@ -52,6 +52,7 @@ limitations under the License.
 #include "tensorflow/core/lib/gtl/iterator_range.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace xla {
@@ -1446,12 +1447,33 @@ class HloInstruction {
   // this field and they cannot interpret it due to its meaning being backend
   // specific.
   //
-  // TODO(b/78194644): Introduce structured configuration format as per
-  // go/xla-heuristics.
-  const string& backend_config() const { return backend_config_; }
-  void set_backend_config(string backend_config) {
-    backend_config_ = std::move(backend_config);
+  // ConfigProto should be a protobuf Message type.
+  template <typename ConfigProto>
+  StatusOr<ConfigProto> backend_config() const {
+    ConfigProto proto;
+    TF_RETURN_IF_ERROR(GetBackendConfigInternal(&proto));
+    return std::move(proto);
   }
+  Status set_backend_config(const tensorflow::protobuf::Message& proto);
+
+  // Getter/setter for raw JSON-encoded backend config.  Prefer the
+  // functions above that deal in proto Messages where possible.
+  const string& raw_backend_config_string() const { return backend_config_; }
+  void set_raw_backend_config_string(string config_str) {
+    backend_config_ = std::move(config_str);
+  }
+
+  // Returns a string representation of a proto in the format used by
+  // raw_backend_config_string.
+  //
+  // This is morally equivalent to:
+  //
+  //   HloInstruction instr;
+  //   TF_RETURN_IF_ERROR(instr.set_backend_config(proto));
+  //   return instr.raw_backend_config_string();
+  //
+  static StatusOr<string> BackendConfigToRawString(
+      const tensorflow::protobuf::Message& proto);
 
   // Sets the debug metadata for this instruction.
   void set_metadata(const OpMetadata& metadata) { metadata_ = metadata; }
@@ -1573,6 +1595,10 @@ class HloInstruction {
   // Returns how this instruction uses elements of its `i`th operand.
   UseKind OperandElementUse(int64 i) const;
 
+  // Helper for implementing backend_config().  Parses backend_config_ into the
+  // given proto.
+  Status GetBackendConfigInternal(tensorflow::protobuf::Message* proto) const;
+
   int unique_id_;  // Unique to this HloInstruction within a HloModule
 
   // Opcode for this instruction.
diff --git a/tensorflow/compiler/xla/tools/parser/hlo_parser.cc b/tensorflow/compiler/xla/tools/parser/hlo_parser.cc
index 3c1d63ab86..ef10ca4bff 100644
--- a/tensorflow/compiler/xla/tools/parser/hlo_parser.cc
+++ b/tensorflow/compiler/xla/tools/parser/hlo_parser.cc
@@ -1127,7 +1127,7 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
     instruction->set_metadata(*metadata);
   }
   if (backend_config) {
-    instruction->set_backend_config(std::move(*backend_config));
+    instruction->set_raw_backend_config_string(std::move(*backend_config));
   }
   return AddInstruction(name, instruction, name_loc);
 }  // NOLINT(readability/fn_size)
diff --git a/tensorflow/compiler/xla/tools/parser/hlo_parser_test.cc b/tensorflow/compiler/xla/tools/parser/hlo_parser_test.cc
index f7a27cf9cc..3c5957b96a 100644
--- a/tensorflow/compiler/xla/tools/parser/hlo_parser_test.cc
+++ b/tensorflow/compiler/xla/tools/parser/hlo_parser_test.cc
@@ -1025,7 +1025,7 @@ ENTRY %configuration_test() -> s32[] {
   EXPECT_EQ("foo bar", result.ValueOrDie()
                            ->entry_computation()
                            ->root_instruction()
-                           ->backend_config());
+                           ->raw_backend_config_string());
 }
 
 TEST_F(HloParserTest, LiteralDimensionsMismatch_1) {
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 3286f856db..74f74afa45 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -101,42 +101,43 @@ load("//tensorflow:tensorflow.bzl", "tf_cuda_only_cc_test")
 # For platform specific build config
 load(
     "//tensorflow/core:platform/default/build_config.bzl",
-    "tf_platform_hdrs",
-    "tf_platform_srcs",
-    "tf_proto_library",
-    "tf_proto_library_cc",
     "tf_additional_all_protos",
+    "tf_additional_cloud_kernel_deps",
+    "tf_additional_cloud_op_deps",
     "tf_additional_core_deps",
+    "tf_additional_cupti_wrapper_deps",
+    "tf_additional_device_tracer_cuda_deps",
+    "tf_additional_device_tracer_deps",
+    "tf_additional_device_tracer_srcs",
+    "tf_additional_gdr_lib_defines",
+    "tf_additional_human_readable_json_deps",
     "tf_additional_lib_defines",
     "tf_additional_lib_deps",
+    "tf_additional_libdevice_data",
+    "tf_additional_libdevice_deps",
+    "tf_additional_libdevice_srcs",
     "tf_additional_lib_hdrs",
     "tf_additional_lib_srcs",
     "tf_additional_minimal_lib_srcs",
+    "tf_additional_mpi_lib_defines",
     "tf_additional_proto_hdrs",
     "tf_additional_proto_srcs",
-    "tf_additional_cupti_wrapper_deps",
-    "tf_additional_libdevice_data",
-    "tf_additional_libdevice_deps",
-    "tf_additional_libdevice_srcs",
     "tf_additional_test_deps",
     "tf_additional_test_srcs",
-    "tf_kernel_tests_linkstatic",
-    "tf_additional_cloud_op_deps",
-    "tf_additional_cloud_kernel_deps",
-    "tf_lib_proto_parsing_deps",
     "tf_additional_verbs_lib_defines",
-    "tf_additional_mpi_lib_defines",
-    "tf_additional_gdr_lib_defines",
-    "tf_additional_device_tracer_srcs",
-    "tf_additional_device_tracer_deps",
-    "tf_additional_device_tracer_cuda_deps",
-    "tf_pyclif_proto_library",
     "tf_jspb_proto_library",
+    "tf_kernel_tests_linkstatic",
+    "tf_lib_proto_parsing_deps",
     "tf_nano_proto_library",
+    "tf_platform_hdrs",
+    "tf_platform_srcs",
+    "tf_proto_library",
+    "tf_proto_library_cc",
     "tf_protos_all",
     "tf_protos_all_impl",
     "tf_protos_grappler",
     "tf_protos_grappler_impl",
+    "tf_pyclif_proto_library",
 )
 load(
     "//tensorflow/core:platform/default/build_config_root.bzl",
@@ -400,6 +401,7 @@ cc_library(
         "protobuf.cc",
     ]) + [
         "platform/protobuf_util.cc",
+        "lib/core/status.h",
     ],
     hdrs = [
         ":platform_protobuf_hdrs",
@@ -416,6 +418,18 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "human_readable_json",
+    srcs = tf_platform_srcs(["human_readable_json.cc"]),
+    hdrs = ["platform/human_readable_json.h"],
+    copts = tf_copts(),
+    visibility = ["//visibility:public"],
+    deps = [
+        ":lib",
+        ":lib_internal",
+    ] + tf_additional_human_readable_json_deps(),
+)
+
 filegroup(
     name = "platform_env_hdrs",
     srcs = [
@@ -2013,6 +2027,7 @@ cc_library(
             "platform/**/cuda_libdevice_path.cc",
             "platform/**/device_tracer.cc",
             "platform/**/logging.cc",
+            "platform/**/human_readable_json.cc",
             "platform/abi.cc",
         ],
     ) + tf_additional_lib_srcs(
@@ -2025,6 +2040,7 @@ cc_library(
             "platform/**/env_time.cc",
             "platform/**/device_tracer.cc",
             "platform/**/logging.cc",
+            "platform/**/human_readable_json.cc",
             "platform/abi.cc",
         ] +
         # Protobuf deps already included through the ":lib_proto_parsing"
diff --git a/tensorflow/core/platform/default/build_config.bzl b/tensorflow/core/platform/default/build_config.bzl
index 23c594d90d..43fe82cc13 100644
--- a/tensorflow/core/platform/default/build_config.bzl
+++ b/tensorflow/core/platform/default/build_config.bzl
@@ -515,6 +515,9 @@ def tf_additional_proto_srcs():
       "platform/default/protobuf.cc",
   ]
 
+def tf_additional_human_readable_json_deps():
+  return []
+
 def tf_additional_all_protos():
   return ["//tensorflow/core:protos_all"]
 
diff --git a/tensorflow/core/platform/default/human_readable_json.cc b/tensorflow/core/platform/default/human_readable_json.cc
new file mode 100644
index 0000000000..6bf2106f6e
--- /dev/null
+++ b/tensorflow/core/platform/default/human_readable_json.cc
@@ -0,0 +1,54 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/platform/human_readable_json.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+
+namespace tensorflow {
+
+Status ProtoToHumanReadableJson(const ::google::protobuf::Message& proto,
+                                string* result) {
+  result->clear();
+
+  auto status = google::protobuf::util::MessageToJsonString(proto, result);
+  if (!status.ok()) {
+    // Convert error_msg google::protobuf::StringPiece to
+    // tensorflow::StringPiece.
+    auto error_msg = status.error_message();
+    return errors::Internal(
+        strings::StrCat("Could not convert proto to JSON string: ",
+                        StringPiece(error_msg.data(), error_msg.length())));
+  }
+  return Status::OK();
+}
+
+Status HumanReadableJsonToProto(const string& str,
+                                ::google::protobuf::Message* proto) {
+  proto->Clear();
+  auto status = google::protobuf::util::JsonStringToMessage(str, proto);
+  if (!status.ok()) {
+    // Convert error_msg google::protobuf::StringPiece to
+    // tensorflow::StringPiece.
+    auto error_msg = status.error_message();
+    return errors::Internal(
+        strings::StrCat("Could not convert JSON string to proto: ",
+                        StringPiece(error_msg.data(), error_msg.length())));
+  }
+  return Status::OK();
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/platform/human_readable_json.h b/tensorflow/core/platform/human_readable_json.h
new file mode 100644
index 0000000000..c759e801e9
--- /dev/null
+++ b/tensorflow/core/platform/human_readable_json.h
@@ -0,0 +1,37 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_PLATFORM_HUMAN_READABLE_JSON_H_
+#define TENSORFLOW_CORE_PLATFORM_HUMAN_READABLE_JSON_H_
+
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/platform/protobuf.h"
+
+namespace tensorflow {
+
+// Converts a proto to a JSON-like string that's meant to be human-readable
+// but still machine-parseable.
+//
+// This string may not be strictly JSON-compliant, but it must be parseable by
+// HumanReadableJsonToProto.
+Status ProtoToHumanReadableJson(const protobuf::Message& proto, string* result);
+
+// Converts a string produced by ProtoToHumanReadableJson to a protobuf.  Not
+// guaranteed to work for general JSON.
+Status HumanReadableJsonToProto(const string& str, protobuf::Message* proto);
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_PLATFORM_HUMAN_READABLE_JSON_H_
-- 
GitLab


From fdf4d0813d4c0321be7b33698d00b165d90365b0 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 31 May 2018 11:52:43 -0700
Subject: [PATCH 107/610] RuntimeShapes class: minor tweak to fix builds.

PiperOrigin-RevId: 198755870
---
 tensorflow/contrib/lite/kernels/internal/types.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tensorflow/contrib/lite/kernels/internal/types.h b/tensorflow/contrib/lite/kernels/internal/types.h
index 98ca21d55a..fc8ed753c5 100644
--- a/tensorflow/contrib/lite/kernels/internal/types.h
+++ b/tensorflow/contrib/lite/kernels/internal/types.h
@@ -71,8 +71,8 @@ class RuntimeShape {
     }
   }
 
-  inline const int32 DimensionsCount() const { return size_; }
-  inline const int32 Dims(int i) const {
+  inline int32 DimensionsCount() const { return size_; }
+  inline int32 Dims(int i) const {
     TFLITE_DCHECK_GE(i, 0);
     TFLITE_DCHECK_LT(i, size_);
     return size_ > kMaxSmallSize ? dims_pointer_[i] : dims_[i];
@@ -123,7 +123,7 @@ class RuntimeShape {
 
   // Returns the total count of elements, that is the size when flattened into a
   // vector.
-  inline const int FlatSize() const {
+  inline int FlatSize() const {
     int buffer_size = 1;
     const int* dims_data = DimsData();
     for (int i = 0; i < size_; i++) {
-- 
GitLab


From 519189837b77181137505bf83054ddd962600f9b Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 31 May 2018 12:16:54 -0700
Subject: [PATCH 108/610] Making the tf.name_scope blocks related to the factor
 and weight vars configurable. By default they will not be scoped.

PiperOrigin-RevId: 198759754
---
 .../python/ops/factorization_ops.py           | 129 ++++++++++--------
 1 file changed, 74 insertions(+), 55 deletions(-)

diff --git a/tensorflow/contrib/factorization/python/ops/factorization_ops.py b/tensorflow/contrib/factorization/python/ops/factorization_ops.py
index 09745e2de5..8f73274c2a 100644
--- a/tensorflow/contrib/factorization/python/ops/factorization_ops.py
+++ b/tensorflow/contrib/factorization/python/ops/factorization_ops.py
@@ -197,7 +197,8 @@ class WALSModel(object):
                row_weights=1,
                col_weights=1,
                use_factors_weights_cache=True,
-               use_gramian_cache=True):
+               use_gramian_cache=True,
+               use_scoped_vars=False):
     """Creates model for WALS matrix factorization.
 
     Args:
@@ -239,6 +240,8 @@ class WALSModel(object):
         weights cache to take effect.
       use_gramian_cache: When True, the Gramians will be cached on the workers
         before the updates start. Defaults to True.
+      use_scoped_vars: When True, the factor and weight vars will also be nested
+        in a tf.name_scope.
     """
     self._input_rows = input_rows
     self._input_cols = input_cols
@@ -251,18 +254,36 @@ class WALSModel(object):
         regularization * linalg_ops.eye(self._n_components)
         if regularization is not None else None)
     assert (row_weights is None) == (col_weights is None)
-    self._row_weights = WALSModel._create_weights(
-        row_weights, self._input_rows, self._num_row_shards, "row_weights")
-    self._col_weights = WALSModel._create_weights(
-        col_weights, self._input_cols, self._num_col_shards, "col_weights")
     self._use_factors_weights_cache = use_factors_weights_cache
     self._use_gramian_cache = use_gramian_cache
-    self._row_factors = self._create_factors(
-        self._input_rows, self._n_components, self._num_row_shards, row_init,
-        "row_factors")
-    self._col_factors = self._create_factors(
-        self._input_cols, self._n_components, self._num_col_shards, col_init,
-        "col_factors")
+
+    if use_scoped_vars:
+      with ops.name_scope("row_weights"):
+        self._row_weights = WALSModel._create_weights(
+            row_weights, self._input_rows, self._num_row_shards, "row_weights")
+      with ops.name_scope("col_weights"):
+        self._col_weights = WALSModel._create_weights(
+            col_weights, self._input_cols, self._num_col_shards, "col_weights")
+      with ops.name_scope("row_factors"):
+        self._row_factors = self._create_factors(
+            self._input_rows, self._n_components, self._num_row_shards,
+            row_init, "row_factors")
+      with ops.name_scope("col_factors"):
+        self._col_factors = self._create_factors(
+            self._input_cols, self._n_components, self._num_col_shards,
+            col_init, "col_factors")
+    else:
+      self._row_weights = WALSModel._create_weights(
+          row_weights, self._input_rows, self._num_row_shards, "row_weights")
+      self._col_weights = WALSModel._create_weights(
+          col_weights, self._input_cols, self._num_col_shards, "col_weights")
+      self._row_factors = self._create_factors(
+          self._input_rows, self._n_components, self._num_row_shards, row_init,
+          "row_factors")
+      self._col_factors = self._create_factors(
+          self._input_cols, self._n_components, self._num_col_shards, col_init,
+          "col_factors")
+
     self._row_gramian = self._create_gramian(self._n_components, "row_gramian")
     self._col_gramian = self._create_gramian(self._n_components, "col_gramian")
     with ops.name_scope("row_prepare_gramian"):
@@ -313,37 +334,36 @@ class WALSModel(object):
   @classmethod
   def _create_factors(cls, rows, cols, num_shards, init, name):
     """Helper function to create row and column factors."""
-    with ops.name_scope(name):
-      if callable(init):
-        init = init()
-      if isinstance(init, list):
-        assert len(init) == num_shards
-      elif isinstance(init, str) and init == "random":
-        pass
-      elif num_shards == 1:
-        init = [init]
-      sharded_matrix = []
-      sizes = cls._shard_sizes(rows, num_shards)
-      assert len(sizes) == num_shards
-
-      def make_initializer(i, size):
-
-        def initializer():
-          if init == "random":
-            return random_ops.random_normal([size, cols])
-          else:
-            return init[i]
+    if callable(init):
+      init = init()
+    if isinstance(init, list):
+      assert len(init) == num_shards
+    elif isinstance(init, str) and init == "random":
+      pass
+    elif num_shards == 1:
+      init = [init]
+    sharded_matrix = []
+    sizes = cls._shard_sizes(rows, num_shards)
+    assert len(sizes) == num_shards
+
+    def make_initializer(i, size):
 
-        return initializer
+      def initializer():
+        if init == "random":
+          return random_ops.random_normal([size, cols])
+        else:
+          return init[i]
 
-      for i, size in enumerate(sizes):
-        var_name = "%s_shard_%d" % (name, i)
-        var_init = make_initializer(i, size)
-        sharded_matrix.append(
-            variable_scope.variable(
-                var_init, dtype=dtypes.float32, name=var_name))
+      return initializer
 
-      return sharded_matrix
+    for i, size in enumerate(sizes):
+      var_name = "%s_shard_%d" % (name, i)
+      var_init = make_initializer(i, size)
+      sharded_matrix.append(
+          variable_scope.variable(
+              var_init, dtype=dtypes.float32, name=var_name))
+
+    return sharded_matrix
 
   @classmethod
   def _create_weights(cls, wt_init, num_wts, num_shards, name):
@@ -384,26 +404,25 @@ class WALSModel(object):
     sizes = cls._shard_sizes(num_wts, num_shards)
     assert len(sizes) == num_shards
 
-    with ops.name_scope(name):
-      def make_wt_initializer(i, size):
+    def make_wt_initializer(i, size):
 
-        def initializer():
-          if init_mode == "scalar":
-            return wt_init * array_ops.ones([size])
-          else:
-            return wt_init[i]
+      def initializer():
+        if init_mode == "scalar":
+          return wt_init * array_ops.ones([size])
+        else:
+          return wt_init[i]
 
-        return initializer
+      return initializer
 
-      sharded_weight = []
-      for i, size in enumerate(sizes):
-        var_name = "%s_shard_%d" % (name, i)
-        var_init = make_wt_initializer(i, size)
-        sharded_weight.append(
-            variable_scope.variable(
-                var_init, dtype=dtypes.float32, name=var_name))
+    sharded_weight = []
+    for i, size in enumerate(sizes):
+      var_name = "%s_shard_%d" % (name, i)
+      var_init = make_wt_initializer(i, size)
+      sharded_weight.append(
+          variable_scope.variable(
+              var_init, dtype=dtypes.float32, name=var_name))
 
-      return sharded_weight
+    return sharded_weight
 
   @staticmethod
   def _create_gramian(n_components, name):
-- 
GitLab


From ff28cfe18d69657cafcddadff6a36eb040c0cd7d Mon Sep 17 00:00:00 2001
From: Frank Chen <frankchn@google.com>
Date: Thu, 31 May 2018 12:38:35 -0700
Subject: [PATCH 109/610] Fix links in the TensorFlow Security Advisories

PiperOrigin-RevId: 198762795
---
 tensorflow/security/advisory/tfsa-2018-001.md |  4 ++--
 tensorflow/security/advisory/tfsa-2018-002.md |  2 +-
 tensorflow/security/advisory/tfsa-2018-003.md |  4 ++--
 tensorflow/security/advisory/tfsa-2018-004.md |  2 +-
 tensorflow/security/advisory/tfsa-2018-005.md |  2 +-
 tensorflow/security/advisory/tfsa-2018-006.md |  2 +-
 tensorflow/security/index.md                  | 12 ++++++------
 7 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/tensorflow/security/advisory/tfsa-2018-001.md b/tensorflow/security/advisory/tfsa-2018-001.md
index e62757fb5f..bb97543a21 100644
--- a/tensorflow/security/advisory/tfsa-2018-001.md
+++ b/tensorflow/security/advisory/tfsa-2018-001.md
@@ -21,8 +21,8 @@ TensorFlow 1.3.0, 1.3.1, 1.4.0, 1.4.1, 1.5.0, 1.5.1, 1.6.0
 
 ### Mitigation
 
-We have patched the vulnerability in GitHub commits
-[https://github.com/tensorflow/tensorflow/commit/49f73c55d56edffebde4bca4a407ad69c1cae4333c55](49f73c55).
+We have patched the vulnerability in GitHub commit
+[49f73c55](https://github.com/tensorflow/tensorflow/commit/49f73c55d56edffebde4bca4a407ad69c1cae4333c55).
 If users are running TensorFlow in production or on untrusted data, they are
 encouraged to apply this patch.
 
diff --git a/tensorflow/security/advisory/tfsa-2018-002.md b/tensorflow/security/advisory/tfsa-2018-002.md
index baf3fb418e..fad7fdd40f 100644
--- a/tensorflow/security/advisory/tfsa-2018-002.md
+++ b/tensorflow/security/advisory/tfsa-2018-002.md
@@ -21,7 +21,7 @@ TensorFlow 1.0.0, 1.0.1, 1.1.0, 1.2.0, 1.2.1, 1.3.0, 1.3.1, 1 1.4.1, 1.5.0, 1.5.
 ### Mitigation
 
 We have patched the vulnerability in GitHub commit
-[https://github.com/tensorflow/tensorflow/commit/c48431588e7cf8aff61d4c299231e3e925144df8](c4843158).
+[c4843158](https://github.com/tensorflow/tensorflow/commit/c48431588e7cf8aff61d4c299231e3e925144df8).
 If users are running TensorFlow in production or on untrusted data, they are
 encouraged to apply this patch.
 
diff --git a/tensorflow/security/advisory/tfsa-2018-003.md b/tensorflow/security/advisory/tfsa-2018-003.md
index e20e358f29..747d37064c 100644
--- a/tensorflow/security/advisory/tfsa-2018-003.md
+++ b/tensorflow/security/advisory/tfsa-2018-003.md
@@ -35,8 +35,8 @@ TensorFlow 1.5.0, 1.5.1, 1.6.0, 1.7.0
 
 ### Mitigation
 
-We have patched the vulnerability in GitHub commits [https://github.com/tensorflow/tensorflow/commit/41335abb46f80ca644b5738550daef6136ba5476](41335abb) and
-[https://github.com/tensorflow/tensorflow/commit/41335abb46f80ca644b5738550daef6136ba5476](41335abb) and
+We have patched the vulnerability in GitHub commits [41335abb](https://github.com/tensorflow/tensorflow/commit/41335abb46f80ca644b5738550daef6136ba5476) and
+[8badd11d](https://github.com/tensorflow/tensorflow/commit/8badd11d875a826bd318ed439909d5c47a7fb811).
 If users are running the TensorFlow TFLite TOCO compiler in production or on
 untrusted data, they are encouraged to apply this patch.
 
diff --git a/tensorflow/security/advisory/tfsa-2018-004.md b/tensorflow/security/advisory/tfsa-2018-004.md
index d172247288..3af28defa1 100644
--- a/tensorflow/security/advisory/tfsa-2018-004.md
+++ b/tensorflow/security/advisory/tfsa-2018-004.md
@@ -22,7 +22,7 @@ TensorFlow 1.0.0, 1.0.1, 1.1.0, 1.2.0, 1.2.1, 1.3.0, 1.3.1, 1.4.0, 1.4.1, 1.5.0,
 ### Mitigation
 
 We have patched the vulnerability in GitHub commit
-[https://github.com/tensorflow/tensorflow/commit/d107fee1e4a9a4462f01564798d345802acc2aef](d107fee1).
+[d107fee1](https://github.com/tensorflow/tensorflow/commit/d107fee1e4a9a4462f01564798d345802acc2aef).
 If users are running TensorFlow on untrusted meta checkpoints, such as those
 downloaded from the Internet, in production or on untrusted data, they are
 encouraged to apply this patch.
diff --git a/tensorflow/security/advisory/tfsa-2018-005.md b/tensorflow/security/advisory/tfsa-2018-005.md
index 1c91567db5..c0f339fd97 100644
--- a/tensorflow/security/advisory/tfsa-2018-005.md
+++ b/tensorflow/security/advisory/tfsa-2018-005.md
@@ -22,7 +22,7 @@ TensorFlow 1.1.0, 1.2.0, 1.2.1, 1.3.0, 1.3.1, 1.4.0, 1.4.1, 1.5.0, 1.5.1, 1.6.0,
 ### Mitigation
 
 We have patched the vulnerability in GitHub commit
-[https://github.com/tensorflow/tensorflow/commit/dfa9921e6343727b05f42f8d4a918b19528ff994](dfa9921e) 
+[dfa9921e](https://github.com/tensorflow/tensorflow/commit/dfa9921e6343727b05f42f8d4a918b19528ff994)
 by upgrading the version of the snappy library used by TensorFlow to v1.1.7.
 
 If users are loading untrusted checkpoints in TensorFlow, we encourage users to
diff --git a/tensorflow/security/advisory/tfsa-2018-006.md b/tensorflow/security/advisory/tfsa-2018-006.md
index a1d1a9f3d1..17f514d8d2 100644
--- a/tensorflow/security/advisory/tfsa-2018-006.md
+++ b/tensorflow/security/advisory/tfsa-2018-006.md
@@ -21,7 +21,7 @@ TensorFlow 1.1.0, 1.2.0, 1.2.1, 1.3.0, 1.3.1, 1.4.0, 1.4.1, 1.5.0, 1.5.1, 1.6.0,
 ### Mitigation
 
 We have patched the vulnerability in GitHub commit
-[https://github.com/tensorflow/tensorflow/commit/c89ab82a82585cdaa90bf4911980e9e845909e78](c89ab82a).
+[c89ab82a](https://github.com/tensorflow/tensorflow/commit/c89ab82a82585cdaa90bf4911980e9e845909e78).
 
 If users are loading untrusted configurations in TensorFlow, we encourage users
 to apply the patch to upgrade snappy or upgrade the version of TensorFlow they
diff --git a/tensorflow/security/index.md b/tensorflow/security/index.md
index c1f9f1da74..44f51ad07b 100644
--- a/tensorflow/security/index.md
+++ b/tensorflow/security/index.md
@@ -8,11 +8,11 @@ in [https://github.com/tensorflow/tensorflow/blob/master/SECURITY.md](SECURITY.m
 
 | Advisory Number | Type               | Versions affected | Reported by           | Additional Information      |
 |-----------------|--------------------|:-----------------:|-----------------------|-----------------------------|
-| TFSA-2018-006   | Crafted Configuration File results in Invalid Memory Access | <= 1.7 | Blade Team of Tencent |  |
-| TFSA-2018-005   | Old Snappy Library Usage Resulting in Memcpy Parameter Overlap | <= 1.7 | Blade Team of Tencent |  |
-| TFSA-2018-004   | Checkpoint Meta File Out-of-Bounds Read | <= 1.7 | Blade Team of Tencent |  |
-| TFSA-2018-003   | TensorFlow Lite TOCO FlatBuffer Parsing Vulnerability | <= 1.7 | Blade Team of Tencent |  |
-| TFSA-2018-002   | GIF File Parsing Null Pointer Dereference Error | <= 1.5 | Blade Team of Tencent |  |
-| TFSA-2018-001   | BMP File Parser Out-of-bounds Read | <= 1.6 | Blade Team of Tencent |  |
+| [TFSA-2018-006](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/security/advisory/tfsa-2018-006.md)   | Crafted Configuration File results in Invalid Memory Access | <= 1.7 | Blade Team of Tencent |  |
+| [TFSA-2018-005](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/security/advisory/tfsa-2018-005.md)   | Old Snappy Library Usage Resulting in Memcpy Parameter Overlap | <= 1.7 | Blade Team of Tencent |  |
+| [TFSA-2018-004](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/security/advisory/tfsa-2018-004.md)   | Checkpoint Meta File Out-of-Bounds Read | <= 1.7 | Blade Team of Tencent |  |
+| [TFSA-2018-003](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/security/advisory/tfsa-2018-003.md)   | TensorFlow Lite TOCO FlatBuffer Parsing Vulnerability | <= 1.7 | Blade Team of Tencent |  |
+| [TFSA-2018-002](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/security/advisory/tfsa-2018-002.md)   | GIF File Parsing Null Pointer Dereference Error | <= 1.5 | Blade Team of Tencent |  |
+| [TFSA-2018-001](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/security/advisory/tfsa-2018-001.md)   | BMP File Parser Out-of-bounds Read | <= 1.6 | Blade Team of Tencent |  |
 | -               | Out Of Bounds Read |             <=1.4 | Blade Team of Tencent | [issue report](https://github.com/tensorflow/tensorflow/issues/14959) |
 
-- 
GitLab


From eebbcaf554fb89059054936491763fde9cf9513d Mon Sep 17 00:00:00 2001
From: Shashi Shekhar <shashishekhar@google.com>
Date: Thu, 31 May 2018 13:10:07 -0700
Subject: [PATCH 110/610] Add profiling statistics to benchmark.

PiperOrigin-RevId: 198767297
---
 tensorflow/contrib/lite/profiling/BUILD       |   7 +
 .../contrib/lite/profiling/profile_buffer.h   |  12 +-
 tensorflow/contrib/lite/profiling/time.cc     |  29 +
 tensorflow/contrib/lite/profiling/time.h      |  27 +
 tensorflow/contrib/lite/tools/BUILD           |  75 ++-
 .../contrib/lite/tools/benchmark_main.cc      |  37 ++
 .../contrib/lite/tools/benchmark_model.cc     | 518 +++---------------
 .../contrib/lite/tools/benchmark_model.h      | 161 ++++++
 .../lite/tools/benchmark_tflite_model.cc      | 352 ++++++++++++
 .../lite/tools/benchmark_tflite_model.h       |  90 +++
 .../contrib/lite/tools/command_line_flags.cc  | 189 +++++++
 .../contrib/lite/tools/command_line_flags.h   | 112 ++++
 .../lite/tools/command_line_flags_test.cc     | 153 ++++++
 tensorflow/contrib/lite/tools/logging.h       |  75 +++
 tensorflow/core/BUILD                         |   7 +-
 15 files changed, 1396 insertions(+), 448 deletions(-)
 create mode 100644 tensorflow/contrib/lite/profiling/time.cc
 create mode 100644 tensorflow/contrib/lite/profiling/time.h
 create mode 100644 tensorflow/contrib/lite/tools/benchmark_main.cc
 create mode 100644 tensorflow/contrib/lite/tools/benchmark_model.h
 create mode 100644 tensorflow/contrib/lite/tools/benchmark_tflite_model.cc
 create mode 100644 tensorflow/contrib/lite/tools/benchmark_tflite_model.h
 create mode 100644 tensorflow/contrib/lite/tools/command_line_flags.cc
 create mode 100644 tensorflow/contrib/lite/tools/command_line_flags.h
 create mode 100644 tensorflow/contrib/lite/tools/command_line_flags_test.cc
 create mode 100644 tensorflow/contrib/lite/tools/logging.h

diff --git a/tensorflow/contrib/lite/profiling/BUILD b/tensorflow/contrib/lite/profiling/BUILD
index c86be65ca7..c31189f2b1 100644
--- a/tensorflow/contrib/lite/profiling/BUILD
+++ b/tensorflow/contrib/lite/profiling/BUILD
@@ -29,6 +29,13 @@ cc_library(
     name = "profile_buffer",
     hdrs = ["profile_buffer.h"],
     copts = common_copts,
+    deps = [":time"],
+)
+
+cc_library(
+    name = "time",
+    srcs = ["time.cc"],
+    hdrs = ["time.h"],
 )
 
 cc_library(
diff --git a/tensorflow/contrib/lite/profiling/profile_buffer.h b/tensorflow/contrib/lite/profiling/profile_buffer.h
index 299b2a9cad..65d86dce47 100644
--- a/tensorflow/contrib/lite/profiling/profile_buffer.h
+++ b/tensorflow/contrib/lite/profiling/profile_buffer.h
@@ -18,6 +18,8 @@ limitations under the License.
 #include <cstddef>
 #include <cstdint>
 
+#include "tensorflow/contrib/lite/profiling/time.h"
+
 namespace tflite {
 namespace profiling {
 
@@ -74,7 +76,7 @@ class ProfileBuffer {
     if (!enabled_) {
       return kInvalidEventHandle;
     }
-    uint64_t timestamp = NowMicros();
+    uint64_t timestamp = time::NowMicros();
     int index = current_index_ % event_buffer_.size();
     event_buffer_[index].tag = tag;
     event_buffer_[index].event_type = event_type;
@@ -103,7 +105,7 @@ class ProfileBuffer {
     }
 
     int event_index = event_handle % max_size;
-    event_buffer_[event_index].end_timestamp_us = NowMicros();
+    event_buffer_[event_index].end_timestamp_us = time::NowMicros();
   }
 
   // Returns the size of the buffer.
@@ -134,12 +136,6 @@ class ProfileBuffer {
   }
 
  private:
-  static uint64_t NowMicros() {
-    // TODO(shashishekhar): Refactor this to a separate file.
-    struct timeval tv;
-    gettimeofday(&tv, nullptr);
-    return static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
-  }
   bool enabled_;
   uint32_t current_index_;
   std::vector<ProfileEvent> event_buffer_;
diff --git a/tensorflow/contrib/lite/profiling/time.cc b/tensorflow/contrib/lite/profiling/time.cc
new file mode 100644
index 0000000000..446660bb74
--- /dev/null
+++ b/tensorflow/contrib/lite/profiling/time.cc
@@ -0,0 +1,29 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/profiling/time.h"
+
+#include <sys/time.h>
+
+namespace tflite {
+namespace profiling {
+namespace time {
+uint64_t NowMicros() {
+  struct timeval tv;
+  gettimeofday(&tv, nullptr);
+  return static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
+}
+}  // namespace time
+}  // namespace profiling
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/profiling/time.h b/tensorflow/contrib/lite/profiling/time.h
new file mode 100644
index 0000000000..cc2ec319b8
--- /dev/null
+++ b/tensorflow/contrib/lite/profiling/time.h
@@ -0,0 +1,27 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CONTRIB_LITE_PROFILING_TIME_H_
+#define TENSORFLOW_CONTRIB_LITE_PROFILING_TIME_H_
+
+#include <cstdint>
+
+namespace tflite {
+namespace profiling {
+namespace time {
+uint64_t NowMicros();
+}  // namespace time
+}  // namespace profiling
+}  // namespace tflite
+#endif  // TENSORFLOW_CONTRIB_LITE_PROFILING_TIME_H_
diff --git a/tensorflow/contrib/lite/tools/BUILD b/tensorflow/contrib/lite/tools/BUILD
index 824a164651..7fb7517600 100644
--- a/tensorflow/contrib/lite/tools/BUILD
+++ b/tensorflow/contrib/lite/tools/BUILD
@@ -7,6 +7,8 @@ licenses(["notice"])  # Apache 2.0
 load("//tensorflow/contrib/lite:special_rules.bzl", "tflite_portable_test_suite")
 load("//tensorflow:tensorflow.bzl", "tf_cc_binary")
 
+common_copts = ["-Wall"]
+
 py_binary(
     name = "visualize",
     srcs = ["visualize.py"],
@@ -30,7 +32,11 @@ tf_cc_binary(
 
 tf_cc_binary(
     name = "benchmark_model",
-    srcs = ["benchmark_model.cc"],
+    srcs = [
+        "benchmark_main.cc",
+        "logging.h",
+    ],
+    copts = common_copts,
     linkopts = select({
         "//tensorflow:android": [
             "-pie",
@@ -42,18 +48,67 @@ tf_cc_binary(
         "//conditions:default": [],
     }),
     deps = [
+        ":benchmark_tflite_model_lib",
+        "//tensorflow/core:stats_calculator_portable",
+    ],
+)
+
+cc_library(
+    name = "command_line_flags",
+    srcs = ["command_line_flags.cc"],
+    hdrs = ["command_line_flags.h"],
+    copts = common_copts,
+    visibility = ["//visibility:private"],
+)
+
+cc_test(
+    name = "command_line_flags_test",
+    srcs = ["command_line_flags_test.cc"],
+    copts = common_copts,
+    visibility = ["//visibility:private"],
+    deps = [
+        ":command_line_flags",
+        "//tensorflow/contrib/lite/testing:util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+cc_library(
+    name = "benchmark_tflite_model_lib",
+    srcs = [
+        "benchmark_tflite_model.cc",
+        "logging.h",
+    ],
+    hdrs = ["benchmark_tflite_model.h"],
+    copts = common_copts,
+    deps = [
+        ":benchmark_model_lib",
         "//tensorflow/contrib/lite:framework",
         "//tensorflow/contrib/lite:string_util",
         "//tensorflow/contrib/lite/kernels:builtin_ops",
-    ] + select({
-        "//tensorflow:android": [
-            "//tensorflow/core:android_tensorflow_lib",
-        ],
-        "//conditions:default": [
-            "//tensorflow/core:framework_internal",
-            "//tensorflow/core:lib",
-        ],
-    }),
+        "//tensorflow/contrib/lite/profiling:profile_summarizer",
+        "//tensorflow/contrib/lite/profiling:profiler",
+    ],
+)
+
+cc_library(
+    name = "benchmark_model_lib",
+    srcs = [
+        "benchmark_model.cc",
+        "logging.h",
+    ],
+    hdrs = ["benchmark_model.h"],
+    copts = common_copts,
+    deps = [
+        ":command_line_flags",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite:string_util",
+        "//tensorflow/contrib/lite/kernels:builtin_ops",
+        "//tensorflow/contrib/lite/profiling:profile_summarizer",
+        "//tensorflow/contrib/lite/profiling:profiler",
+        "//tensorflow/contrib/lite/profiling:time",
+        "//tensorflow/core:stats_calculator_portable",
+    ],
 )
 
 cc_library(
diff --git a/tensorflow/contrib/lite/tools/benchmark_main.cc b/tensorflow/contrib/lite/tools/benchmark_main.cc
new file mode 100644
index 0000000000..1325385e32
--- /dev/null
+++ b/tensorflow/contrib/lite/tools/benchmark_main.cc
@@ -0,0 +1,37 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/lite/tools/benchmark_tflite_model.h"
+#include "tensorflow/contrib/lite/tools/logging.h"
+
+namespace tflite {
+namespace benchmark {
+
+int Main(int argc, char** argv) {
+#ifdef TFLITE_CUSTOM_OPS_HEADER
+  TFLITE_LOG(INFO) << "STARTING with custom ops!";
+#else
+  TFLITE_LOG(INFO) << "STARTING!";
+#endif
+  BenchmarkTfLiteModel benchmark;
+  BenchmarkLoggingListener listener;
+  benchmark.AddListener(&listener);
+  benchmark.Run(argc, argv);
+  return 0;
+}
+}  // namespace benchmark
+}  // namespace tflite
+
+int main(int argc, char** argv) { return tflite::benchmark::Main(argc, argv); }
diff --git a/tensorflow/contrib/lite/tools/benchmark_model.cc b/tensorflow/contrib/lite/tools/benchmark_model.cc
index 869c531b3e..550994c662 100644
--- a/tensorflow/contrib/lite/tools/benchmark_model.cc
+++ b/tensorflow/contrib/lite/tools/benchmark_model.cc
@@ -13,463 +13,127 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include <cstdarg>
-#include <cstdlib>
-#include <iostream>
-#include <memory>
-#include <string>
-#include <unordered_set>
-#include <vector>
-
-#include "tensorflow/contrib/lite/kernels/register.h"
-#include "tensorflow/contrib/lite/model.h"
-#include "tensorflow/contrib/lite/op_resolver.h"
-#include "tensorflow/contrib/lite/string_util.h"
-#include "tensorflow/core/lib/strings/str_util.h"
-#include "tensorflow/core/platform/env.h"
-#include "tensorflow/core/platform/init_main.h"
-#include "tensorflow/core/platform/logging.h"
-#include "tensorflow/core/util/command_line_flags.h"
-
-#ifdef TFLITE_CUSTOM_OPS_HEADER
-void RegisterSelectedOps(::tflite::MutableOpResolver* resolver);
-#endif
-
-namespace tflite {
-
-using ::tensorflow::Env;
-using ::tensorflow::str_util::Split;
-using ::tensorflow::str_util::SplitAndParseAsFloats;
-using ::tensorflow::str_util::SplitAndParseAsInts;
-
-struct InputLayerInfo {
-  string name;
-  TfLiteType data_type;
-  std::vector<int> shape;
-  // Note that initialization_values is currently unused.
-  std::vector<float> initialization_values;
-};
-
-template <typename T>
-void FillRandomValue(T* ptr, const std::vector<int>& sizes,
-                     const std::function<T()>& random_func) {
-  int num_elements = 1;
-  for (int dim : sizes) {
-    num_elements *= dim;
-  }
-  for (int i = 0; i < num_elements; ++i) {
-    *ptr++ = random_func();
-  }
-}
-
-void FillRandomString(tflite::DynamicBuffer* buffer,
-                      const std::vector<int>& sizes,
-                      const std::function<string()>& random_func) {
-  int num_elements = 1;
-  for (int dim : sizes) {
-    num_elements *= dim;
-  }
-  for (int i = 0; i < num_elements; ++i) {
-    auto str = random_func();
-    buffer->AddString(str.data(), str.length());
-  }
-}
-
-TfLiteType TfLiteTypeFromString(const string& input_layer_type) {
-  if (input_layer_type == "string")
-    return kTfLiteString;
-  else if (input_layer_type == "float")
-    return kTfLiteFloat32;
-  else if (input_layer_type == "uint8")
-    return kTfLiteUInt8;
-  else if (input_layer_type == "int32")
-    return kTfLiteInt32;
-  else if (input_layer_type == "int64")
-    return kTfLiteInt64;
-  else
-    return kTfLiteNoType;
-}
-
-std::vector<int> ShapeFromTfLiteTensor(TfLiteTensor* t) {
-  std::vector<int> result;
-  result.reserve(t->dims->size);
-  for (int i = 0; i < t->dims->size; ++i) {
-    result.push_back(t->dims->data[i]);
-  }
-  CHECK(!result.empty()) << "Found no shapes in model";
-  return result;
-}
-
-bool CreateInterpreter(const string& graph,
-                       std::unique_ptr<FlatBufferModel>* model,
-                       std::unique_ptr<Interpreter>* interpreter) {
-  *model = tflite::FlatBufferModel::BuildFromFile(graph.c_str());
-  if (!model) {
-    std::cerr << "Failed to load model " << graph << std::endl;
-    return false;
-  }
-
-#ifdef TFLITE_CUSTOM_OPS_HEADER
-  tflite::MutableOpResolver resolver;
-  RegisterSelectedOps(&resolver);
-#else
-  tflite::ops::builtin::BuiltinOpResolver resolver;
-#endif
-
-  tflite::InterpreterBuilder(*(model->get()), resolver)(interpreter);
-  if (!(*interpreter)) {
-    std::cerr << "Failed to construct interpreter" << std::endl;
-    return false;
-  }
-
-  return true;
-}
-
-bool PrepareInterpreter(const std::vector<InputLayerInfo> inputs,
-                        int num_threads, bool use_nnapi,
-                        Interpreter* interpreter) {
-  if (num_threads != -1) {
-    interpreter->SetNumThreads(num_threads);
-  }
-
-  interpreter->UseNNAPI(use_nnapi);
-
-  // Check that all names and types match
-  for (const InputLayerInfo& input : inputs) {
-    for (int i : interpreter->inputs()) {
-      TfLiteTensor* t = interpreter->tensor(i);
-      CHECK_EQ(t->name, input.name)
-          << "Tensor # " << i << " is named " << t->name
-          << " but flags call it " << input.name;
-      CHECK_EQ(t->type, input.data_type)
-          << "Could not match the type of input tensor " << t->name;
-    }
-  }
-
-  // Resize all non-string tensors.
-  for (const InputLayerInfo& input : inputs) {
-    for (int i : interpreter->inputs()) {
-      TfLiteTensor* t = interpreter->tensor(i);
-      if (t->type != kTfLiteString) {
-        interpreter->ResizeInputTensor(i, input.shape);
-      }
-    }
-  }
-
-  if (interpreter->AllocateTensors() != kTfLiteOk) {
-    std::cerr << "Failed to allocate tensors!" << std::endl;
-    return false;
-  }
-
-  // Set the values of the input tensors.
-  for (int i : interpreter->inputs()) {
-    TfLiteTensor* t = interpreter->tensor(i);
-    std::vector<int> sizes = ShapeFromTfLiteTensor(t);
-
-    // TODO(ahentz): below we ignore the O-th dimension (number of batches).
-    if (t->type == kTfLiteFloat32) {
-      FillRandomValue<float>(
-          interpreter->typed_tensor<float>(i),
-          std::vector<int>(sizes.begin() + 1, sizes.end()),
-          []() { return static_cast<float>(rand()) / RAND_MAX - 0.5f; });
-    } else if (t->type == kTfLiteUInt8) {
-      FillRandomValue<uint8_t>(
-          interpreter->typed_tensor<uint8_t>(i),
-          std::vector<int>(sizes.begin() + 1, sizes.end()),
-          []() { return static_cast<uint8_t>(rand()) % 255; });
-    } else if (t->type == kTfLiteString) {
-      tflite::DynamicBuffer buffer;
-      FillRandomString(&buffer, sizes, []() {
-        return "we're have some friends over saturday to hang out in the yard";
-      });
-      buffer.WriteToTensor(interpreter->tensor(i));
-    } else {
-      std::cerr << "Don't know how to populate tensor " << t->name
-                << " of type " << t->type << std::endl;
-      return false;
-    }
-  }
-  return true;
-}
-
-bool PopulateInputLayerInfo(const string& names_string,
-                            const string& shapes_string,
-                            const string& types_string,
-                            const string& values_string,
-                            std::vector<InputLayerInfo>* info) {
-  std::vector<string> names = Split(names_string, ',');
-  std::vector<string> shapes = Split(shapes_string, ':');
-  std::vector<string> types = Split(types_string, ',');
-  std::vector<string> values = Split(values_string, ':');
-
-  if (names.size() != shapes.size()) {
-    LOG(ERROR) << "The number of items in"
-               << " --input_layer_shape (" << shapes_string << ", with "
-               << shapes.size() << " items)"
-               << " must match the number of items in"
-               << " --input_layer (" << names_string << ", with "
-               << names.size() << " items)."
-               << " For example --input_layer=input1,input2"
-               << " --input_layer_shape=1,224,224,4:1,20";
-    return false;
-  }
-  if (names.size() != types.size()) {
-    LOG(ERROR) << "The number of items in"
-               << " --input_layer_type (" << types_string << ", with "
-               << types.size() << " items)"
-               << " must match the number of items in"
-               << " --input_layer (" << names_string << ", with "
-               << names.size() << " items)."
-               << " For example --input_layer=input1,input2"
-               << " --input_layer_type=float,int";
-    return false;
-  }
-
-  for (int i = 0; i < names.size(); ++i) {
-    info->push_back(InputLayerInfo());
-    InputLayerInfo& input = info->back();
+#include "tensorflow/contrib/lite/tools/benchmark_model.h"
 
-    input.name = names[i];
+#include <time.h>
 
-    input.data_type = TfLiteTypeFromString(types[i]);
-    CHECK(input.data_type != kTfLiteNoType)
-        << types[i] << " was an invalid type";
-
-    CHECK(SplitAndParseAsInts(shapes[i], ',', &input.shape))
-        << "Incorrect size string specified: " << shapes[i];
-    for (int dim : input.shape) {
-      if (dim == -1) {
-        LOG(ERROR) << "Any unknown sizes in the shapes (-1's) must be replaced"
-                   << " with the size you want to benchmark with.";
-        return false;
-      }
-    }
-
-    if (i < values.size()) {
-      CHECK(SplitAndParseAsFloats(values[i], ',', &input.initialization_values))
-          << "Incorrect initialization values string specified: " << values[i];
-    }
-  }
-
-  return true;
-}
-
-bool RunBenchmark(Interpreter* interpreter, int64_t* inference_time_us) {
-  const int64_t start_time = Env::Default()->NowMicros();
-
-  if (interpreter->Invoke() != kTfLiteOk) {
-    std::cerr << "Failed to invoke!";
-    return false;
-  }
-
-  const int64_t end_time = Env::Default()->NowMicros();
-  *inference_time_us = end_time - start_time;
-  return true;
-}
-
-class Latencies {
- public:
-  void AddMeasurement(int64_t time_us) {
-    max_ = std::max(time_us, max_);
-    min_ = std::min(time_us, min_);
-    ++count_;
-    sum_ += time_us;
-    squared_sum_ += static_cast<double>(time_us) * time_us;
-  }
-
-  double avg() const {
-    if (count_ == 0) return std::numeric_limits<int64_t>::quiet_NaN();
-    return static_cast<double>(sum_) / count_;
-  }
+#include <iostream>
+#include <sstream>
 
-  int64_t std_deviation() const {
-    if (count_ == 0 || min_ == max_) return 0;
-    return sqrt(squared_sum_ / count_ - avg() * avg());
-  }
+#include "tensorflow/contrib/lite/profiling/time.h"
+#include "tensorflow/contrib/lite/tools/logging.h"
 
-  void OutputToStream(std::ostream* stream) const {
-    *stream << "count=" << count_;
-    if (count_ == 0) return;
-    *stream << " min=" << min_ << " max=" << max_;
-    *stream << " avg=" << avg() << " std=" << std_deviation();
+namespace {
+void SleepForSeconds(double sleep_seconds) {
+  if (sleep_seconds <= 0.0) {
+    return;
   }
-
- private:
-  int64_t count_ = 0;
-  int64_t min_ = std::numeric_limits<int64_t>::max();
-  int64_t max_ = std::numeric_limits<int64_t>::min();
-  int64_t sum_ = 0;
-  double squared_sum_ = 0;
-};
-
-bool TimeMultipleRuns(Interpreter* interpreter, double sleep_seconds,
-                      int num_runs, int64* total_time_us) {
   // Convert the run_delay string into a timespec.
   timespec req;
   req.tv_sec = static_cast<time_t>(sleep_seconds);
   req.tv_nsec = (sleep_seconds - req.tv_sec) * 1000000000;
-
-  *total_time_us = 0;
-
-  std::cout << "Running benchmark for " << num_runs
-            << " iterations: " << std::endl;
-
-  Latencies latencies;
-  for (int i = 0; i < num_runs; ++i) {
-    int64_t time_us;
-    bool run_status = RunBenchmark(interpreter, &time_us);
-    latencies.AddMeasurement(time_us);
-    *total_time_us += time_us;
-    if (!run_status) {
-      std::cout << "Failed on run " << i << std::endl;
-      return false;
-    }
-
-    // If requested, sleep between runs for an arbitrary amount of time.
-    // This can be helpful to determine the effect of mobile processor
-    // scaling and thermal throttling.
-    if (sleep_seconds > 0.0) {
+  // If requested, sleep between runs for an arbitrary amount of time.
+  // This can be helpful to determine the effect of mobile processor
+  // scaling and thermal throttling.
 #ifdef PLATFORM_WINDOWS
-      Sleep(sleep_seconds * 1000);
+  Sleep(sleep_seconds * 1000);
 #else
-      nanosleep(&req, nullptr);
+  nanosleep(&req, nullptr);
 #endif
-    }
-  }
-  latencies.OutputToStream(&std::cout);
-  std::cout << std::endl;
-
-  return true;
 }
 
-int Main(int argc, char** argv) {
-  using tensorflow::Flag;
-  using tensorflow::Flags;
+}  // namespace
 
-  string graph;               // e.g.: /data/local/tmp/tfl_inception-v1_model.fb
-  string input_layer_string;  // e.g.: input
-  string input_layer_shape_string;  // e.g.: 1,224,224,3
-  string input_layer_type_string;   // e.g.: float
-  string input_layer_values_string;
-  string output_layer_string;  // e.g.: output
-  int num_runs = 50;
-  string run_delay = "-1.0";
-  int num_threads = 1;
-  string benchmark_name = "";
-  string output_prefix = "";
-  int warmup_runs = 1;
-  bool use_nnapi = false;
+namespace tflite {
+namespace benchmark {
+using tensorflow::Stat;
+
+void BenchmarkLoggingListener::OnBenchmarkEnd(const BenchmarkResults &results) {
+  auto inference_us = results.inference_time_us();
+  auto init_us = results.startup_latency_us();
+  auto warmup_us = results.warmup_time_us();
+  TFLITE_LOG(INFO) << "Average inference timings in us: "
+                   << "Warmup: " << warmup_us.avg() << ", "
+                   << "Init: " << init_us << ", "
+                   << "no stats: " << inference_us.avg();
+}
 
-  std::vector<Flag> flag_list = {
-      Flag("graph", &graph, "graph file name"),
-      // All the following flags are optional, but can be used in order
-      // to benchmark different input shapes.
-      Flag("input_layer", &input_layer_string, "input layer names"),
-      Flag("input_layer_shape", &input_layer_shape_string, "input layer shape"),
-      Flag("input_layer_type", &input_layer_type_string, "input layer type"),
-      Flag("input_layer_values", &input_layer_values_string,
-           "values to initialize the inputs with"),
-      Flag("output_layer", &output_layer_string, "output layer name"),
-      Flag("num_runs", &num_runs, "number of runs"),
-      Flag("run_delay", &run_delay, "delay between runs in seconds"),
-      Flag("num_threads", &num_threads, "number of threads"),
-      Flag("benchmark_name", &benchmark_name, "benchmark name"),
-      Flag("output_prefix", &output_prefix, "benchmark output prefix"),
-      Flag("warmup_runs", &warmup_runs, "how many runs to initialize model"),
-      Flag("use_nnapi", &use_nnapi, "use nnapi api"),
+std::vector<Flag> BenchmarkModel::GetFlags() {
+  return {
+      Flag("num_runs", &params_.num_runs, "number of runs"),
+      Flag("run_delay", &params_.run_delay, "delay between runs in seconds"),
+      Flag("num_threads", &params_.num_threads, "number of threads"),
+      Flag("benchmark_name", &params_.benchmark_name, "benchmark name"),
+      Flag("output_prefix", &params_.output_prefix, "benchmark output prefix"),
+      Flag("warmup_runs", &params_.warmup_runs,
+           "how many runs to initialize model"),
   };
-  string usage = Flags::Usage(argv[0], flag_list);
-  const bool parse_result = Flags::Parse(&argc, argv, flag_list);
-  tensorflow::port::InitMain(argv[0], &argc, &argv);
+}
 
-  if (!parse_result) {
-    std::cerr << usage << std::endl;
-    return -1;
-  }
+void BenchmarkModel::LogFlags() {
+  TFLITE_LOG(INFO) << "Num runs: [" << params_.num_runs << "]";
+  TFLITE_LOG(INFO) << "Inter-run delay (seconds): [" << params_.run_delay
+                   << "]";
+  TFLITE_LOG(INFO) << "Num threads: [" << params_.num_threads << "]";
+  TFLITE_LOG(INFO) << "Benchmark name: [" << params_.benchmark_name << "]";
+  TFLITE_LOG(INFO) << "Output prefix: [" << params_.output_prefix << "]";
+  TFLITE_LOG(INFO) << "Warmup runs: [" << params_.warmup_runs << "]";
+}
 
-  std::cout << "Graph: [" << graph << "]" << std::endl;
-  if (!input_layer_string.empty()) {
-    std::cout << "Input layers: [" << input_layer_string << "]" << std::endl;
-    std::cout << "Input shapes: [" << input_layer_shape_string << "]"
-              << std::endl;
-    std::cout << "Input types: [" << input_layer_type_string << "]"
-              << std::endl;
-  }
-  if (!output_layer_string.empty()) {
-    std::cout << "Output layers: [" << output_layer_string << "]" << std::endl;
-  }
-  std::cout << "Num runs: [" << num_runs << "]" << std::endl;
-  std::cout << "Inter-run delay (seconds): [" << run_delay << "]" << std::endl;
-  std::cout << "Num threads: [" << num_threads << "]" << std::endl;
-  if (!benchmark_name.empty()) {
-    std::cout << "Benchmark name: [" << benchmark_name << "]" << std::endl;
-    std::cout << "Output prefix: [" << output_prefix << "]" << std::endl;
-  }
-  std::cout << "Warmup runs: [" << warmup_runs << "]" << std::endl;
-  std::cout << "Use nnapi : [" << use_nnapi << "]" << std::endl;
+Stat<int64_t> BenchmarkModel::Run(int num_times, RunType run_type) {
+  Stat<int64_t> run_stats;
+  TFLITE_LOG(INFO) << "Running benchmark for " << num_times << " iterations ";
+  for (int run = 0; run < num_times; run++) {
+    listeners_.OnSingleRunStart(run_type);
+    int64_t start_us = profiling::time::NowMicros();
+    RunImpl();
+    int64_t end_us = profiling::time::NowMicros();
+    listeners_.OnSingleRunEnd();
 
-  if (graph.empty()) {
-    std::cout
-        << "Please specify the name of your TF Lite input file with --graph"
-        << std::endl;
-    return -1;
+    run_stats.UpdateStat(end_us - start_us);
+    SleepForSeconds(params_.run_delay);
   }
 
-  std::vector<InputLayerInfo> inputs;
-  if (!PopulateInputLayerInfo(input_layer_string, input_layer_shape_string,
-                              input_layer_type_string,
-                              input_layer_values_string, &inputs)) {
-    return -1;
-  }
+  std::stringstream stream;
+  run_stats.OutputToStream(&stream);
+  TFLITE_LOG(INFO) << stream.str() << std::endl;
 
-  int64 initialization_start_us = Env::Default()->NowMicros();
+  return run_stats;
+}
 
-  std::unique_ptr<tflite::FlatBufferModel> model;
-  std::unique_ptr<tflite::Interpreter> interpreter;
-  if (!CreateInterpreter(graph, &model, &interpreter)) {
-    return -1;
+void BenchmarkModel::Run(int argc, char **argv) {
+  if (!ParseFlags(argc, argv)) {
+    return;
   }
-  if (!PrepareInterpreter(inputs, num_threads, use_nnapi, interpreter.get())) {
-    return -1;
-  }
-
-  int64 initialization_end_us = Env::Default()->NowMicros();
 
-  const double initialization_time_s =
-      (initialization_end_us - initialization_start_us) / 1000000.0f;
-  std::cout << "Initialized session in " << initialization_time_s << "s"
-            << std::endl;
+  LogFlags();
 
-  const double sleep_seconds = std::strtod(run_delay.c_str(), nullptr);
+  listeners_.OnBenchmarkStart(params_);
+  int64_t initialization_start_us = profiling::time::NowMicros();
+  Init();
+  int64_t initialization_end_us = profiling::time::NowMicros();
+  int64_t startup_latency_us = initialization_end_us - initialization_start_us;
+  TFLITE_LOG(INFO) << "Initialized session in " << startup_latency_us / 1e3
+                   << "ms";
 
-  // If requested, run through the graph first to preinitialize everything
-  // before the benchmarking runs.
-  int64 warmup_time_us = 0;
-  if (warmup_runs > 0) {
-    if (!TimeMultipleRuns(interpreter.get(), sleep_seconds, warmup_runs,
-                          &warmup_time_us)) {
-      std::cerr << "Warmup failed" << std::endl;
-      return -1;
-    }
-  }
+  uint64_t input_bytes = ComputeInputBytes();
+  Stat<int64_t> warmup_time_us = Run(params_.warmup_runs, WARMUP);
+  Stat<int64_t> inference_time_us = Run(params_.num_runs, REGULAR);
+  listeners_.OnBenchmarkEnd(
+      {startup_latency_us, input_bytes, warmup_time_us, inference_time_us});
+}
 
-  // Capture overall inference time without stat logging overhead. This is the
-  // timing data that can be compared to other libaries.
-  int64 no_stat_time_us = 0;
-  if (!TimeMultipleRuns(interpreter.get(), sleep_seconds, num_runs,
-                        &no_stat_time_us)) {
-    std::cerr << "Timing failed." << std::endl;
-    return -1;
+bool BenchmarkModel::ParseFlags(int argc, char **argv) {
+  auto flag_list = GetFlags();
+  const bool parse_result =
+      Flags::Parse(&argc, const_cast<const char **>(argv), flag_list);
+  if (!parse_result) {
+    std::string usage = Flags::Usage(argv[0], flag_list);
+    TFLITE_LOG(ERROR) << usage;
+    return false;
   }
-
-  std::cout << "Average inference timings in us: " << no_stat_time_us / num_runs
-            << " , Warmup: "
-            << (warmup_runs > 0 ? warmup_time_us / warmup_runs : 0) << ", "
-            << std::endl;
-
-  return 0;
+  return ValidateFlags();
 }
 
+}  // namespace benchmark
 }  // namespace tflite
-
-int main(int argc, char** argv) { return ::tflite::Main(argc, argv); }
diff --git a/tensorflow/contrib/lite/tools/benchmark_model.h b/tensorflow/contrib/lite/tools/benchmark_model.h
new file mode 100644
index 0000000000..ef8d6a7d1e
--- /dev/null
+++ b/tensorflow/contrib/lite/tools/benchmark_model.h
@@ -0,0 +1,161 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_LITE_TOOLS_BENCHMARK_MODEL_H_
+#define TENSORFLOW_CONTRIB_LITE_TOOLS_BENCHMARK_MODEL_H_
+
+#include <cmath>
+#include <limits>
+#include <ostream>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "tensorflow/contrib/lite/tools//command_line_flags.h"
+#include "tensorflow/core/util/stats_calculator.h"
+
+namespace tflite {
+namespace benchmark {
+
+enum RunType {
+  WARMUP,
+  REGULAR,
+};
+
+class BenchmarkResults {
+ public:
+  BenchmarkResults(int64_t startup_latency_us, uint64_t input_bytes,
+                   tensorflow::Stat<int64_t> warmup_time_us,
+                   tensorflow::Stat<int64_t> inference_time_us)
+      : startup_latency_us_(startup_latency_us),
+        input_bytes_(input_bytes),
+        warmup_time_us_(warmup_time_us),
+        inference_time_us_(inference_time_us) {}
+
+  tensorflow::Stat<int64_t> inference_time_us() const {
+    return inference_time_us_;
+  }
+  tensorflow::Stat<int64_t> warmup_time_us() const { return warmup_time_us_; }
+  int64_t startup_latency_us() const { return startup_latency_us_; }
+  uint64_t input_bytes() const { return input_bytes_; }
+  double throughput_MB_per_second() const {
+    double bytes_per_sec = (input_bytes_ * inference_time_us_.count() * 1e6) /
+                           inference_time_us_.sum();
+    return bytes_per_sec / (1024.0 * 1024.0);
+  }
+
+ private:
+  int64_t startup_latency_us_;
+  uint64_t input_bytes_;
+  tensorflow::Stat<int64_t> warmup_time_us_;
+  tensorflow::Stat<int64_t> inference_time_us_;
+};
+
+struct BenchmarkParams {
+  BenchmarkParams()
+      : num_runs(50), warmup_runs(1), run_delay(-1.0), num_threads(1) {}
+  int num_runs;
+  int warmup_runs;
+  float run_delay;
+  int num_threads;
+  std::string benchmark_name;
+  std::string output_prefix;
+};
+
+class BenchmarkListener {
+ public:
+  virtual void OnBenchmarkStart(const BenchmarkParams& params) {}
+  virtual void OnSingleRunStart(RunType runType) {}
+  virtual void OnSingleRunEnd() {}
+  virtual void OnBenchmarkEnd(const BenchmarkResults& results) {}
+  virtual ~BenchmarkListener() {}
+};
+
+// A listener that forwards its method calls to a collection of listeners.
+class BenchmarkListeners : public BenchmarkListener {
+ public:
+  // Adds a listener to the listener collection.
+  // |listener| is not owned by the instance of |BenchmarkListeners|.
+  // |listener| should not be null and should outlast the instance of
+  // |BenchmarkListeners|.
+  void AddListener(BenchmarkListener* listener) {
+    listeners_.push_back(listener);
+  }
+
+  void OnBenchmarkStart(const BenchmarkParams& params) override {
+    for (auto listener : listeners_) {
+      listener->OnBenchmarkStart(params);
+    }
+  }
+
+  void OnSingleRunStart(RunType runType) override {
+    for (auto listener : listeners_) {
+      listener->OnSingleRunStart(runType);
+    }
+  }
+
+  void OnSingleRunEnd() override {
+    for (auto listener : listeners_) {
+      listener->OnSingleRunEnd();
+    }
+  }
+
+  void OnBenchmarkEnd(const BenchmarkResults& results) override {
+    for (auto listener : listeners_) {
+      listener->OnBenchmarkEnd(results);
+    }
+  }
+
+  ~BenchmarkListeners() {}
+
+ private:
+  // Use vector so listeners are invoked in the order they are added.
+  std::vector<BenchmarkListener*> listeners_;
+};
+
+// Benchmark listener that just logs the results of a benchmark run.
+class BenchmarkLoggingListener : public BenchmarkListener {
+  void OnBenchmarkEnd(const BenchmarkResults& results) override;
+};
+
+// Benchmarks a model.
+//
+// Subclasses need to implement initialization and running of the model.
+// The results can be collected by adding BenchmarkListener(s).
+class BenchmarkModel {
+ public:
+  virtual ~BenchmarkModel() {}
+  bool ParseFlags(int argc, char** argv);
+  virtual void Init() = 0;
+  void Run(int argc, char** argv);
+  void AddListener(BenchmarkListener* listener) {
+    listeners_.AddListener(listener);
+  }
+
+ protected:
+  virtual void LogFlags();
+  virtual bool ValidateFlags() { return true; }
+  virtual std::vector<Flag> GetFlags();
+  virtual uint64_t ComputeInputBytes() = 0;
+  virtual tensorflow::Stat<int64_t> Run(int num_times, RunType run_type);
+  virtual void RunImpl() = 0;
+  BenchmarkParams params_;
+  BenchmarkListeners listeners_;
+};
+
+}  // namespace benchmark
+}  // namespace tflite
+
+#endif  // TENSORFLOW_CONTRIB_LITE_TOOLS_BENCHMARK_MODEL_H_
diff --git a/tensorflow/contrib/lite/tools/benchmark_tflite_model.cc b/tensorflow/contrib/lite/tools/benchmark_tflite_model.cc
new file mode 100644
index 0000000000..be8f46f599
--- /dev/null
+++ b/tensorflow/contrib/lite/tools/benchmark_tflite_model.cc
@@ -0,0 +1,352 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/lite/tools/benchmark_tflite_model.h"
+
+#include <cstdarg>
+#include <cstdlib>
+#include <iostream>
+#include <memory>
+#include <string>
+#include <unordered_set>
+#include <vector>
+
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/contrib/lite/op_resolver.h"
+#include "tensorflow/contrib/lite/string_util.h"
+#include "tensorflow/contrib/lite/tools/logging.h"
+
+#ifdef TFLITE_CUSTOM_OPS_HEADER
+void RegisterSelectedOps(::tflite::MutableOpResolver* resolver);
+#endif
+
+namespace tflite {
+namespace benchmark {
+
+void ProfilingListener::SetInterpreter(tflite::Interpreter* interpreter) {
+  TFLITE_BENCHMARK_CHECK(interpreter);
+  interpreter_ = interpreter;
+  interpreter_->SetProfiler(&profiler_);
+}
+
+void ProfilingListener::OnSingleRunStart(RunType run_type) {
+  if (run_type == REGULAR) {
+    profiler_.Reset();
+    profiler_.StartProfiling();
+  }
+}
+
+void ProfilingListener::OnBenchmarkEnd(const BenchmarkResults& results) {
+  if (has_profiles_) {
+    TFLITE_LOG(INFO) << summarizer_.GetOutputString();
+  }
+}
+
+void ProfilingListener::OnSingleRunEnd() {
+  profiler_.StopProfiling();
+  auto profile_events = profiler_.GetProfileEvents();
+  has_profiles_ = !profile_events.empty();
+  summarizer_.ProcessProfiles(profile_events, *interpreter_);
+}
+
+namespace {
+
+// Splits `str` at every `delim`; an empty `str` yields an empty vector.
+std::vector<std::string> Split(const std::string& str, const char delim) {
+  std::vector<std::string> pieces;
+  std::istringstream stream(str);
+  for (std::string piece; std::getline(stream, piece, delim);) {
+    pieces.push_back(piece);
+  }
+  return pieces;
+}
+
+// Parses `str` as a `delim`-separated list of T and appends each item to
+// `values`. Returns false as soon as an item or separator is malformed, which
+// includes an empty `str` and a trailing `delim`; items parsed before the
+// failure stay in `values`.
+template <typename T>
+bool SplitAndParse(const std::string& str, char delim, std::vector<T>* values) {
+  std::istringstream input(str);
+  bool first = true;
+  while (!input.eof()) {
+    if (!first) {
+      char c = '\0';
+      // A failed read leaves `c` untouched, so check the stream state too.
+      if (!(input >> c) || c != delim) return false;
+    }
+    first = false;
+    T val;
+    // Testing the extraction itself also rejects "ran into EOF before any
+    // value", which an eof()/good() check lets through unparsed.
+    if (!(input >> val)) return false;
+    values->push_back(val);
+  }
+  return true;
+}
+
+// Stores successive results of `random_func` at `ptr`, one per element of a
+// tensor shaped `sizes` (exactly one element when `sizes` is empty).
+template <typename T>
+void FillRandomValue(T* ptr, const std::vector<int>& sizes,
+                     const std::function<T()>& random_func) {
+  int total = 1;
+  for (const int extent : sizes) total *= extent;
+  for (T* out = ptr; out - ptr < total; ++out) {
+    *out = random_func();
+  }
+}
+
+void FillRandomString(tflite::DynamicBuffer* buffer,
+                      const std::vector<int>& sizes,
+                      const std::function<string()>& random_func) {
+  int num_elements = 1;
+  for (int dim : sizes) {
+    num_elements *= dim;
+  }
+  for (int i = 0; i < num_elements; ++i) {
+    auto str = random_func();
+    buffer->AddString(str.data(), str.length());
+  }
+}
+
+TfLiteType TfLiteTypeFromString(const string& input_layer_type) {
+  if (input_layer_type == "string")
+    return kTfLiteString;
+  else if (input_layer_type == "float")
+    return kTfLiteFloat32;
+  else if (input_layer_type == "uint8")
+    return kTfLiteUInt8;
+  else if (input_layer_type == "int32")
+    return kTfLiteInt32;
+  else if (input_layer_type == "int64")
+    return kTfLiteInt64;
+  else
+    return kTfLiteNoType;
+}
+
+// Builds one InputLayerInfo per --input_layer name from the raw flag strings
+// and appends it to `info`. Names/types are comma-separated; shapes/values are
+// colon-separated groups of comma-separated numbers. Returns false, after
+// logging, if the item counts disagree or a shape still contains -1.
+bool PopulateInputLayerInfo(
+    const string& names_string, const string& shapes_string,
+    const string& types_string, const string& values_string,
+    std::vector<BenchmarkTfLiteModel::InputLayerInfo>* info) {
+  std::vector<std::string> names = Split(names_string, ',');
+  std::vector<std::string> shapes = Split(shapes_string, ':');
+  std::vector<std::string> types = Split(types_string, ',');
+  std::vector<std::string> values = Split(values_string, ':');
+
+  if (names.size() != shapes.size()) {
+    TFLITE_LOG(ERROR) << "The number of items in"
+                      << " --input_layer_shape (" << shapes_string << ", with "
+                      << shapes.size() << " items)"
+                      << " must match the number of items in"
+                      << " --input_layer (" << names_string << ", with "
+                      << names.size() << " items)."
+                      << " For example --input_layer=input1,input2"
+                      << " --input_layer_shape=1,224,224,4:1,20";
+    return false;
+  }
+  if (names.size() != types.size()) {
+    TFLITE_LOG(ERROR) << "The number of items in"
+                      << " --input_layer_type (" << types_string << ", with "
+                      << types.size() << " items)"
+                      << " must match the number of items in"
+                      << " --input_layer (" << names_string << ", with "
+                      << names.size() << " items)."
+                      << " For example --input_layer=input1,input2"
+                      << " --input_layer_type=float,uint8";
+    return false;
+  }
+
+  for (size_t i = 0; i < names.size(); ++i) {
+    info->push_back(BenchmarkTfLiteModel::InputLayerInfo());
+    BenchmarkTfLiteModel::InputLayerInfo& input = info->back();
+    input.name = names[i];
+    input.data_type = TfLiteTypeFromString(types[i]);
+    TFLITE_BENCHMARK_CHECK(input.data_type != kTfLiteNoType)
+        << types[i] << " was an invalid type";
+    TFLITE_BENCHMARK_CHECK(SplitAndParse(shapes[i], ',', &input.shape))
+        << "Incorrect size string specified: " << shapes[i];
+    for (int dim : input.shape) {
+      if (dim == -1) {
+        TFLITE_LOG(ERROR)
+            << "Any unknown sizes in the shapes (-1's) must be replaced"
+            << " with the size you want to benchmark with.";
+        return false;
+      }
+    }
+    // Initial values are optional and may be given for the first inputs only.
+    if (i < values.size()) {
+      TFLITE_BENCHMARK_CHECK(
+          SplitAndParse(values[i], ',', &input.initialization_values))
+          << "Incorrect initialization values string specified: " << values[i];
+    }
+  }
+  return true;
+}
+
+}  // namespace
+
+std::vector<Flag> BenchmarkTfLiteModel::GetFlags() {
+  std::vector<Flag> flags = BenchmarkTfLiteModel::BenchmarkModel::GetFlags();
+  std::vector<Flag> specific_flags = {
+      Flag("graph", &graph, "graph file name"),
+      Flag("input_layer", &input_layer_string, "input layer names"),
+      Flag("input_layer_shape", &input_layer_shape_string, "input layer shape"),
+      Flag("input_layer_type", &input_layer_type_string, "input layer type"),
+      Flag("input_layer_values", &input_layer_values_string,
+           "values to initialize the inputs with"),
+      Flag("output_layer", &output_layer_string, "output layer name"),
+      Flag("use_nnapi", &use_nnapi, "use nnapi api")};
+
+  flags.insert(flags.end(), specific_flags.begin(), specific_flags.end());
+  return flags;
+}
+
+void BenchmarkTfLiteModel::LogFlags() {
+  BenchmarkModel::LogFlags();
+  TFLITE_LOG(INFO) << "Graph: [" << graph << "]";
+  TFLITE_LOG(INFO) << "Input layers: [" << input_layer_string << "]";
+  TFLITE_LOG(INFO) << "Input shapes: [" << input_layer_shape_string << "]";
+  TFLITE_LOG(INFO) << "Input types: [" << input_layer_type_string << "]";
+  TFLITE_LOG(INFO) << "Output layers: [" << output_layer_string << "]";
+  TFLITE_LOG(INFO) << "Use nnapi : [" << use_nnapi << "]";
+}
+
+bool BenchmarkTfLiteModel::ValidateFlags() {
+  if (graph.empty()) {
+    TFLITE_LOG(ERROR)
+        << "Please specify the name of your TF Lite input file with --graph";
+    return false;
+  }
+  return PopulateInputLayerInfo(input_layer_string, input_layer_shape_string,
+                                input_layer_type_string,
+                                input_layer_values_string, &inputs);
+}
+
+uint64_t BenchmarkTfLiteModel::ComputeInputBytes() {
+  TFLITE_BENCHMARK_CHECK(interpreter);
+  uint64_t total_input_bytes = 0;
+  for (int input : interpreter->inputs()) {
+    auto* t = interpreter->tensor(input);
+    total_input_bytes += t->bytes;
+  }
+  return total_input_bytes;
+}
+
+void BenchmarkTfLiteModel::Init() {
+  model = tflite::FlatBufferModel::BuildFromFile(graph.c_str());
+  if (!model) {
+    TFLITE_LOG(FATAL) << "Failed to mmap model " << graph;
+  }
+  TFLITE_LOG(INFO) << "Loaded model " << graph;
+  model->error_reporter();
+  TFLITE_LOG(INFO) << "resolved reporter";
+
+#ifdef TFLITE_CUSTOM_OPS_HEADER
+  tflite::MutableOpResolver resolver;
+  RegisterSelectedOps(&resolver);
+#else
+  tflite::ops::builtin::BuiltinOpResolver resolver;
+#endif
+
+  tflite::InterpreterBuilder(*model, resolver)(&interpreter);
+  if (!interpreter) {
+    TFLITE_LOG(FATAL) << "Failed to construct interpreter";
+  }
+  profiling_listener_.SetInterpreter(interpreter.get());
+
+  if (params_.num_threads != -1) {
+    interpreter->SetNumThreads(params_.num_threads);
+  }
+
+  interpreter->UseNNAPI(use_nnapi);
+  auto interpreter_inputs = interpreter->inputs();
+
+  if (!inputs.empty()) {
+    TFLITE_BENCHMARK_CHECK_EQ(inputs.size(), interpreter_inputs.size())
+        << "Inputs mismatch: Model inputs #:" << interpreter_inputs.size()
+        << " expected: " << inputs.size();
+  }
+
+  // TFLITE_BENCHMARK_CHECK that all names and types match
+  for (int j = 0; j < inputs.size(); ++j) {
+    const InputLayerInfo& input = inputs[j];
+    int i = interpreter_inputs[j];
+    TfLiteTensor* t = interpreter->tensor(i);
+    TFLITE_BENCHMARK_CHECK_EQ(t->name, input.name)
+        << "Tensor # " << i << " is named " << t->name << " but flags call it "
+        << input.name;
+    TFLITE_BENCHMARK_CHECK_EQ(t->type, input.data_type)
+        << "Could not match the type of input tensor " << t->name;
+  }
+
+  // Resize all non-string tensors.
+  for (int j = 0; j < inputs.size(); ++j) {
+    const InputLayerInfo& input = inputs[j];
+    int i = interpreter_inputs[j];
+    TfLiteTensor* t = interpreter->tensor(i);
+    if (t->type != kTfLiteString) {
+      interpreter->ResizeInputTensor(i, input.shape);
+    }
+  }
+
+  if (interpreter->AllocateTensors() != kTfLiteOk) {
+    TFLITE_LOG(FATAL) << "Failed to allocate tensors!";
+  }
+
+  // Set the values of the input tensors.
+  for (int j = 0; j < inputs.size(); ++j) {
+    const InputLayerInfo& input = inputs[j];
+    int i = interpreter_inputs[j];
+    TfLiteTensor* t = interpreter->tensor(i);
+    std::vector<int> sizes = input.shape;
+
+    // TODO(ahentz): below we ignore the 0-th dimension (number of batches).
+    if (t->type == kTfLiteFloat32) {
+      FillRandomValue<float>(
+          interpreter->typed_tensor<float>(i),
+          std::vector<int>(sizes.begin() + 1, sizes.end()),
+          []() { return static_cast<float>(rand()) / RAND_MAX - 0.5f; });
+    } else if (t->type == kTfLiteUInt8) {
+      FillRandomValue<uint8_t>(
+          interpreter->typed_tensor<uint8_t>(i),
+          std::vector<int>(sizes.begin() + 1, sizes.end()),
+          []() { return static_cast<uint8_t>(rand()) % 255; });
+    } else if (t->type == kTfLiteString) {
+      tflite::DynamicBuffer buffer;
+      FillRandomString(&buffer, sizes, []() {
+        return "we're have some friends over saturday to hang out in the yard";
+      });
+      buffer.WriteToTensor(interpreter->tensor(i));
+    } else {
+      TFLITE_LOG(FATAL) << "Don't know how to populate tensor " << t->name
+                        << " of type " << t->type;
+    }
+  }
+}
+
+void BenchmarkTfLiteModel::RunImpl() {
+  if (interpreter->Invoke() != kTfLiteOk) {
+    TFLITE_LOG(FATAL) << "Failed to invoke!";
+  }
+}
+
+}  // namespace benchmark
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/tools/benchmark_tflite_model.h b/tensorflow/contrib/lite/tools/benchmark_tflite_model.h
new file mode 100644
index 0000000000..e6d03d5211
--- /dev/null
+++ b/tensorflow/contrib/lite/tools/benchmark_tflite_model.h
@@ -0,0 +1,90 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_LITE_TOOLS_BENCHMARK_TFLITE_MODEL_H_
+#define TENSORFLOW_CONTRIB_LITE_TOOLS_BENCHMARK_TFLITE_MODEL_H_
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/contrib/lite/profiling/profile_summarizer.h"
+#include "tensorflow/contrib/lite/tools/benchmark_model.h"
+
+namespace tflite {
+namespace benchmark {
+
+// Dumps profiling events if profiling is enabled
+class ProfilingListener : public BenchmarkListener {
+ public:
+  explicit ProfilingListener() : interpreter_(nullptr), has_profiles_(false) {}
+
+  void SetInterpreter(Interpreter* interpreter);
+
+  void OnSingleRunStart(RunType run_type) override;
+
+  void OnSingleRunEnd() override;
+
+  void OnBenchmarkEnd(const BenchmarkResults& results) override;
+
+ private:
+  Interpreter* interpreter_;
+  profiling::Profiler profiler_;
+  profiling::ProfileSummarizer summarizer_;
+  bool has_profiles_;
+};
+
+// Benchmarks a TFLite model by running tflite interpreter.
+class BenchmarkTfLiteModel : public BenchmarkModel {
+ public:
+  BenchmarkTfLiteModel() : use_nnapi(false) {
+    AddListener(&profiling_listener_);
+  }
+
+  std::vector<Flag> GetFlags() override;
+  void LogFlags() override;
+  bool ValidateFlags() override;
+  uint64_t ComputeInputBytes() override;
+  void Init() override;
+  void RunImpl() override;
+  virtual ~BenchmarkTfLiteModel() {}
+
+  struct InputLayerInfo {
+    std::string name;
+    TfLiteType data_type;
+    std::vector<int> shape;
+    // Note that initialization_values is currently unused.
+    std::vector<float> initialization_values;
+  };
+
+ private:
+  std::unique_ptr<tflite::FlatBufferModel> model;
+  std::unique_ptr<tflite::Interpreter> interpreter;
+  std::string graph;
+  std::string input_layer_string;
+  std::string input_layer_type_string;
+  std::string input_layer_shape_string;
+  std::string input_layer_values_string;
+  std::string output_layer_string;
+  std::vector<InputLayerInfo> inputs;
+  bool use_nnapi;
+  ProfilingListener profiling_listener_;
+};
+
+}  // namespace benchmark
+}  // namespace tflite
+
+#endif  // TENSORFLOW_CONTRIB_LITE_TOOLS_BENCHMARK_TFLITE_MODEL_H_
diff --git a/tensorflow/contrib/lite/tools/command_line_flags.cc b/tensorflow/contrib/lite/tools/command_line_flags.cc
new file mode 100644
index 0000000000..ba72f40689
--- /dev/null
+++ b/tensorflow/contrib/lite/tools/command_line_flags.cc
@@ -0,0 +1,189 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/lite/tools/command_line_flags.h"
+
+#include <sstream>
+#include <string>
+#include <vector>
+
+namespace tflite {
+namespace {
+
+// Tries to read `arg` as "--<flag>=<value>". Returns true iff `arg` names
+// this flag. `*value_parsing_ok` then tells whether the value was non-empty
+// and accepted by `parse_func`; it is set to true when `arg` does not match.
+bool ParseFlag(const std::string& arg, const std::string& flag,
+               const std::function<bool(const std::string&)>& parse_func,
+               bool* value_parsing_ok) {
+  const std::string prefix = "--" + flag + "=";
+  *value_parsing_ok = true;
+  if (arg.compare(0, prefix.size(), prefix) != 0) return false;
+  // "--flag=" with nothing after '=' still matches but counts as a value
+  // parsing failure; `parse_func` is not invoked in that case.
+  *value_parsing_ok =
+      arg.size() > prefix.size() && parse_func(arg.substr(prefix.size()));
+  return true;
+}
+
+bool ParseInt32Flag(const std::string& flag_value, int32_t* value) {
+  char extra;
+  return sscanf(flag_value.data(), "%d%c", value, &extra) == 1;
+}
+
+bool ParseInt64Flag(const std::string& flag_value, int64_t* value) {
+  char extra;
+  return sscanf(flag_value.data(), "%ld%c", value, &extra) == 1;
+}
+
+bool ParseBoolFlag(const std::string& flag_value, bool* value) {
+  if (flag_value != "true" && flag_value != "false") {
+    return false;
+  }
+
+  *value = (flag_value == "true");
+  return true;
+}
+
+bool ParseFloatFlag(const std::string& flag_value, float* value) {
+  char extra;
+  return sscanf(flag_value.data(), "%f%c", value, &extra) == 1;
+}
+
+bool ParseStringFlag(const std::string& flag_value, std::string* value) {
+  *value = flag_value;
+  return true;
+}
+
+}  // namespace
+
+Flag::Flag(const char* name, int32_t* dst, const std::string& usage_text)
+    : name_(name),
+      type_(TYPE_INT32),
+      value_hook_([dst](const std::string& flag_value) {
+        return ParseInt32Flag(flag_value, dst);
+      }),
+      default_for_display_(std::to_string(*dst)),
+      usage_text_(usage_text) {}
+
+Flag::Flag(const char* name, int64_t* dst, const std::string& usage_text)
+    : name_(name),
+      type_(TYPE_INT64),
+      value_hook_([dst](const std::string& flag_value) {
+        return ParseInt64Flag(flag_value, dst);
+      }),
+      default_for_display_(std::to_string(*dst)),
+      usage_text_(usage_text) {}
+
+Flag::Flag(const char* name, float* dst, const std::string& usage_text)
+    : name_(name),
+      type_(TYPE_FLOAT),
+      value_hook_([dst](const std::string& flag_value) {
+        return ParseFloatFlag(flag_value, dst);
+      }),
+      default_for_display_(std::to_string(*dst)),
+      usage_text_(usage_text) {}
+
+Flag::Flag(const char* name, bool* dst, const std::string& usage_text)
+    : name_(name),
+      type_(TYPE_BOOL),
+      value_hook_([dst](const std::string& flag_value) {
+        return ParseBoolFlag(flag_value, dst);
+      }),
+      default_for_display_((*dst) ? "true" : "false"),
+      usage_text_(usage_text) {}
+
+Flag::Flag(const char* name, std::string* dst, const std::string& usage_text)
+    : name_(name),
+      type_(TYPE_STRING),
+      value_hook_([dst](const std::string& flag_value) {
+        return ParseStringFlag(flag_value, dst);
+      }),
+      default_for_display_(*dst),
+      usage_text_(usage_text) {}
+
+bool Flag::Parse(const std::string& arg, bool* value_parsing_ok) const {
+  return ParseFlag(arg, name_, value_hook_, value_parsing_ok);
+}
+
+std::string Flag::GetTypeName() const {
+  switch (type_) {
+    case TYPE_INT32:
+      return "int32";
+    case TYPE_INT64:
+      return "int64";
+    case TYPE_FLOAT:
+      return "float";
+    case TYPE_BOOL:
+      return "bool";
+    case TYPE_STRING:
+      return "string";
+  }
+
+  return "unknown";
+}
+
+// Removes every argv entry matching a flag in `flag_list`, keeps the others
+// (including a literal "--" and all that follows it) and updates *argc.
+// Returns false on an invalid flag value or if argv[1] ends up as "--help".
+/*static*/ bool Flags::Parse(int* argc, const char** argv,
+                             const std::vector<Flag>& flag_list) {
+  bool result = true;
+  std::vector<const char*> unknown_flags;
+  for (int i = 1; i < *argc; ++i) {
+    if (std::string(argv[i]) == "--") {
+      // Everything from "--" onwards is handed back to the caller untouched.
+      unknown_flags.insert(unknown_flags.end(), argv + i, argv + *argc);
+      break;
+    }
+    bool was_found = false;
+    for (const Flag& flag : flag_list) {
+      bool value_parsing_ok;
+      was_found = flag.Parse(argv[i], &value_parsing_ok);
+      result = result && value_parsing_ok;
+      if (was_found) {
+        break;
+      }
+    }
+    if (!was_found) {
+      unknown_flags.push_back(argv[i]);
+    }
+  }
+  int dst = 1;  // Skip argv[0]
+  for (auto f : unknown_flags) {
+    argv[dst++] = f;
+  }
+  // Terminate the compacted list only inside argv[0, *argc): if nothing was
+  // consumed, argv[*argc] lies outside the range this function may write to.
+  if (dst < *argc) argv[dst] = nullptr;
+  *argc = unknown_flags.size() + 1;
+  return result && (*argc < 2 || std::string(argv[1]) != "--help");
+}
+
+/*static*/ std::string Flags::Usage(const std::string& cmdline,
+                                    const std::vector<Flag>& flag_list) {
+  std::ostringstream usage_text;
+  usage_text << "usage: " << cmdline << "\n";
+  if (!flag_list.empty()) {
+    usage_text << "Flags:\n";
+  }
+
+  for (const Flag& flag : flag_list) {
+    auto type_name = flag.GetTypeName();
+    usage_text << "\t";
+    usage_text << "--" << flag.name_ << "=" << flag.default_for_display_;
+    usage_text << "\t" << type_name << "\t" << flag.usage_text_ << "\n";
+  }
+  return usage_text.str();
+}
+
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/tools/command_line_flags.h b/tensorflow/contrib/lite/tools/command_line_flags.h
new file mode 100644
index 0000000000..0605d3c9d4
--- /dev/null
+++ b/tensorflow/contrib/lite/tools/command_line_flags.h
@@ -0,0 +1,112 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_LITE_TOOLS_COMMAND_LINE_FLAGS_H_
+#define TENSORFLOW_CONTRIB_LITE_TOOLS_COMMAND_LINE_FLAGS_H_
+
+#include <functional>
+#include <string>
+#include <vector>
+
+namespace tflite {
+// A simple command-line argument parsing module.
+// Dependency free simplified port of core/util/command_line_flags.
+// This class is written for benchmarks and uses inefficient string
+// concatenation. This was written to avoid dependency on tensorflow/core/util
+// which transitively brings in a lot of other dependencies that are not
+// necessary for tflite benchmarking code.
+// The recommended way of using it is with local variables and an initializer
+// list of Flag objects, for example:
+//
+// int some_int = 10;
+// bool some_switch = false;
+// std::string some_name = "something";
+// std::vector<tflite::Flag> flag_list = {
+//   Flag("some_int", &some_int, "an integer that affects X"),
+//   Flag("some_switch", &some_switch, "a bool that affects Y"),
+//   Flag("some_name", &some_name, "a std::string that affects Z")
+// };
+// // Get usage message before Flags::Parse() to capture default values.
+// std::string usage = Flags::Usage(argv[0], flag_list);
+// bool parsed_values_ok = Flags::Parse(&argc, argv, flag_list);
+//
+// tensorflow::port::InitMain(usage.c_str(), &argc, &argv);
+// if (argc != 1 || !parsed_values_ok) {
+//    ...output usage and error message...
+// }
+//
+// The argc and argv values are adjusted by the Parse function so all that
+// remains is the program name (at argv[0]) and any unknown arguments fill the
+// rest of the array. This means you can check for flags that weren't understood
+// by seeing if argc is greater than 1.
+// The result indicates if there were any errors parsing the values that were
+// passed to the command-line switches. For example, --some_int=foo would return
+// false because the argument is expected to be an integer.
+//
+// NOTE: Unlike gflags-style libraries, this library is intended to be
+// used in the `main()` function of your binary. It does not handle
+// flag definitions that are scattered around the source code.
+
+// A description of a single command line flag, holding its name, type, usage
+// text, and a pointer to the corresponding variable.
+class Flag {
+ public:
+  Flag(const char* name, int32_t* dst, const std::string& usage_text);
+  Flag(const char* name, int64_t* dst, const std::string& usage_text);
+  Flag(const char* name, bool* dst, const std::string& usage_text);
+  Flag(const char* name, std::string* dst, const std::string& usage_text);
+  Flag(const char* name, float* dst, const std::string& usage_text);
+
+ private:
+  friend class Flags;
+
+  bool Parse(const std::string& arg, bool* value_parsing_ok) const;
+
+  std::string name_;
+  enum {
+    TYPE_INT32,
+    TYPE_INT64,
+    TYPE_BOOL,
+    TYPE_STRING,
+    TYPE_FLOAT,
+  } type_;
+
+  std::string GetTypeName() const;
+
+  std::function<bool(const std::string&)> value_hook_;
+  std::string default_for_display_;
+
+  std::string usage_text_;
+};
+
+class Flags {
+ public:
+  // Parse the command line represented by argv[0, ..., (*argc)-1] to find flag
+  // instances matching flags in flaglist[].  Update the variables associated
+  // with matching flags, and remove the matching arguments from (*argc, argv).
+  // Return true iff all recognized flag values were parsed correctly, and the
+  // first remaining argument is not "--help".
+  static bool Parse(int* argc, const char** argv,
+                    const std::vector<Flag>& flag_list);
+
+  // Return a usage message with command line cmdline, and the
+  // usage_text strings in flag_list[].
+  static std::string Usage(const std::string& cmdline,
+                           const std::vector<Flag>& flag_list);
+};
+
+}  // namespace tflite
+
+#endif  // TENSORFLOW_CONTRIB_LITE_TOOLS_COMMAND_LINE_FLAGS_H_
diff --git a/tensorflow/contrib/lite/tools/command_line_flags_test.cc b/tensorflow/contrib/lite/tools/command_line_flags_test.cc
new file mode 100644
index 0000000000..463647bec9
--- /dev/null
+++ b/tensorflow/contrib/lite/tools/command_line_flags_test.cc
@@ -0,0 +1,153 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/contrib/lite/tools/command_line_flags.h"
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/testing/util.h"
+
+namespace tflite {
+namespace {
+
+TEST(CommandLineFlagsTest, BasicUsage) {
+  int some_int32 = 10;
+  int64_t some_int64 = 21474836470;  // max int32 is 2147483647
+  bool some_switch = false;
+  std::string some_name = "something_a";
+  float some_float = -23.23f;
+  const char* argv_strings[] = {"program_name",
+                                "--some_int32=20",
+                                "--some_int64=214748364700",
+                                "--some_switch=true",
+                                "--some_name=somethingelse",
+                                "--some_float=42.0"};
+  int argc = 6;
+  bool parsed_ok =
+      Flags::Parse(&argc, reinterpret_cast<const char**>(argv_strings),
+                   {
+                       Flag("some_int32", &some_int32, "some int32"),
+                       Flag("some_int64", &some_int64, "some int64"),
+                       Flag("some_switch", &some_switch, "some switch"),
+                       Flag("some_name", &some_name, "some name"),
+                       Flag("some_float", &some_float, "some float"),
+                   });
+
+  EXPECT_EQ(true, parsed_ok);
+  EXPECT_EQ(20, some_int32);
+  EXPECT_EQ(214748364700, some_int64);
+  EXPECT_EQ(true, some_switch);
+  EXPECT_EQ("somethingelse", some_name);
+  EXPECT_NEAR(42.0f, some_float, 1e-5f);
+  EXPECT_EQ(argc, 1);
+}
+
+TEST(CommandLineFlagsTest, BadIntValue) {
+  int some_int = 10;
+  int argc = 2;
+  const char* argv_strings[] = {"program_name", "--some_int=notanumber"};
+  bool parsed_ok =
+      Flags::Parse(&argc, reinterpret_cast<const char**>(argv_strings),
+                   {Flag("some_int", &some_int, "some int")});
+
+  EXPECT_EQ(false, parsed_ok);
+  EXPECT_EQ(10, some_int);
+  EXPECT_EQ(argc, 1);
+}
+
+TEST(CommandLineFlagsTest, BadBoolValue) {
+  bool some_switch = false;
+  int argc = 2;
+  const char* argv_strings[] = {"program_name", "--some_switch=notabool"};
+  bool parsed_ok =
+      Flags::Parse(&argc, reinterpret_cast<const char**>(argv_strings),
+                   {Flag("some_switch", &some_switch, "some switch")});
+
+  EXPECT_EQ(false, parsed_ok);
+  EXPECT_EQ(false, some_switch);
+  EXPECT_EQ(argc, 1);
+}
+
+TEST(CommandLineFlagsTest, BadFloatValue) {
+  float some_float = -23.23f;
+  int argc = 2;
+  const char* argv_strings[] = {"program_name", "--some_float=notanumber"};
+  bool parsed_ok =
+      Flags::Parse(&argc, reinterpret_cast<const char**>(argv_strings),
+                   {Flag("some_float", &some_float, "some float")});
+
+  EXPECT_EQ(false, parsed_ok);
+  EXPECT_NEAR(-23.23f, some_float, 1e-5f);
+  EXPECT_EQ(argc, 1);
+}
+
+// Returns true if `str` equals `pat` once whitespace is taken into account:
+// a run of whitespace in `pat` matches zero or more whitespace chars in `str`.
+static bool MatchWithAnyWhitespace(const std::string& str,
+                                   const std::string& pat) {
+  size_t p = 0;
+  const auto skip_pattern_space = [&pat, &p]() {
+    while (p != pat.size() && isspace(pat[p])) ++p;
+  };
+  for (const char ch : str) {
+    if (isspace(ch)) {
+      // Whitespace in `str` is only legal while `pat` sits on whitespace.
+      if (p == pat.size() || !isspace(pat[p])) return false;
+      continue;
+    }
+    skip_pattern_space();
+    if (p == pat.size() || ch != pat[p]) return false;
+    ++p;
+  }
+  skip_pattern_space();
+  return p == pat.size();
+}
+
+TEST(CommandLineFlagsTest, UsageString) {
+  int some_int = 10;
+  int64_t some_int64 = 21474836470;  // max int32 is 2147483647
+  bool some_switch = false;
+  std::string some_name = "something";
+  // Don't test float in this case, because precision is hard to predict and
+  // match against, and we don't want a flakey test.
+  const string tool_name = "some_tool_name";
+  string usage = Flags::Usage(tool_name + " <flags>",
+                              {Flag("some_int", &some_int, "some int"),
+                               Flag("some_int64", &some_int64, "some int64"),
+                               Flag("some_switch", &some_switch, "some switch"),
+                               Flag("some_name", &some_name, "some name")});
+  // Match the usage message, being sloppy about whitespace.
+  const char* expected_usage =
+      " usage: some_tool_name <flags>\n"
+      "Flags:\n"
+      "--some_int=10\tint32\tsome int\n"
+      "--some_int64=21474836470\tint64\tsome int64\n"
+      "--some_switch=false\tbool\tsome switch\n"
+      "--some_name=something\tstring\tsome name\n";
+  ASSERT_EQ(MatchWithAnyWhitespace(usage, expected_usage), true) << usage;
+
+  // Again but with no flags.
+  usage = Flags::Usage(tool_name, {});
+  ASSERT_EQ(MatchWithAnyWhitespace(usage, " usage: some_tool_name\n"), true)
+      << usage;
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/tools/logging.h b/tensorflow/contrib/lite/tools/logging.h
new file mode 100644
index 0000000000..aa1fa5b827
--- /dev/null
+++ b/tensorflow/contrib/lite/tools/logging.h
@@ -0,0 +1,75 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_LITE_TOOLS_LOGGING_H_
+#define TENSORFLOW_CONTRIB_LITE_TOOLS_LOGGING_H_
+// LOG and CHECK macros for benchmarks.
+
+#include <cstdlib>
+#include <iostream>
+#include <sstream>
+
+namespace tflite {
+namespace logging {
+// A wrapper that logs to stderr.
+//
+// Used for TFLITE_LOG and TFLITE_BENCHMARK_CHECK macros.
+class LoggingWrapper {
+ public:
+  enum class LogSeverity : int {
+    INFO = 0,
+    WARN = 1,
+    ERROR = 2,
+    FATAL = 3,
+  };
+  // Buffers a message for `severity`; when `log` is false the message is
+  // discarded. `explicit` keeps a LogSeverity from converting implicitly.
+  explicit LoggingWrapper(LogSeverity severity, bool log = true)
+      : severity_(severity), should_log_(log) {}
+  std::stringstream& Stream() { return stream_; }
+  // Writes the buffered message to stderr; FATAL then aborts the process.
+  ~LoggingWrapper() {
+    if (should_log_) {
+      std::cerr << stream_.str() << std::endl;  // std::endl already flushes.
+      if (severity_ == LogSeverity::FATAL) {
+        std::abort();
+      }
+    }
+  }
+
+ private:
+  std::stringstream stream_;
+  LogSeverity severity_;
+  bool should_log_;
+};
+
+}  // namespace logging
+
+}  // namespace tflite
+
+#define TFLITE_LOG(severity)                                  \
+  tflite::logging::LoggingWrapper(                            \
+      tflite::logging::LoggingWrapper::LogSeverity::severity) \
+      .Stream()
+
+#define TFLITE_BENCHMARK_CHECK(condition)                  \
+  tflite::logging::LoggingWrapper(                         \
+      tflite::logging::LoggingWrapper::LogSeverity::FATAL, \
+      (condition) ? false : true)                          \
+      .Stream()
+
+// Aborts unless a == b; operands are parenthesized to keep precedence intact.
+#define TFLITE_BENCHMARK_CHECK_EQ(a, b) TFLITE_BENCHMARK_CHECK((a) == (b))
+#endif  // TENSORFLOW_CONTRIB_LITE_TOOLS_LOGGING_H_
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 74f74afa45..7e13a07e5e 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -843,7 +843,6 @@ tf_cuda_library(
         "util/sparse/sparse_tensor.h",
         "util/stat_summarizer.h",
         "util/stat_summarizer_options.h",
-        "util/stats_calculator.h",
         "util/stream_executor_util.h",
         "util/strided_slice_op.h",
         "util/tensor_format.h",
@@ -870,9 +869,11 @@ tf_cuda_library(
 
 cc_library(
     name = "stats_calculator_portable",
-    srcs = ["util/stats_calculator.cc"],
-    hdrs = [
+    srcs = [
         "util/stat_summarizer_options.h",
+        "util/stats_calculator.cc",
+    ],
+    hdrs = [
         "util/stats_calculator.h",
     ],
     deps = [":platform_base"],
-- 
GitLab


From 106191ccf06b49f7802736a63932a613546b56c5 Mon Sep 17 00:00:00 2001
From: Anna R <annarev@google.com>
Date: Thu, 31 May 2018 13:11:43 -0700
Subject: [PATCH 111/610] Moving generated API to tensorflow/.

PiperOrigin-RevId: 198767512
---
 tensorflow/BUILD                              |  17 ++-
 tensorflow/__init__.py                        |   3 -
 tensorflow/api_template.__init__.py           |  43 ++++++
 tensorflow/contrib/cmake/tf_python.cmake      |  18 +--
 tensorflow/contrib/cmake/tf_tests.cmake       |   4 +
 tensorflow/python/BUILD                       |   1 +
 tensorflow/python/kernel_tests/BUILD          |  58 ++++++++
 .../kernel_tests}/ackermann_op.cc             |   0
 .../kernel_tests}/ackermann_test.py           |  14 +-
 .../kernel_tests}/duplicate_op.cc             |   0
 .../kernel_tests}/duplicate_op_test.py        |  17 ++-
 .../kernel_tests}/invalid_op.cc               |   0
 .../kernel_tests}/invalid_op_test.py          |  17 ++-
 tensorflow/python/util/stat_summarizer.i      |   5 -
 tensorflow/tools/api/generator/BUILD          | 116 +---------------
 tensorflow/tools/api/generator/api_gen.bzl    | 125 ++++++++++++++++++
 .../tools/api/generator/create_python_api.py  |  85 ++++++++----
 tensorflow/user_ops/BUILD                     |  52 --------
 18 files changed, 342 insertions(+), 233 deletions(-)
 create mode 100644 tensorflow/api_template.__init__.py
 rename tensorflow/{user_ops => python/kernel_tests}/ackermann_op.cc (100%)
 rename tensorflow/{user_ops => python/kernel_tests}/ackermann_test.py (76%)
 rename tensorflow/{user_ops => python/kernel_tests}/duplicate_op.cc (100%)
 rename tensorflow/{user_ops => python/kernel_tests}/duplicate_op_test.py (69%)
 rename tensorflow/{user_ops => python/kernel_tests}/invalid_op.cc (100%)
 rename tensorflow/{user_ops => python/kernel_tests}/invalid_op_test.py (67%)
 create mode 100644 tensorflow/tools/api/generator/api_gen.bzl
 delete mode 100644 tensorflow/user_ops/BUILD

diff --git a/tensorflow/BUILD b/tensorflow/BUILD
index f2ad16fa04..e0bce820d1 100644
--- a/tensorflow/BUILD
+++ b/tensorflow/BUILD
@@ -19,6 +19,10 @@ load(
     "//tensorflow/core:platform/default/build_config.bzl",
     "tf_additional_binary_deps",
 )
+load(
+    "//tensorflow/tools/api/generator:api_gen.bzl",
+    "gen_api_init_files",  # @unused
+)
 
 # Config setting for determining if we are building for Android.
 config_setting(
@@ -536,13 +540,16 @@ exports_files(
     ],
 )
 
+gen_api_init_files(
+    name = "python_api_gen",
+    srcs = ["api_template.__init__.py"],
+    root_init_template = "api_template.__init__.py",
+)
+
 py_library(
     name = "tensorflow_py",
-    srcs = ["__init__.py"],
+    srcs = [":python_api_gen"],
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
-    deps = [
-        "//tensorflow/python",
-        "//tensorflow/tools/api/generator:python_api",
-    ],
+    deps = ["//tensorflow/python"],
 )
diff --git a/tensorflow/__init__.py b/tensorflow/__init__.py
index c8683e3976..440e9f8dbd 100644
--- a/tensorflow/__init__.py
+++ b/tensorflow/__init__.py
@@ -22,9 +22,6 @@ from __future__ import print_function
 
 # pylint: disable=g-bad-import-order
 from tensorflow.python import pywrap_tensorflow  # pylint: disable=unused-import
-# pylint: disable=wildcard-import
-from tensorflow.tools.api.generator.api import *  # pylint: disable=redefined-builtin
-# pylint: enable=wildcard-import
 
 from tensorflow.python.util.lazy_loader import LazyLoader
 contrib = LazyLoader('contrib', globals(), 'tensorflow.contrib')
diff --git a/tensorflow/api_template.__init__.py b/tensorflow/api_template.__init__.py
new file mode 100644
index 0000000000..9b0d7d48af
--- /dev/null
+++ b/tensorflow/api_template.__init__.py
@@ -0,0 +1,43 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Bring in all of the public TensorFlow interface into this module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# pylint: disable=g-bad-import-order
+from tensorflow.python import pywrap_tensorflow  # pylint: disable=unused-import
+# API IMPORTS PLACEHOLDER
+
+from tensorflow.python.util.lazy_loader import LazyLoader
+contrib = LazyLoader('contrib', globals(), 'tensorflow.contrib')
+del LazyLoader
+
+from tensorflow.python.platform import flags  # pylint: disable=g-import-not-at-top
+app.flags = flags  # pylint: disable=undefined-variable
+
+del absolute_import
+del division
+del print_function
+
+# These symbols appear because we import the python package which
+# in turn imports from tensorflow.core and tensorflow.python. They
+# must come from this module. So python adds these symbols for the
+# resolution to succeed.
+# pylint: disable=undefined-variable
+del python
+del core
+# pylint: enable=undefined-variable
diff --git a/tensorflow/contrib/cmake/tf_python.cmake b/tensorflow/contrib/cmake/tf_python.cmake
index 61651f3007..d019dd48f2 100755
--- a/tensorflow/contrib/cmake/tf_python.cmake
+++ b/tensorflow/contrib/cmake/tf_python.cmake
@@ -725,7 +725,7 @@ endif()
 ########################################################
 
-# Parse tensorflow/tools/api/generator/BUILD to get list of generated files.
-FILE(READ ${tensorflow_source_dir}/tensorflow/tools/api/generator/BUILD api_generator_BUILD_text)
+# Parse tensorflow/tools/api/generator/api_gen.bzl to get list of generated files.
+FILE(READ ${tensorflow_source_dir}/tensorflow/tools/api/generator/api_gen.bzl api_generator_BUILD_text)
 STRING(REGEX MATCH "# BEGIN GENERATED FILES.*# END GENERATED FILES" api_init_files_text ${api_generator_BUILD_text})
 string(REPLACE "# BEGIN GENERATED FILES" "" api_init_files_text ${api_init_files_text})
 string(REPLACE "# END GENERATED FILES" "" api_init_files_text ${api_init_files_text})
@@ -736,7 +736,7 @@ foreach(api_init_file ${api_init_files_list})
     string(STRIP "${api_init_file}" api_init_file)
     if(api_init_file)
         string(REPLACE "\"" "" api_init_file "${api_init_file}")  # Remove quotes
-        list(APPEND api_init_files "${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/tools/api/generator/${api_init_file}")
+        list(APPEND api_init_files "${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/${api_init_file}")
     endif()
 endforeach(api_init_file)
 set(api_init_list_file "${tensorflow_source_dir}/api_init_files_list.txt")
@@ -749,18 +749,14 @@ add_custom_command(
 
       # tensorflow/__init__.py depends on files generated in this step. So, remove it while
       # this step is running since the files aren't there yet.
-      COMMAND ${CMAKE_COMMAND} -E rename ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/__init__.py
-                                         ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/final.__init__.py
-      COMMAND ${CMAKE_COMMAND} -E touch ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/__init__.py
+      COMMAND ${CMAKE_COMMAND} -E remove -f ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/__init__.py
 
       # Run create_python_api.py to generate API init files.
       COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${CMAKE_CURRENT_BINARY_DIR}/tf_python ${PYTHON_EXECUTABLE}
-              "${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/tools/api/generator/create_python_api.py" "${api_init_list_file}"
-
-      # Re-add tensorflow/__init__.py back.
-      COMMAND ${CMAKE_COMMAND} -E remove -f ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/__init__.py
-      COMMAND ${CMAKE_COMMAND} -E rename ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/final.__init__.py
-                                         ${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/__init__.py
+              "${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/tools/api/generator/create_python_api.py"
+              "--root_init_template=${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/api_template.__init__.py"
+              "--apidir=${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow"
+              "${api_init_list_file}"
 
       COMMENT "Generating __init__.py files for Python API."
       WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/tf_python"
diff --git a/tensorflow/contrib/cmake/tf_tests.cmake b/tensorflow/contrib/cmake/tf_tests.cmake
index 5942ff3363..eb9482dc25 100644
--- a/tensorflow/contrib/cmake/tf_tests.cmake
+++ b/tensorflow/contrib/cmake/tf_tests.cmake
@@ -212,6 +212,10 @@ if (tensorflow_BUILD_PYTHON_TESTS)
     "${tensorflow_source_dir}/tensorflow/contrib/factorization/python/ops/gmm_test.py"
     # Disable following manual tag in BUILD.
     "${tensorflow_source_dir}/tensorflow/python/keras/_impl/keras/layers/convolutional_test.py"
+    # These tests depend on a .so file
+    ${tensorflow_source_dir}/tensorflow/python/kernel_tests/duplicate_op_test.py
+    ${tensorflow_source_dir}/tensorflow/python/kernel_tests/invalid_op_test.py
+    ${tensorflow_source_dir}/tensorflow/python/kernel_tests/ackermann_test.py
 
   )
   if (WIN32)
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index 0542c2fc91..b15c5291f5 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -71,6 +71,7 @@ py_library(
     visibility = [
         "//tensorflow:__pkg__",
         "//tensorflow/python/tools:__pkg__",
+        "//tensorflow/tools/api/generator:__pkg__",
     ],
     deps = [
         ":array_ops",
diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD
index 3dfad9c130..5d29c2e5f8 100644
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@@ -9,6 +9,7 @@ licenses(["notice"])  # Apache 2.0
 load("//tensorflow:tensorflow.bzl", "tf_py_test")
 load("//tensorflow:tensorflow.bzl", "cuda_py_test")
 load("//tensorflow:tensorflow.bzl", "sycl_py_test")
+load("//tensorflow:tensorflow.bzl", "tf_custom_op_library")
 
 # CPU only tests should use tf_py_test, GPU tests use cuda_py_test
 # Please avoid the py_tests and cuda_py_tests (plural) while we
@@ -3029,3 +3030,60 @@ tf_py_test(
         "//tensorflow/python/eager:tape",
     ],
 )
+
+# Custom op tests
+tf_custom_op_library(
+    name = "ackermann_op.so",
+    srcs = ["ackermann_op.cc"],
+)
+
+tf_py_test(
+    name = "ackermann_test",
+    size = "small",
+    srcs = ["ackermann_test.py"],
+    additional_deps = [
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:platform",
+    ],
+    data = [":ackermann_op.so"],
+    tags = ["no_pip"],
+)
+
+tf_custom_op_library(
+    name = "duplicate_op.so",
+    srcs = ["duplicate_op.cc"],
+)
+
+tf_py_test(
+    name = "duplicate_op_test",
+    size = "small",
+    srcs = ["duplicate_op_test.py"],
+    additional_deps = [
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform",
+    ],
+    data = [":duplicate_op.so"],
+    tags = ["no_pip"],
+)
+
+tf_custom_op_library(
+    name = "invalid_op.so",
+    srcs = ["invalid_op.cc"],
+)
+
+tf_py_test(
+    name = "invalid_op_test",
+    size = "small",
+    srcs = ["invalid_op_test.py"],
+    additional_deps = [
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:platform",
+    ],
+    data = [":invalid_op.so"],
+    tags = ["no_pip"],
+)
diff --git a/tensorflow/user_ops/ackermann_op.cc b/tensorflow/python/kernel_tests/ackermann_op.cc
similarity index 100%
rename from tensorflow/user_ops/ackermann_op.cc
rename to tensorflow/python/kernel_tests/ackermann_op.cc
diff --git a/tensorflow/user_ops/ackermann_test.py b/tensorflow/python/kernel_tests/ackermann_test.py
similarity index 76%
rename from tensorflow/user_ops/ackermann_test.py
rename to tensorflow/python/kernel_tests/ackermann_test.py
index 257de49808..5e0d87c783 100644
--- a/tensorflow/user_ops/ackermann_test.py
+++ b/tensorflow/python/kernel_tests/ackermann_test.py
@@ -17,17 +17,19 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import os.path
+import os
 
-import tensorflow as tf
+from tensorflow.python.framework import load_library
+from tensorflow.python.platform import resource_loader
+from tensorflow.python.platform import test
 
 
-class AckermannTest(tf.test.TestCase):
+class AckermannTest(test.TestCase):
 
   def testBasic(self):
-    library_filename = os.path.join(tf.resource_loader.get_data_files_path(),
+    library_filename = os.path.join(resource_loader.get_data_files_path(),
                                     'ackermann_op.so')
-    ackermann = tf.load_op_library(library_filename)
+    ackermann = load_library.load_op_library(library_filename)
 
     self.assertEqual(len(ackermann.OP_LIST.op), 1)
     self.assertEqual(ackermann.OP_LIST.op[0].name, 'Ackermann')
@@ -37,4 +39,4 @@ class AckermannTest(tf.test.TestCase):
 
 
 if __name__ == '__main__':
-  tf.test.main()
+  test.main()
diff --git a/tensorflow/user_ops/duplicate_op.cc b/tensorflow/python/kernel_tests/duplicate_op.cc
similarity index 100%
rename from tensorflow/user_ops/duplicate_op.cc
rename to tensorflow/python/kernel_tests/duplicate_op.cc
diff --git a/tensorflow/user_ops/duplicate_op_test.py b/tensorflow/python/kernel_tests/duplicate_op_test.py
similarity index 69%
rename from tensorflow/user_ops/duplicate_op_test.py
rename to tensorflow/python/kernel_tests/duplicate_op_test.py
index b61e68d75e..529d3dd0b3 100644
--- a/tensorflow/user_ops/duplicate_op_test.py
+++ b/tensorflow/python/kernel_tests/duplicate_op_test.py
@@ -17,23 +17,26 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import os.path
+import os
 
-import tensorflow as tf
+from tensorflow.python.framework import load_library
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import resource_loader
+from tensorflow.python.platform import test
 
 
-class DuplicateOpTest(tf.test.TestCase):
+class DuplicateOpTest(test.TestCase):
 
   def testBasic(self):
-    library_filename = os.path.join(tf.resource_loader.get_data_files_path(),
+    library_filename = os.path.join(resource_loader.get_data_files_path(),
                                     'duplicate_op.so')
-    duplicate = tf.load_op_library(library_filename)
+    duplicate = load_library.load_op_library(library_filename)
 
     self.assertEqual(len(duplicate.OP_LIST.op), 0)
 
     with self.test_session():
-      self.assertEqual(tf.add(1, 41).eval(), 42)
+      self.assertEqual(math_ops.add(1, 41).eval(), 42)
 
 
 if __name__ == '__main__':
-  tf.test.main()
+  test.main()
diff --git a/tensorflow/user_ops/invalid_op.cc b/tensorflow/python/kernel_tests/invalid_op.cc
similarity index 100%
rename from tensorflow/user_ops/invalid_op.cc
rename to tensorflow/python/kernel_tests/invalid_op.cc
diff --git a/tensorflow/user_ops/invalid_op_test.py b/tensorflow/python/kernel_tests/invalid_op_test.py
similarity index 67%
rename from tensorflow/user_ops/invalid_op_test.py
rename to tensorflow/python/kernel_tests/invalid_op_test.py
index c90a00ce58..238299a895 100644
--- a/tensorflow/user_ops/invalid_op_test.py
+++ b/tensorflow/python/kernel_tests/invalid_op_test.py
@@ -17,19 +17,22 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import os.path
+import os
 
-import tensorflow as tf
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import load_library
+from tensorflow.python.platform import resource_loader
+from tensorflow.python.platform import test
 
 
-class InvalidOpTest(tf.test.TestCase):
+class InvalidOpTest(test.TestCase):
 
   def testBasic(self):
-    library_filename = os.path.join(tf.resource_loader.get_data_files_path(),
+    library_filename = os.path.join(resource_loader.get_data_files_path(),
                                     'invalid_op.so')
-    with self.assertRaises(tf.errors.InvalidArgumentError):
-      tf.load_op_library(library_filename)
+    with self.assertRaises(errors.InvalidArgumentError):
+      load_library.load_op_library(library_filename)
 
 
 if __name__ == '__main__':
-  tf.test.main()
+  test.main()
diff --git a/tensorflow/python/util/stat_summarizer.i b/tensorflow/python/util/stat_summarizer.i
index f423553faa..73fa85494b 100644
--- a/tensorflow/python/util/stat_summarizer.i
+++ b/tensorflow/python/util/stat_summarizer.i
@@ -88,9 +88,4 @@ def NewStatSummarizer(unused):
 
 def DeleteStatSummarizer(stat_summarizer):
   _DeleteStatSummarizer(stat_summarizer)
-
-NewStatSummarizer._tf_api_names = ["contrib.stat_summarizer.NewStatSummarizer"]
-DeleteStatSummarizer._tf_api_names = [
-    "contrib.stat_summarizer.DeleteStatSummarizer"]
-StatSummarizer._tf_api_names = ["contrib.stat_summarizer.StatSummarizer"]
 %}
diff --git a/tensorflow/tools/api/generator/BUILD b/tensorflow/tools/api/generator/BUILD
index f46bb4b5fc..f0c5877a90 100644
--- a/tensorflow/tools/api/generator/BUILD
+++ b/tensorflow/tools/api/generator/BUILD
@@ -9,8 +9,9 @@ py_binary(
     name = "create_python_api",
     srcs = ["create_python_api.py"],
     srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
     deps = [
-        "//tensorflow/python",
+        "//tensorflow/python:no_contrib",
     ],
 )
 
@@ -23,116 +24,3 @@ py_test(
         "//tensorflow/python:client_testlib",
     ],
 )
-
-genrule(
-    name = "python_api_gen",
-    # List of API files. This list should include file name for
-    # every module exported using tf_export. For e.g. if an op is decorated with
-    # @tf_export('module1.module2', 'module3'). Then, outs should include
-    # api/module1/module2/__init__.py and api/module3/__init__.py.
-    # keep sorted
-    outs = [
-        # BEGIN GENERATED FILES
-        "api/__init__.py",
-        "api/app/__init__.py",
-        "api/bitwise/__init__.py",
-        "api/compat/__init__.py",
-        "api/contrib/__init__.py",
-        "api/contrib/stat_summarizer/__init__.py",
-        "api/data/__init__.py",
-        "api/distributions/__init__.py",
-        "api/distributions/bijectors/__init__.py",
-        "api/errors/__init__.py",
-        "api/estimator/__init__.py",
-        "api/estimator/export/__init__.py",
-        "api/estimator/inputs/__init__.py",
-        "api/feature_column/__init__.py",
-        "api/gfile/__init__.py",
-        "api/graph_util/__init__.py",
-        "api/image/__init__.py",
-        "api/initializers/__init__.py",
-        "api/keras/__init__.py",
-        "api/keras/activations/__init__.py",
-        "api/keras/applications/__init__.py",
-        "api/keras/applications/densenet/__init__.py",
-        "api/keras/applications/inception_resnet_v2/__init__.py",
-        "api/keras/applications/inception_v3/__init__.py",
-        "api/keras/applications/mobilenet/__init__.py",
-        "api/keras/applications/nasnet/__init__.py",
-        "api/keras/applications/resnet50/__init__.py",
-        "api/keras/applications/vgg16/__init__.py",
-        "api/keras/applications/vgg19/__init__.py",
-        "api/keras/applications/xception/__init__.py",
-        "api/keras/backend/__init__.py",
-        "api/keras/callbacks/__init__.py",
-        "api/keras/constraints/__init__.py",
-        "api/keras/datasets/__init__.py",
-        "api/keras/datasets/boston_housing/__init__.py",
-        "api/keras/datasets/cifar10/__init__.py",
-        "api/keras/datasets/cifar100/__init__.py",
-        "api/keras/datasets/fashion_mnist/__init__.py",
-        "api/keras/datasets/imdb/__init__.py",
-        "api/keras/datasets/mnist/__init__.py",
-        "api/keras/datasets/reuters/__init__.py",
-        "api/keras/estimator/__init__.py",
-        "api/keras/initializers/__init__.py",
-        "api/keras/layers/__init__.py",
-        "api/keras/losses/__init__.py",
-        "api/keras/metrics/__init__.py",
-        "api/keras/models/__init__.py",
-        "api/keras/optimizers/__init__.py",
-        "api/keras/preprocessing/__init__.py",
-        "api/keras/preprocessing/image/__init__.py",
-        "api/keras/preprocessing/sequence/__init__.py",
-        "api/keras/preprocessing/text/__init__.py",
-        "api/keras/regularizers/__init__.py",
-        "api/keras/utils/__init__.py",
-        "api/keras/wrappers/__init__.py",
-        "api/keras/wrappers/scikit_learn/__init__.py",
-        "api/layers/__init__.py",
-        "api/linalg/__init__.py",
-        "api/logging/__init__.py",
-        "api/losses/__init__.py",
-        "api/manip/__init__.py",
-        "api/math/__init__.py",
-        "api/metrics/__init__.py",
-        "api/nn/__init__.py",
-        "api/nn/rnn_cell/__init__.py",
-        "api/profiler/__init__.py",
-        "api/python_io/__init__.py",
-        "api/resource_loader/__init__.py",
-        "api/strings/__init__.py",
-        "api/saved_model/__init__.py",
-        "api/saved_model/builder/__init__.py",
-        "api/saved_model/constants/__init__.py",
-        "api/saved_model/loader/__init__.py",
-        "api/saved_model/main_op/__init__.py",
-        "api/saved_model/signature_constants/__init__.py",
-        "api/saved_model/signature_def_utils/__init__.py",
-        "api/saved_model/tag_constants/__init__.py",
-        "api/saved_model/utils/__init__.py",
-        "api/sets/__init__.py",
-        "api/sparse/__init__.py",
-        "api/spectral/__init__.py",
-        "api/summary/__init__.py",
-        "api/sysconfig/__init__.py",
-        "api/test/__init__.py",
-        "api/train/__init__.py",
-        "api/train/queue_runner/__init__.py",
-        "api/user_ops/__init__.py",
-        # END GENERATED FILES
-    ],
-    cmd = "$(location create_python_api) $(OUTS)",
-    tools = ["create_python_api"],
-)
-
-py_library(
-    name = "python_api",
-    srcs = [":python_api_gen"],
-    srcs_version = "PY2AND3",
-    visibility = ["//tensorflow:__subpackages__"],
-    deps = [
-        "//tensorflow/contrib:contrib_py",  # keep
-        "//tensorflow/python",  # keep
-    ],
-)
diff --git a/tensorflow/tools/api/generator/api_gen.bzl b/tensorflow/tools/api/generator/api_gen.bzl
new file mode 100644
index 0000000000..fe3e4d1434
--- /dev/null
+++ b/tensorflow/tools/api/generator/api_gen.bzl
@@ -0,0 +1,125 @@
+"""Targets for generating TensorFlow Python API __init__.py files."""
+
+# keep sorted
+TENSORFLOW_API_INIT_FILES = [
+    # BEGIN GENERATED FILES
+    "__init__.py",
+    "app/__init__.py",
+    "bitwise/__init__.py",
+    "compat/__init__.py",
+    "data/__init__.py",
+    "distributions/__init__.py",
+    "distributions/bijectors/__init__.py",
+    "errors/__init__.py",
+    "estimator/__init__.py",
+    "estimator/export/__init__.py",
+    "estimator/inputs/__init__.py",
+    "feature_column/__init__.py",
+    "gfile/__init__.py",
+    "graph_util/__init__.py",
+    "image/__init__.py",
+    "initializers/__init__.py",
+    "keras/__init__.py",
+    "keras/activations/__init__.py",
+    "keras/applications/__init__.py",
+    "keras/applications/densenet/__init__.py",
+    "keras/applications/inception_resnet_v2/__init__.py",
+    "keras/applications/inception_v3/__init__.py",
+    "keras/applications/mobilenet/__init__.py",
+    "keras/applications/nasnet/__init__.py",
+    "keras/applications/resnet50/__init__.py",
+    "keras/applications/vgg16/__init__.py",
+    "keras/applications/vgg19/__init__.py",
+    "keras/applications/xception/__init__.py",
+    "keras/backend/__init__.py",
+    "keras/callbacks/__init__.py",
+    "keras/constraints/__init__.py",
+    "keras/datasets/__init__.py",
+    "keras/datasets/boston_housing/__init__.py",
+    "keras/datasets/cifar10/__init__.py",
+    "keras/datasets/cifar100/__init__.py",
+    "keras/datasets/fashion_mnist/__init__.py",
+    "keras/datasets/imdb/__init__.py",
+    "keras/datasets/mnist/__init__.py",
+    "keras/datasets/reuters/__init__.py",
+    "keras/estimator/__init__.py",
+    "keras/initializers/__init__.py",
+    "keras/layers/__init__.py",
+    "keras/losses/__init__.py",
+    "keras/metrics/__init__.py",
+    "keras/models/__init__.py",
+    "keras/optimizers/__init__.py",
+    "keras/preprocessing/__init__.py",
+    "keras/preprocessing/image/__init__.py",
+    "keras/preprocessing/sequence/__init__.py",
+    "keras/preprocessing/text/__init__.py",
+    "keras/regularizers/__init__.py",
+    "keras/utils/__init__.py",
+    "keras/wrappers/__init__.py",
+    "keras/wrappers/scikit_learn/__init__.py",
+    "layers/__init__.py",
+    "linalg/__init__.py",
+    "logging/__init__.py",
+    "losses/__init__.py",
+    "manip/__init__.py",
+    "math/__init__.py",
+    "metrics/__init__.py",
+    "nn/__init__.py",
+    "nn/rnn_cell/__init__.py",
+    "profiler/__init__.py",
+    "python_io/__init__.py",
+    "resource_loader/__init__.py",
+    "strings/__init__.py",
+    "saved_model/__init__.py",
+    "saved_model/builder/__init__.py",
+    "saved_model/constants/__init__.py",
+    "saved_model/loader/__init__.py",
+    "saved_model/main_op/__init__.py",
+    "saved_model/signature_constants/__init__.py",
+    "saved_model/signature_def_utils/__init__.py",
+    "saved_model/tag_constants/__init__.py",
+    "saved_model/utils/__init__.py",
+    "sets/__init__.py",
+    "sparse/__init__.py",
+    "spectral/__init__.py",
+    "summary/__init__.py",
+    "sysconfig/__init__.py",
+    "test/__init__.py",
+    "train/__init__.py",
+    "train/queue_runner/__init__.py",
+    "user_ops/__init__.py",
+    # END GENERATED FILES
+]
+
+# Creates a genrule that generates a directory structure with __init__.py
+# files that import all exported modules (i.e. modules with tf_export
+# decorators).
+#
+# Args:
+#   name: name of genrule to create.
+#   output_files: List of __init__.py files that should be generated.
+#     This list should include the file name for every module exported using
+#     tf_export. For example, if an op is decorated with
+#     @tf_export('module1.module2', 'module3'), then output_files should
+#     include module1/module2/__init__.py and module3/__init__.py.
+#   root_init_template: Python init file that should be used as template for
+#     root __init__.py file. "# API IMPORTS PLACEHOLDER" comment inside this
+#     template will be replaced with root imports collected by this genrule.
+#   srcs: genrule sources. If passing root_init_template, the template file
+#     must be included in sources.
+def gen_api_init_files(name,
+                       output_files=TENSORFLOW_API_INIT_FILES,
+                       root_init_template=None,
+                       srcs=[]):
+  root_init_template_flag = ""
+  if root_init_template:
+    root_init_template_flag = "--root_init_template=$(location " + root_init_template + ")"
+  native.genrule(
+      name = name,
+      outs = output_files,
+      cmd = (
+          "$(location //tensorflow/tools/api/generator:create_python_api) " +
+          root_init_template_flag + " --apidir=$(@D) $(OUTS)"),
+      srcs = srcs,
+      tools = ["//tensorflow/tools/api/generator:create_python_api"],
+  )
diff --git a/tensorflow/tools/api/generator/create_python_api.py b/tensorflow/tools/api/generator/create_python_api.py
index 9cb137df5a..de0a50ab44 100644
--- a/tensorflow/tools/api/generator/create_python_api.py
+++ b/tensorflow/tools/api/generator/create_python_api.py
@@ -29,9 +29,13 @@ from tensorflow.python.util import tf_decorator
 
 _API_CONSTANTS_ATTR = '_tf_api_constants'
 _API_NAMES_ATTR = '_tf_api_names'
-_API_DIR = '/api/'
 _DEFAULT_PACKAGE = 'tensorflow.python'
-_OUTPUT_MODULE = 'tensorflow.tools.api.generator.api'
+_GENFILES_DIR_SUFFIX = 'genfiles/'
+_SYMBOLS_TO_SKIP_EXPLICITLY = {
+    # Overrides __getattr__, so that unwrapping tf_decorator
+    # would have side effects.
+    'tensorflow.python.platform.flags.FLAGS'
+}
 _GENERATED_FILE_HEADER = """\"\"\"Imports for Python API.
 
 This file is MACHINE GENERATED! Do not edit.
@@ -143,8 +147,8 @@ class _ModuleInitCodeBuilder(object):
     # the script outputs.
     module_text_map[''] = module_text_map.get('', '') + '''
 _names_with_underscore = [%s]
-__all__ = [s for s in dir() if not s.startswith('_')]
-__all__.extend([s for s in _names_with_underscore])
+__all__ = [_s for _s in dir() if not _s.startswith('_')]
+__all__.extend([_s for _s in _names_with_underscore])
 ''' % underscore_names_str
 
     return module_text_map
@@ -177,6 +181,9 @@ def get_api_init_text(package):
       continue
 
     for module_contents_name in dir(module):
+      if (module.__name__ + '.' + module_contents_name
+          in _SYMBOLS_TO_SKIP_EXPLICITLY):
+        continue
       attr = getattr(module, module_contents_name)
 
       # If attr is _tf_api_constants attribute, then add the constants.
@@ -189,7 +196,11 @@ def get_api_init_text(package):
                 -1, dest_module, module.__name__, value, names[-1])
         continue
 
-      _, attr = tf_decorator.unwrap(attr)
+      try:
+        _, attr = tf_decorator.unwrap(attr)
+      except Exception as e:
+        print('5555: %s %s' % (module, module_contents_name), file=sys.stderr)
+        raise e
       # If attr is a symbol with _tf_api_names attribute, then
       # add import for it.
       if hasattr(attr, '__dict__') and _API_NAMES_ATTR in attr.__dict__:
@@ -204,6 +215,7 @@ def get_api_init_text(package):
   # For e.g. if we import 'foo.bar.Value'. Then, we also
   # import 'bar' in 'foo'.
   imported_modules = set(module_code_builder.module_imports.keys())
+  import_from = '.'
   for module in imported_modules:
     if not module:
       continue
@@ -211,11 +223,9 @@ def get_api_init_text(package):
     parent_module = ''  # we import submodules in their parent_module
 
     for submodule_index in range(len(module_split)):
-      import_from = _OUTPUT_MODULE
       if submodule_index > 0:
         parent_module += ('.' + module_split[submodule_index-1] if parent_module
                           else module_split[submodule_index-1])
-        import_from += '.' + parent_module
       module_code_builder.add_import(
           -1, parent_module, import_from,
           module_split[submodule_index], module_split[submodule_index])
@@ -223,7 +233,24 @@ def get_api_init_text(package):
   return module_code_builder.build()
 
 
-def create_api_files(output_files, package):
+def get_module(dir_path, relative_to_dir):
+  """Get module that corresponds to path relative to relative_to_dir.
+
+  Args:
+    dir_path: Path to directory.
+    relative_to_dir: Get module relative to this directory.
+
+  Returns:
+    module that corresponds to the given directory.
+  """
+  dir_path = dir_path[len(relative_to_dir):]
+  # Convert path separators to '/' for easier parsing below.
+  dir_path = dir_path.replace(os.sep, '/')
+  return dir_path.replace('/', '.').strip('.')
+
+
+def create_api_files(
+    output_files, package, root_init_template, output_dir):
   """Creates __init__.py files for the Python API.
 
   Args:
@@ -231,6 +258,10 @@ def create_api_files(output_files, package):
       Each file must be under api/ directory.
     package: Base python package containing python with target tf_export
       decorators.
+    root_init_template: Template for top-level __init__.py file.
+      "# API IMPORTS PLACEHOLDER" comment in the template file will be replaced
+      with imports.
+    output_dir: output API root directory.
 
   Raises:
     ValueError: if an output file is not under api/ directory,
@@ -238,18 +269,7 @@ def create_api_files(output_files, package):
   """
   module_name_to_file_path = {}
   for output_file in output_files:
-    # Convert path separators to '/' for easier parsing below.
-    normalized_output_file = output_file.replace(os.sep, '/')
-    if _API_DIR not in output_file:
-      raise ValueError(
-          'Output files must be in api/ directory, found %s.' % output_file)
-    # Get the module name that corresponds to output_file.
-    # First get module directory under _API_DIR.
-    module_dir = os.path.dirname(
-        normalized_output_file[
-            normalized_output_file.rfind(_API_DIR)+len(_API_DIR):])
-    # Convert / to .
-    module_name = module_dir.replace('/', '.').strip('.')
+    module_name = get_module(os.path.dirname(output_file), output_dir)
     module_name_to_file_path[module_name] = os.path.normpath(output_file)
 
   # Create file for each expected output in genrule.
@@ -265,12 +285,20 @@ def create_api_files(output_files, package):
   for module, text in module_text_map.items():
     # Make sure genrule output file list is in sync with API exports.
     if module not in module_name_to_file_path:
-      module_file_path = '"api/%s/__init__.py"' %  (
+      module_file_path = '"%s/__init__.py"' %  (
           module.replace('.', '/'))
       missing_output_files.append(module_file_path)
       continue
+    contents = ''
+    if module or not root_init_template:
+      contents = _GENERATED_FILE_HEADER + text
+    else:
+      # Read base init file
+      with open(root_init_template, 'r') as root_init_template_file:
+        contents = root_init_template_file.read()
+        contents = contents.replace('# API IMPORTS PLACEHOLDER', text)
     with open(module_name_to_file_path[module], 'w') as fp:
-      fp.write(_GENERATED_FILE_HEADER + text)
+      fp.write(contents)
 
   if missing_output_files:
     raise ValueError(
@@ -292,6 +320,16 @@ def main():
       '--package', default=_DEFAULT_PACKAGE, type=str,
       help='Base package that imports modules containing the target tf_export '
            'decorators.')
+  parser.add_argument(
+      '--root_init_template', default='', type=str,
+      help='Template for top level __init__.py file. '
+           '"#API IMPORTS PLACEHOLDER" comment will be replaced with imports.')
+  parser.add_argument(
+      '--apidir', type=str, required=True,
+      help='Directory where generated output files are placed. '
+           'gendir should be a prefix of apidir. Also, apidir '
+           'should be a prefix of every directory in outputs.')
+
   args = parser.parse_args()
 
   if len(args.outputs) == 1:
@@ -304,7 +342,8 @@ def main():
 
   # Populate `sys.modules` with modules containing tf_export().
   importlib.import_module(args.package)
-  create_api_files(outputs, args.package)
+  create_api_files(
+      outputs, args.package, args.root_init_template, args.apidir)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/user_ops/BUILD b/tensorflow/user_ops/BUILD
deleted file mode 100644
index 71443cc41e..0000000000
--- a/tensorflow/user_ops/BUILD
+++ /dev/null
@@ -1,52 +0,0 @@
-# Description:
-# An example for custom op and kernel defined as a TensorFlow plugin.
-
-package(
-    default_visibility = ["//tensorflow:internal"],
-)
-
-licenses(["notice"])  # Apache 2.0
-
-exports_files(["LICENSE"])
-
-load("//tensorflow:tensorflow.bzl", "tf_py_test")
-load("//tensorflow:tensorflow.bzl", "tf_custom_op_library")
-
-tf_custom_op_library(
-    name = "ackermann_op.so",
-    srcs = ["ackermann_op.cc"],
-)
-
-tf_py_test(
-    name = "ackermann_test",
-    size = "small",
-    srcs = ["ackermann_test.py"],
-    additional_deps = ["//tensorflow:tensorflow_py"],
-    data = [":ackermann_op.so"],
-)
-
-tf_custom_op_library(
-    name = "duplicate_op.so",
-    srcs = ["duplicate_op.cc"],
-)
-
-tf_py_test(
-    name = "duplicate_op_test",
-    size = "small",
-    srcs = ["duplicate_op_test.py"],
-    additional_deps = ["//tensorflow:tensorflow_py"],
-    data = [":duplicate_op.so"],
-)
-
-tf_custom_op_library(
-    name = "invalid_op.so",
-    srcs = ["invalid_op.cc"],
-)
-
-tf_py_test(
-    name = "invalid_op_test",
-    size = "small",
-    srcs = ["invalid_op_test.py"],
-    additional_deps = ["//tensorflow:tensorflow_py"],
-    data = [":invalid_op.so"],
-)
-- 
GitLab


From b3adb58d84ebb91d893b647ab4081530460fb8ed Mon Sep 17 00:00:00 2001
From: Alexandre Passos <apassos@google.com>
Date: Thu, 31 May 2018 13:22:10 -0700
Subject: [PATCH 112/610] More eager notebooks.

PiperOrigin-RevId: 198768912
---
 .../notebooks/3_training_models.ipynb         |  54 +-
 .../examples/notebooks/4_high_level.ipynb     | 551 ++++++++++++++++++
 2 files changed, 599 insertions(+), 6 deletions(-)
 create mode 100644 tensorflow/contrib/eager/python/examples/notebooks/4_high_level.ipynb

diff --git a/tensorflow/contrib/eager/python/examples/notebooks/3_training_models.ipynb b/tensorflow/contrib/eager/python/examples/notebooks/3_training_models.ipynb
index d9a9bffbb4..84f1d031d4 100644
--- a/tensorflow/contrib/eager/python/examples/notebooks/3_training_models.ipynb
+++ b/tensorflow/contrib/eager/python/examples/notebooks/3_training_models.ipynb
@@ -54,11 +54,41 @@
       "source": [
         "## Variables\n",
         "\n",
-        "Neural networks are characterized by a set of parameters (sometimes called \"weights\", sometimes called \"variables\") with fixed shapes and types, where the actual values are computed and adjusted during the training process. The `tfe.Variable` object encapsulates such parameters.\n",
-        "\n",
-        "Recall that `Tensor` objects are immutable, i.e., the underlying value of the `Tensor` cannot be changed. `Variable` objects act like `Tensor`s but are mutable via calls to `assign`, `assign_add` etc.\n",
+        "Tensors in TensorFlow are immutable stateless objects. Machine learning models, however, need to have changing state: as your model trains, the same code to compute predictions should behave differently over time (hopefully with a lower loss!). To represent this state which needs to change over the course of your computation, you can choose to rely on the fact that Python is a stateful programming language:\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        },
+        "colab_type": "code",
+        "id": "VkJwtLS_Jbn8"
+      },
+      "outputs": [],
+      "source": [
+        "# Using python state\n",
+        "x = tf.zeros([10, 10])\n",
+        "x += 2  # This is equivalent to x = x + 2, which does not mutate the original\n",
+        "        # value of x\n",
+        "print(x)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "wfneTXy7JcUz"
+      },
+      "source": [
+        "TensorFlow, however, has stateful operations built in, and these are often more pleasant to use than low-level Python representations of your state. To represent weights in a model, for example, it's often convenient and efficient to use TensorFlow variables.\n",
         "\n",
-        "For example:"
+        "A Variable is an object which stores a value and, when used in a TensorFlow computation, will implicitly read from this stored value. There are operations (`tf.assign_sub`, `tf.scatter_update`, etc) which manipulate the value stored in a TensorFlow variable."
       ]
     },
     {
@@ -88,6 +118,18 @@
         "assert v.numpy() == 9.0"
       ]
     },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "-paSaeq1JzwC"
+      },
+      "source": [
+        "Computations using Variables are automatically traced when computing gradients. For Variables representing embeddings TensorFlow will do sparse updates by default, which are more computation and memory efficient.\n",
+        "\n",
+        "Using Variables is also a way to quickly let a reader of your code know that this piece of state is mutable."
+      ]
+    },
     {
       "cell_type": "markdown",
       "metadata": {
@@ -228,7 +270,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 6,
+      "execution_count": 0,
       "metadata": {
         "colab": {
           "autoexec": {
@@ -331,7 +373,7 @@
     },
     {
       "cell_type": "code",
-      "execution_count": 10,
+      "execution_count": 0,
       "metadata": {
         "colab": {
           "autoexec": {
diff --git a/tensorflow/contrib/eager/python/examples/notebooks/4_high_level.ipynb b/tensorflow/contrib/eager/python/examples/notebooks/4_high_level.ipynb
new file mode 100644
index 0000000000..4fe3a0e3f3
--- /dev/null
+++ b/tensorflow/contrib/eager/python/examples/notebooks/4_high_level.ipynb
@@ -0,0 +1,551 @@
+{
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        },
+        "colab_type": "code",
+        "id": "pwX7Fii1rwsJ"
+      },
+      "outputs": [],
+      "source": [
+        "import tensorflow as tf\n",
+        "tf.enable_eager_execution()\n",
+        "tfe = tf.contrib.eager\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "UEu3q4jmpKVT"
+      },
+      "source": [
+        "# High level API\n",
+        "\n",
+        "We recommend using `tf.keras` as a high-level API for building neural networks. That said, most TensorFlow APIs are usable with eager execution.\n",
+        "\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "zSFfVVjkrrsI"
+      },
+      "source": [
+        "## Layers: common sets of useful operations\n",
+        "\n",
+        "Most of the time when writing code for machine learning models you want to operate at a higher level of abstraction than individual operations and manipulation of individual variables.\n",
+        "\n",
+        "Many machine learning models are expressible as the composition and stacking of relatively simple layers, and TensorFlow provides both a set of many common layers as well as easy ways for you to write your own application-specific layers either from scratch or as the composition of existing layers.\n",
+        "\n",
+        "TensorFlow includes the full [Keras](https://keras.io) API in the tf.keras package, and the Keras layers are very useful when building your own models.\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          }
+        },
+        "colab_type": "code",
+        "id": "8PyXlPl-4TzQ"
+      },
+      "outputs": [],
+      "source": [
+        "# In the tf.keras.layers package, layers are objects. To construct a layer,\n",
+        "# simply construct the object. Most layers take as a first argument the number\n",
+        "# of output dimensions / channels.\n",
+        "layer = tf.keras.layers.Dense(100)\n",
+        "# The number of input dimensions is often unnecessary, as it can be inferred\n",
+        "# the first time the layer is used, but it can be provided if you want to \n",
+        "# specify it manually, which is useful in some complex models.\n",
+        "layer = tf.keras.layers.Dense(10, input_shape=(None, 5))"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "Fn69xxPO5Psr"
+      },
+      "source": [
+        "The full list of pre-existing layers can be seen in [the documentation](https://www.tensorflow.org/api_docs/python/tf/keras/layers). It includes Dense (a fully-connected layer),\n",
+        "Conv2D, LSTM, BatchNormalization, Dropout, and many others."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 3,
+      "metadata": {
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          },
+          "height": 204
+        },
+        "colab_type": "code",
+        "executionInfo": {
+          "elapsed": 244,
+          "status": "ok",
+          "timestamp": 1527783641557,
+          "user": {
+            "displayName": "",
+            "photoUrl": "",
+            "userId": ""
+          },
+          "user_tz": 420
+        },
+        "id": "E3XKNknP5Mhb",
+        "outputId": "c5d52434-d980-4488-efa7-5660819d0207"
+      },
+      "outputs": [
+        {
+          "data": {
+            "text/plain": [
+              "\u003ctf.Tensor: id=30, shape=(10, 10), dtype=float32, numpy=\n",
+              "array([[ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],\n",
+              "       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],\n",
+              "       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],\n",
+              "       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],\n",
+              "       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],\n",
+              "       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],\n",
+              "       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],\n",
+              "       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],\n",
+              "       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],\n",
+              "       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]], dtype=float32)\u003e"
+            ]
+          },
+          "execution_count": 3,
+          "metadata": {
+            "tags": []
+          },
+          "output_type": "execute_result"
+        }
+      ],
+      "source": [
+        "# To use a layer, simply call it.\n",
+        "layer(tf.zeros([10, 5]))"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 4,
+      "metadata": {
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          },
+          "height": 221
+        },
+        "colab_type": "code",
+        "executionInfo": {
+          "elapsed": 320,
+          "status": "ok",
+          "timestamp": 1527783642457,
+          "user": {
+            "displayName": "",
+            "photoUrl": "",
+            "userId": ""
+          },
+          "user_tz": 420
+        },
+        "id": "Wt_Nsv-L5t2s",
+        "outputId": "f0d96dce-0128-4080-bfe2-0ee6fbc0ad90"
+      },
+      "outputs": [
+        {
+          "data": {
+            "text/plain": [
+              "[\u003ctf.Variable 'dense_1/kernel:0' shape=(5, 10) dtype=float32, numpy=\n",
+              " array([[ 0.43788117, -0.62099844, -0.30525017, -0.59352523,  0.1783089 ,\n",
+              "          0.47078604, -0.23620895, -0.30482283,  0.01366901, -0.1288507 ],\n",
+              "        [ 0.18407935, -0.56550485,  0.54180616, -0.42254075,  0.3702994 ,\n",
+              "          0.36705834, -0.29678228,  0.36660975,  0.36717761,  0.46269661],\n",
+              "        [ 0.1709305 , -0.11529458,  0.32710236,  0.46300393, -0.62802851,\n",
+              "          0.51641601,  0.39624029,  0.26918125, -0.25196898,  0.21353298],\n",
+              "        [ 0.35752094,  0.44161648,  0.61500639, -0.12653333,  0.41629118,\n",
+              "          0.36193585,  0.066082  , -0.59253877,  0.47318751,  0.17115968],\n",
+              "        [-0.22554061, -0.17727301,  0.5525015 ,  0.3678053 , -0.00454676,\n",
+              "          0.24066836, -0.53640735,  0.13792562, -0.10727292,  0.59708995]], dtype=float32)\u003e,\n",
+              " \u003ctf.Variable 'dense_1/bias:0' shape=(10,) dtype=float32, numpy=array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.], dtype=float32)\u003e]"
+            ]
+          },
+          "execution_count": 4,
+          "metadata": {
+            "tags": []
+          },
+          "output_type": "execute_result"
+        }
+      ],
+      "source": [
+        "# Layers have many useful methods. For example, you can inspect all variables\n",
+        "# in a layer by calling layer.variables. In this case a fully-connected layer\n",
+        "# will have variables for weights and biases.\n",
+        "layer.variables"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 5,
+      "metadata": {
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          },
+          "height": 221
+        },
+        "colab_type": "code",
+        "executionInfo": {
+          "elapsed": 226,
+          "status": "ok",
+          "timestamp": 1527783643252,
+          "user": {
+            "displayName": "",
+            "photoUrl": "",
+            "userId": ""
+          },
+          "user_tz": 420
+        },
+        "id": "6ilvKjz8_4MQ",
+        "outputId": "f647fced-c2d7-41a3-c237-242036784665"
+      },
+      "outputs": [
+        {
+          "data": {
+            "text/plain": [
+              "(\u003ctf.Variable 'dense_1/kernel:0' shape=(5, 10) dtype=float32, numpy=\n",
+              " array([[ 0.43788117, -0.62099844, -0.30525017, -0.59352523,  0.1783089 ,\n",
+              "          0.47078604, -0.23620895, -0.30482283,  0.01366901, -0.1288507 ],\n",
+              "        [ 0.18407935, -0.56550485,  0.54180616, -0.42254075,  0.3702994 ,\n",
+              "          0.36705834, -0.29678228,  0.36660975,  0.36717761,  0.46269661],\n",
+              "        [ 0.1709305 , -0.11529458,  0.32710236,  0.46300393, -0.62802851,\n",
+              "          0.51641601,  0.39624029,  0.26918125, -0.25196898,  0.21353298],\n",
+              "        [ 0.35752094,  0.44161648,  0.61500639, -0.12653333,  0.41629118,\n",
+              "          0.36193585,  0.066082  , -0.59253877,  0.47318751,  0.17115968],\n",
+              "        [-0.22554061, -0.17727301,  0.5525015 ,  0.3678053 , -0.00454676,\n",
+              "          0.24066836, -0.53640735,  0.13792562, -0.10727292,  0.59708995]], dtype=float32)\u003e,\n",
+              " \u003ctf.Variable 'dense_1/bias:0' shape=(10,) dtype=float32, numpy=array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.], dtype=float32)\u003e)"
+            ]
+          },
+          "execution_count": 5,
+          "metadata": {
+            "tags": []
+          },
+          "output_type": "execute_result"
+        }
+      ],
+      "source": [
+        "# The variables are also accessible through nice accessors\n",
+        "layer.kernel, layer.bias"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "O0kDbE54-5VS"
+      },
+      "source": [
+        "## Implementing custom layers\n",
+        "The best way to implement your own layer is extending the tf.keras.layers.Layer class and implementing:\n",
+        "  *  `__init__` , where you can do all input-independent initialization\n",
+        "  * `build`, where you know the shapes of the input tensors and can do the rest of the initialization\n",
+        "  * `call`, where you do the forward computation\n",
+        "\n",
+        "Note that you don't have to wait until `build` is called to create your variables, you can also create them in `__init__`. However, the advantage of creating them in `build` is that it enables late variable creation based on the shape of the inputs the layer will operate on. On the other hand, creating variables in `__init__` would mean that the shapes required to create the variables will need to be explicitly specified."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 7,
+      "metadata": {
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          },
+          "height": 391
+        },
+        "colab_type": "code",
+        "executionInfo": {
+          "elapsed": 251,
+          "status": "ok",
+          "timestamp": 1527783661512,
+          "user": {
+            "displayName": "",
+            "photoUrl": "",
+            "userId": ""
+          },
+          "user_tz": 420
+        },
+        "id": "5Byl3n1k5kIy",
+        "outputId": "6e7f9285-649a-4132-82ce-73ea92f15862"
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "tf.Tensor(\n",
+            "[[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]\n",
+            " [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]\n",
+            " [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]\n",
+            " [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]\n",
+            " [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]\n",
+            " [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]\n",
+            " [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]\n",
+            " [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]\n",
+            " [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]\n",
+            " [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.]], shape=(10, 10), dtype=float32)\n",
+            "[\u003ctf.Variable 'my_dense_layer_1/kernel:0' shape=(5, 10) dtype=float32, numpy=\n",
+            "array([[-0.4011991 ,  0.22458655, -0.33237562, -0.25117266,  0.33528614,\n",
+            "        -0.01392961,  0.58580834, -0.16346583,  0.28465688, -0.47191954],\n",
+            "       [-0.52922136,  0.22416979, -0.58209574, -0.60914612,  0.05226624,\n",
+            "        -0.18325993,  0.5591442 , -0.24718609,  0.37148207,  0.40475875],\n",
+            "       [ 0.16912812, -0.47618777, -0.38989353,  0.30105609, -0.08085585,\n",
+            "         0.44758242,  0.545829  ,  0.51421839,  0.11063248,  0.20159996],\n",
+            "       [ 0.34073615, -0.59835428,  0.06498981, -0.44489855, -0.34302285,\n",
+            "         0.20969599,  0.35527444, -0.03173476, -0.22227573,  0.09303057],\n",
+            "       [ 0.41764337, -0.06435019, -0.52509922, -0.39957345,  0.56811184,\n",
+            "         0.23481232, -0.61666459,  0.31144124, -0.11532354, -0.42421889]], dtype=float32)\u003e]\n"
+          ]
+        }
+      ],
+      "source": [
+        "class MyDenseLayer(tf.keras.layers.Layer):\n",
+        "  def __init__(self, num_outputs):\n",
+        "    super(MyDenseLayer, self).__init__()\n",
+        "    self.num_outputs = num_outputs\n",
+        "    \n",
+        "  def build(self, input_shape):\n",
+        "    self.kernel = self.add_variable(\"kernel\", \n",
+        "                                    shape=[input_shape[-1].value, \n",
+        "                                           self.num_outputs])\n",
+        "    \n",
+        "  def call(self, input):\n",
+        "    return tf.matmul(input, self.kernel)\n",
+        "  \n",
+        "layer = MyDenseLayer(10)\n",
+        "print(layer(tf.zeros([10, 5])))\n",
+        "print(layer.variables)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "tk8E2vY0-z4Z"
+      },
+      "source": [
+        "Note that you don't have to wait until `build` is called to create your variables, you can also create them in `__init__`.\n",
+        "\n",
+        "Overall code is easier to read and maintain if it uses standard layers whenever possible, as other readers will be familiar with the behavior of standard layers. If you want to use a layer which is not present in tf.keras.layers or tf.contrib.layers, consider filing a [github issue](http://github.com/tensorflow/tensorflow/issues/new) or, even better, sending us a pull request!"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "Qhg4KlbKrs3G"
+      },
+      "source": [
+        "## Models: composing layers\n",
+        "\n",
+        "Many interesting layer-like things in machine learning models are implemented by composing existing layers. For example, each residual block in a resnet is a composition of convolutions, batch normalizations, and a shortcut.\n",
+        "\n",
+        "The main class used when creating a layer-like thing which contains other layers is tf.keras.Model. Implementing one is done by inheriting from tf.keras.Model."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 9,
+      "metadata": {
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          },
+          "height": 190
+        },
+        "colab_type": "code",
+        "executionInfo": {
+          "elapsed": 420,
+          "status": "ok",
+          "timestamp": 1527783698512,
+          "user": {
+            "displayName": "",
+            "photoUrl": "",
+            "userId": ""
+          },
+          "user_tz": 420
+        },
+        "id": "N30DTXiRASlb",
+        "outputId": "a8b23a8e-5cf9-4bbf-f93b-6c763d74e2b3"
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "tf.Tensor(\n",
+            "[[[[ 0.  0.  0.]\n",
+            "   [ 0.  0.  0.]\n",
+            "   [ 0.  0.  0.]]\n",
+            "\n",
+            "  [[ 0.  0.  0.]\n",
+            "   [ 0.  0.  0.]\n",
+            "   [ 0.  0.  0.]]]], shape=(1, 2, 3, 3), dtype=float32)\n",
+            "['resnet_identity_block_1/conv2d_3/kernel:0', 'resnet_identity_block_1/conv2d_3/bias:0', 'resnet_identity_block_1/batch_normalization_3/gamma:0', 'resnet_identity_block_1/batch_normalization_3/beta:0', 'resnet_identity_block_1/conv2d_4/kernel:0', 'resnet_identity_block_1/conv2d_4/bias:0', 'resnet_identity_block_1/batch_normalization_4/gamma:0', 'resnet_identity_block_1/batch_normalization_4/beta:0', 'resnet_identity_block_1/conv2d_5/kernel:0', 'resnet_identity_block_1/conv2d_5/bias:0', 'resnet_identity_block_1/batch_normalization_5/gamma:0', 'resnet_identity_block_1/batch_normalization_5/beta:0', 'resnet_identity_block_1/batch_normalization_3/moving_mean:0', 'resnet_identity_block_1/batch_normalization_3/moving_variance:0', 'resnet_identity_block_1/batch_normalization_4/moving_mean:0', 'resnet_identity_block_1/batch_normalization_4/moving_variance:0', 'resnet_identity_block_1/batch_normalization_5/moving_mean:0', 'resnet_identity_block_1/batch_normalization_5/moving_variance:0']\n"
+          ]
+        }
+      ],
+      "source": [
+        "class ResnetIdentityBlock(tf.keras.Model):\n",
+        "  def __init__(self, kernel_size, filters):\n",
+        "    super(ResnetIdentityBlock, self).__init__(name='')\n",
+        "    filters1, filters2, filters3 = filters\n",
+        "\n",
+        "    self.conv2a = tf.keras.layers.Conv2D(filters1, (1, 1))\n",
+        "    self.bn2a = tf.keras.layers.BatchNormalization()\n",
+        "\n",
+        "    self.conv2b = tf.keras.layers.Conv2D(filters2, kernel_size, padding='same')\n",
+        "    self.bn2b = tf.keras.layers.BatchNormalization()\n",
+        "\n",
+        "    self.conv2c = tf.keras.layers.Conv2D(filters3, (1, 1))\n",
+        "    self.bn2c = tf.keras.layers.BatchNormalization()\n",
+        "\n",
+        "  def call(self, input_tensor, training=False):\n",
+        "    x = self.conv2a(input_tensor)\n",
+        "    x = self.bn2a(x, training=training)\n",
+        "    x = tf.nn.relu(x)\n",
+        "\n",
+        "    x = self.conv2b(x)\n",
+        "    x = self.bn2b(x, training=training)\n",
+        "    x = tf.nn.relu(x)\n",
+        "\n",
+        "    x = self.conv2c(x)\n",
+        "    x = self.bn2c(x, training=training)\n",
+        "\n",
+        "    x += input_tensor\n",
+        "    return tf.nn.relu(x)\n",
+        "\n",
+        "    \n",
+        "block = ResnetIdentityBlock(1, [1, 2, 3])\n",
+        "print(block(tf.zeros([1, 2, 3, 3])))\n",
+        "print([x.name for x in block.variables])"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "wYfucVw65PMj"
+      },
+      "source": [
+        "Much of the time, however, models which compose many layers simply call one layer after the other. This can be done in very little code using tf.keras.Sequential"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 0,
+      "metadata": {
+        "colab": {
+          "autoexec": {
+            "startup": false,
+            "wait_interval": 0
+          },
+          "base_uri": "https://localhost:8080/",
+          "height": 153
+        },
+        "colab_type": "code",
+        "executionInfo": {
+          "elapsed": 361,
+          "status": "ok",
+          "timestamp": 1526674830777,
+          "user": {
+            "displayName": "Alexandre Passos",
+            "photoUrl": "//lh4.googleusercontent.com/-kmTTWXEgAPw/AAAAAAAAAAI/AAAAAAAAAC0/q_DoOzKGwds/s50-c-k-no/photo.jpg",
+            "userId": "108023195365833072773"
+          },
+          "user_tz": 420
+        },
+        "id": "L9frk7Ur4uvJ",
+        "outputId": "882e9076-b6d9-4380-bb1e-7c6b57d54c39"
+      },
+      "outputs": [
+        {
+          "data": {
+            "text/plain": [
+              "\u003ctf.Tensor: id=1423, shape=(1, 2, 3, 3), dtype=float32, numpy=\n",
+              "array([[[[0., 0., 0.],\n",
+              "         [0., 0., 0.],\n",
+              "         [0., 0., 0.]],\n",
+              "\n",
+              "        [[0., 0., 0.],\n",
+              "         [0., 0., 0.],\n",
+              "         [0., 0., 0.]]]], dtype=float32)\u003e"
+            ]
+          },
+          "execution_count": 26,
+          "metadata": {
+            "tags": []
+          },
+          "output_type": "execute_result"
+        }
+      ],
+      "source": [
+        " my_seq = tf.keras.Sequential([tf.keras.layers.Conv2D(1, (1, 1)),\n",
+        "                               tf.keras.layers.BatchNormalization(),\n",
+        "                               tf.keras.layers.Conv2D(2, 1, \n",
+        "                                                      padding='same'),\n",
+        "                               tf.keras.layers.BatchNormalization(),\n",
+        "                               tf.keras.layers.Conv2D(3, (1, 1)),\n",
+        "                               tf.keras.layers.BatchNormalization()])\n",
+        "my_seq(tf.zeros([1, 2, 3, 3]))"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "colab_type": "text",
+        "id": "c5YwYcnuK-wc"
+      },
+      "source": [
+        "# Next steps\n",
+        "\n",
+        "Now you can go back to the previous notebook and adapt the linear regression example to use layers and models so that it is better structured."
+      ]
+    }
+  ],
+  "metadata": {
+    "colab": {
+      "collapsed_sections": [],
+      "default_view": {},
+      "name": "4 - High level API - TensorFlow Eager.ipynb",
+      "provenance": [],
+      "version": "0.3.2",
+      "views": {}
+    },
+    "kernelspec": {
+      "display_name": "Python 3",
+      "name": "python3"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
-- 
GitLab


From 89a55fef3316e0e270e0f87f71bd8c2d32443cc8 Mon Sep 17 00:00:00 2001
From: Jiri Simsa <jsimsa@google.com>
Date: Thu, 31 May 2018 13:43:43 -0700
Subject: [PATCH 113/610] [tf.data] Changing signature of `MakeIterator` to
 enable propagating error status.

PiperOrigin-RevId: 198772254
---
 .../contrib/data/kernels/csv_dataset_op.cc    |  2 +-
 .../kernels/directed_interleave_dataset_op.cc | 24 ++++++++++-------
 .../data/kernels/ignore_errors_dataset_op.cc  |  9 ++++---
 .../data/kernels/threadpool_dataset_op.cc     |  9 ++++---
 .../contrib/data/kernels/unique_dataset_op.cc |  9 ++++---
 .../kafka/kernels/kafka_dataset_ops.cc        |  2 +-
 tensorflow/core/framework/dataset.h           | 16 ++++++++---
 .../core/kernels/data/batch_dataset_op.cc     |  9 ++++---
 .../core/kernels/data/cache_dataset_ops.cc    | 14 +++++++---
 .../kernels/data/concatenate_dataset_op.cc    | 20 +++++++-------
 tensorflow/core/kernels/data/dataset_utils.cc |  5 ++--
 .../data/dense_to_sparse_batch_dataset_op.cc  | 10 ++++---
 .../core/kernels/data/filter_dataset_op.cc    |  9 ++++---
 .../core/kernels/data/flat_map_dataset_op.cc  | 12 ++++++---
 .../core/kernels/data/generator_dataset_op.cc |  2 +-
 .../data/group_by_reducer_dataset_op.cc       |  9 ++++---
 .../data/group_by_window_dataset_op.cc        | 13 +++++----
 .../kernels/data/interleave_dataset_op.cc     |  9 ++++---
 tensorflow/core/kernels/data/iterator_ops.cc  | 26 +++++++++++++-----
 .../kernels/data/map_and_batch_dataset_op.cc  |  9 ++++---
 .../core/kernels/data/map_dataset_op.cc       | 11 +++++---
 .../kernels/data/padded_batch_dataset_op.cc   | 12 ++++++---
 .../data/parallel_interleave_dataset_op.cc    |  7 +++--
 .../kernels/data/parallel_map_dataset_op.cc   |  7 +++--
 .../core/kernels/data/prefetch_dataset_op.cc  |  9 ++++---
 .../core/kernels/data/random_dataset_op.cc    |  2 +-
 .../core/kernels/data/range_dataset_op.cc     |  2 +-
 .../core/kernels/data/reader_dataset_ops.cc   |  6 ++---
 .../core/kernels/data/repeat_dataset_op.cc    | 19 ++++++++-----
 .../core/kernels/data/scan_dataset_op.cc      |  9 ++++---
 .../core/kernels/data/shuffle_dataset_op.cc   | 15 ++++++-----
 .../core/kernels/data/skip_dataset_op.cc      | 13 +++++----
 .../core/kernels/data/slide_dataset_op.cc     | 27 ++++++++++++-------
 .../data/sparse_tensor_slice_dataset_op.cc    |  2 +-
 .../core/kernels/data/sql_dataset_ops.cc      |  2 +-
 .../data/stats_aggregator_dataset_op.cc       |  9 ++++---
 .../core/kernels/data/stats_dataset_ops.cc    | 18 ++++++++-----
 .../core/kernels/data/take_dataset_op.cc      | 17 ++++++------
 .../core/kernels/data/tensor_dataset_op.cc    |  2 +-
 .../kernels/data/tensor_queue_dataset_op.cc   | 23 +++++++++-------
 .../kernels/data/tensor_slice_dataset_op.cc   |  2 +-
 .../core/kernels/data/unbatch_dataset_op.cc   |  7 +++--
 .../core/kernels/data/window_dataset.cc       |  2 +-
 tensorflow/core/kernels/data/writer_ops.cc    |  8 ++++--
 .../core/kernels/data/zip_dataset_op.cc       | 17 +++++++-----
 45 files changed, 295 insertions(+), 171 deletions(-)

diff --git a/tensorflow/contrib/data/kernels/csv_dataset_op.cc b/tensorflow/contrib/data/kernels/csv_dataset_op.cc
index 76e54a284e..b16e66258b 100644
--- a/tensorflow/contrib/data/kernels/csv_dataset_op.cc
+++ b/tensorflow/contrib/data/kernels/csv_dataset_op.cc
@@ -133,7 +133,7 @@ class CSVDatasetOp : public DatasetOpKernel {
           delim_(delim),
           na_value_(std::move(na_value)) {}
 
-    std::unique_ptr<IteratorBase> MakeIterator(
+    std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
       return std::unique_ptr<IteratorBase>(
           new Iterator({this, strings::StrCat(prefix, "::CSV")}));
diff --git a/tensorflow/contrib/data/kernels/directed_interleave_dataset_op.cc b/tensorflow/contrib/data/kernels/directed_interleave_dataset_op.cc
index 48d3734162..bdff379bfa 100644
--- a/tensorflow/contrib/data/kernels/directed_interleave_dataset_op.cc
+++ b/tensorflow/contrib/data/kernels/directed_interleave_dataset_op.cc
@@ -91,7 +91,7 @@ class DirectedInterleaveDatasetOp : public DatasetOpKernel {
       }
     }
 
-    std::unique_ptr<IteratorBase> MakeIterator(
+    std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
       return std::unique_ptr<IteratorBase>(new Iterator(
           {this, strings::StrCat(prefix, "::DirectedInterleave")}));
@@ -130,15 +130,21 @@ class DirectedInterleaveDatasetOp : public DatasetOpKernel {
      public:
       explicit Iterator(const Params& params)
           : DatasetIterator<Dataset>(params),
-            selector_input_impl_(params.dataset->selector_input_->MakeIterator(
-                params.prefix + ".selector")),
-            num_active_inputs_(params.dataset->data_inputs_.size()) {
-        data_input_impls_.reserve(params.dataset->data_inputs_.size());
-        for (size_t i = 0; i < params.dataset->data_inputs_.size(); ++i) {
-          const DatasetBase* data_input = params.dataset->data_inputs_[i];
-          data_input_impls_.push_back(data_input->MakeIterator(
-              strings::StrCat(params.prefix, "[", i, "]")));
+            num_active_inputs_(params.dataset->data_inputs_.size()) {}
+
+      Status Initialize(IteratorContext* ctx) override {
+        mutex_lock l(mu_);
+        TF_RETURN_IF_ERROR(dataset()->selector_input_->MakeIterator(
+            ctx, strings::StrCat(prefix(), ".selector"),
+            &selector_input_impl_));
+        data_input_impls_.resize(dataset()->data_inputs_.size());
+        for (size_t i = 0; i < data_input_impls_.size(); ++i) {
+          const DatasetBase* data_input = dataset()->data_inputs_[i];
+          TF_RETURN_IF_ERROR(data_input->MakeIterator(
+              ctx, strings::StrCat(prefix(), "[", i, "]"),
+              &data_input_impls_[i]));
         }
+        return Status::OK();
       }
 
       Status GetNextInternal(IteratorContext* ctx,
diff --git a/tensorflow/contrib/data/kernels/ignore_errors_dataset_op.cc b/tensorflow/contrib/data/kernels/ignore_errors_dataset_op.cc
index bb29df60e8..c3759b68d9 100644
--- a/tensorflow/contrib/data/kernels/ignore_errors_dataset_op.cc
+++ b/tensorflow/contrib/data/kernels/ignore_errors_dataset_op.cc
@@ -44,7 +44,7 @@ class IgnoreErrorsDatasetOp : public UnaryDatasetOpKernel {
 
     ~Dataset() override { input_->Unref(); }
 
-    std::unique_ptr<IteratorBase> MakeIterator(
+    std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
       return std::unique_ptr<IteratorBase>(
           new Iterator({this, strings::StrCat(prefix, "::IgnoreErrors")}));
@@ -72,8 +72,11 @@ class IgnoreErrorsDatasetOp : public UnaryDatasetOpKernel {
     class Iterator : public DatasetIterator<Dataset> {
      public:
       explicit Iterator(const Params& params)
-          : DatasetIterator<Dataset>(params),
-            input_impl_(params.dataset->input_->MakeIterator(params.prefix)) {}
+          : DatasetIterator<Dataset>(params) {}
+
+      Status Initialize(IteratorContext* ctx) override {
+        return dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_);
+      }
 
       Status GetNextInternal(IteratorContext* ctx,
                              std::vector<Tensor>* out_tensors,
diff --git a/tensorflow/contrib/data/kernels/threadpool_dataset_op.cc b/tensorflow/contrib/data/kernels/threadpool_dataset_op.cc
index 63e19ae3f8..7cf01f6a07 100644
--- a/tensorflow/contrib/data/kernels/threadpool_dataset_op.cc
+++ b/tensorflow/contrib/data/kernels/threadpool_dataset_op.cc
@@ -127,7 +127,7 @@ class ThreadPoolDatasetOp : public UnaryDatasetOpKernel {
       threadpool_->Unref();
     }
 
-    std::unique_ptr<IteratorBase> MakeIterator(
+    std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
       return std::unique_ptr<IteratorBase>(
           new Iterator({this, strings::StrCat(prefix, "::ThreadPool")}));
@@ -154,8 +154,11 @@ class ThreadPoolDatasetOp : public UnaryDatasetOpKernel {
     class Iterator : public DatasetIterator<Dataset> {
      public:
       explicit Iterator(const Params& params)
-          : DatasetIterator<Dataset>(params),
-            input_impl_(params.dataset->input_->MakeIterator(params.prefix)) {}
+          : DatasetIterator<Dataset>(params) {}
+
+      Status Initialize(IteratorContext* ctx) override {
+        return dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_);
+      }
 
       Status GetNextInternal(IteratorContext* ctx,
                              std::vector<Tensor>* out_tensors,
diff --git a/tensorflow/contrib/data/kernels/unique_dataset_op.cc b/tensorflow/contrib/data/kernels/unique_dataset_op.cc
index 69fbb0fcdc..652913d6b2 100644
--- a/tensorflow/contrib/data/kernels/unique_dataset_op.cc
+++ b/tensorflow/contrib/data/kernels/unique_dataset_op.cc
@@ -56,7 +56,7 @@ class UniqueDatasetOp : public UnaryDatasetOpKernel {
 
     ~Dataset() override { input_->Unref(); }
 
-    std::unique_ptr<IteratorBase> MakeIterator(
+    std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
       return std::unique_ptr<IteratorBase>(
           new Iterator({this, strings::StrCat(prefix, "::Unique")}));
@@ -87,8 +87,11 @@ class UniqueDatasetOp : public UnaryDatasetOpKernel {
     class Iterator : public DatasetIterator<Dataset> {
      public:
       explicit Iterator(const typename Iterator::Params& params)
-          : DatasetIterator<Dataset>(params),
-            input_impl_(params.dataset->input_->MakeIterator(params.prefix)) {}
+          : DatasetIterator<Dataset>(params) {}
+
+      Status Initialize(IteratorContext* ctx) override {
+        return dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_);
+      }
 
       Status GetNextInternal(IteratorContext* ctx,
                              std::vector<Tensor>* out_tensors,
diff --git a/tensorflow/contrib/kafka/kernels/kafka_dataset_ops.cc b/tensorflow/contrib/kafka/kernels/kafka_dataset_ops.cc
index a4cd4a2cc4..7b08cfa095 100644
--- a/tensorflow/contrib/kafka/kernels/kafka_dataset_ops.cc
+++ b/tensorflow/contrib/kafka/kernels/kafka_dataset_ops.cc
@@ -64,7 +64,7 @@ class KafkaDatasetOp : public DatasetOpKernel {
           eof_(eof),
           timeout_(timeout) {}
 
-    std::unique_ptr<IteratorBase> MakeIterator(
+    std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
       return std::unique_ptr<IteratorBase>(
           new Iterator({this, strings::StrCat(prefix, "::Kafka")}));
diff --git a/tensorflow/core/framework/dataset.h b/tensorflow/core/framework/dataset.h
index 8624af9bf5..0f352ea559 100644
--- a/tensorflow/core/framework/dataset.h
+++ b/tensorflow/core/framework/dataset.h
@@ -351,6 +351,10 @@ class IteratorBase {
   // in the outputs of this iterator.
   virtual const std::vector<PartialTensorShape>& output_shapes() const = 0;
 
+  // Performs initialization that needs to happen outside of a constructor to
+  // properly propagate errors.
+  virtual Status Initialize(IteratorContext* ctx) { return Status::OK(); }
+
   // Saves the state of this iterator.
   virtual Status Save(OpKernelContext* ctx, IteratorStateWriter* writer) {
     return SaveInternal(writer);
@@ -402,12 +406,13 @@ class DatasetBase : public core::RefCounted {
   // iterator will traverse all elements in this dataset from the
   // start.
   //
-  // Ownership of the created iterator will be transferred to the caller.
-  //
   // The prefix identifies the sequence of iterators leading up to the newly
   // created iterator.
-  virtual std::unique_ptr<IteratorBase> MakeIterator(
-      const string& prefix) const = 0;
+  Status MakeIterator(IteratorContext* ctx, const string& prefix,
+                      std::unique_ptr<IteratorBase>* iterator) const {
+    *iterator = MakeIteratorInternal(prefix);
+    return (*iterator)->Initialize(ctx);
+  }
 
   // Returns a vector of DataType values, representing the respective
   // element types of each tuple component in the outputs of this
@@ -451,6 +456,9 @@ class DatasetBase : public core::RefCounted {
                                     Node** node) const {
     return errors::Unimplemented("AsGraphDefInternal");
   }
+
+  virtual std::unique_ptr<IteratorBase> MakeIteratorInternal(
+      const string& prefix) const = 0;
 };
 
 // Base-class for datasets that are built by ops.
diff --git a/tensorflow/core/kernels/data/batch_dataset_op.cc b/tensorflow/core/kernels/data/batch_dataset_op.cc
index 3618c75827..9c0a6b02e8 100644
--- a/tensorflow/core/kernels/data/batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/batch_dataset_op.cc
@@ -61,7 +61,7 @@ class BatchDatasetOp : public UnaryDatasetOpKernel {
 
     ~Dataset() override { input_->Unref(); }
 
-    std::unique_ptr<IteratorBase> MakeIterator(
+    std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
       return std::unique_ptr<IteratorBase>(new Iterator(
           Iterator::Params{this, strings::StrCat(prefix, "::Batch")}));
@@ -95,8 +95,11 @@ class BatchDatasetOp : public UnaryDatasetOpKernel {
     class Iterator : public DatasetIterator<Dataset> {
      public:
       explicit Iterator(const Params& params)
-          : DatasetIterator<Dataset>(params),
-            input_impl_(params.dataset->input_->MakeIterator(params.prefix)) {}
+          : DatasetIterator<Dataset>(params) {}
+
+      Status Initialize(IteratorContext* ctx) override {
+        return dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_);
+      }
 
       Status GetNextInternal(IteratorContext* ctx,
                              std::vector<Tensor>* out_tensors,
diff --git a/tensorflow/core/kernels/data/cache_dataset_ops.cc b/tensorflow/core/kernels/data/cache_dataset_ops.cc
index 4b4728dab6..5f7db9ed12 100644
--- a/tensorflow/core/kernels/data/cache_dataset_ops.cc
+++ b/tensorflow/core/kernels/data/cache_dataset_ops.cc
@@ -64,7 +64,7 @@ class CacheDatasetOp : public UnaryDatasetOpKernel {
 
     ~FileDataset() override { input_->Unref(); }
 
-    std::unique_ptr<IteratorBase> MakeIterator(
+    std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
       if (env_->FileExists(strings::StrCat(filename_, ".index")).ok()) {
         return std::unique_ptr<IteratorBase>(new FileReaderIterator(
@@ -106,12 +106,15 @@ class CacheDatasetOp : public UnaryDatasetOpKernel {
       explicit FileWriterIterator(const Params& params)
           : DatasetIterator<FileDataset>(params),
             cur_index_(0),
-            input_impl_(params.dataset->input_->MakeIterator(params.prefix)),
             writer_(params.dataset->env_, params.dataset->filename_),
             lockfile_(strings::StrCat(params.dataset->filename_, ".lockfile")),
             lockfile_created_(false),
             iteration_completed_(false) {}
 
+      Status Initialize(IteratorContext* ctx) override {
+        return dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_);
+      }
+
       Status GetNextInternal(IteratorContext* ctx,
                              std::vector<Tensor>* out_tensors,
                              bool* end_of_sequence) override {
@@ -268,7 +271,7 @@ class CacheDatasetOp : public UnaryDatasetOpKernel {
 
     ~MemoryDataset() override { input_->Unref(); }
 
-    std::unique_ptr<IteratorBase> MakeIterator(
+    std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
       mutex_lock l(mu_);
       if (cache_) {
@@ -305,7 +308,6 @@ class CacheDatasetOp : public UnaryDatasetOpKernel {
      public:
       explicit MemoryWriterIterator(const Params& params)
           : DatasetIterator<MemoryDataset>(params),
-            input_impl_(params.dataset->input_->MakeIterator(params.prefix)),
             cache_(new std::vector<std::vector<Tensor>>) {}
 
       ~MemoryWriterIterator() override {
@@ -323,6 +325,10 @@ class CacheDatasetOp : public UnaryDatasetOpKernel {
         }
       }
 
+      Status Initialize(IteratorContext* ctx) override {
+        return dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_);
+      }
+
       Status GetNextInternal(IteratorContext* ctx,
                              std::vector<Tensor>* out_tensors,
                              bool* end_of_sequence) override {
diff --git a/tensorflow/core/kernels/data/concatenate_dataset_op.cc b/tensorflow/core/kernels/data/concatenate_dataset_op.cc
index f11abc62a6..7c9dd1230a 100644
--- a/tensorflow/core/kernels/data/concatenate_dataset_op.cc
+++ b/tensorflow/core/kernels/data/concatenate_dataset_op.cc
@@ -61,7 +61,7 @@ class ConcatenateDatasetOp : public BinaryDatasetOpKernel {
       to_concatenate_->Unref();
     }
 
-    std::unique_ptr<IteratorBase> MakeIterator(
+    std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
       return std::unique_ptr<IteratorBase>(
           new Iterator({this, strings::StrCat(prefix, "::Concatenate")}));
@@ -94,10 +94,12 @@ class ConcatenateDatasetOp : public BinaryDatasetOpKernel {
     class Iterator : public DatasetIterator<Dataset> {
      public:
       explicit Iterator(const Params& params)
-          : DatasetIterator<Dataset>(params),
-            i_(0),
-            input_impl_(params.dataset->input_->MakeIterator(
-                strings::StrCat(params.prefix, "[0]"))) {}
+          : DatasetIterator<Dataset>(params), i_(0) {}
+
+      Status Initialize(IteratorContext* ctx) override {
+        return dataset()->input_->MakeIterator(
+            ctx, strings::StrCat(prefix(), "[0]"), &input_impl_);
+      }
 
       Status GetNextInternal(IteratorContext* ctx,
                              std::vector<Tensor>* out_tensors,
@@ -114,8 +116,8 @@ class ConcatenateDatasetOp : public BinaryDatasetOpKernel {
             return Status::OK();
           }
           if (++i_ < 2) {
-            input_impl_ = dataset()->to_concatenate_->MakeIterator(
-                strings::StrCat(prefix(), "[1]"));
+            TF_RETURN_IF_ERROR(dataset()->to_concatenate_->MakeIterator(
+                ctx, strings::StrCat(prefix(), "[1]"), &input_impl_));
           }
         }
         *end_of_sequence = true;
@@ -147,8 +149,8 @@ class ConcatenateDatasetOp : public BinaryDatasetOpKernel {
         if (!TF_PREDICT_TRUE(i_ >= 0 && i_ <= 2))
           return errors::InvalidArgument("i_ must be in range [0, 2].");
         if (i_ == 1) {
-          input_impl_ = dataset()->to_concatenate_->MakeIterator(
-              strings::StrCat(prefix(), "[1]"));
+          TF_RETURN_IF_ERROR(dataset()->to_concatenate_->MakeIterator(
+              ctx, strings::StrCat(prefix(), "[1]"), &input_impl_));
         } else if (i_ == 2) {
           input_impl_.reset();
         }
diff --git a/tensorflow/core/kernels/data/dataset_utils.cc b/tensorflow/core/kernels/data/dataset_utils.cc
index c608f9e1c6..d85ef1cbab 100644
--- a/tensorflow/core/kernels/data/dataset_utils.cc
+++ b/tensorflow/core/kernels/data/dataset_utils.cc
@@ -41,9 +41,8 @@ Status MakeIteratorFromInputElement(
       GetDatasetFromVariantTensor(return_values[0], &returned_dataset));
 
   // Create an iterator for the dataset that was returned by `f`.
-  *out_iterator = returned_dataset->MakeIterator(
-      strings::StrCat(prefix, "[", thread_index, "]"));
-  return Status::OK();
+  return returned_dataset->MakeIterator(
+      ctx, strings::StrCat(prefix, "[", thread_index, "]"), out_iterator);
 }
 
 }  // namespace dataset
diff --git a/tensorflow/core/kernels/data/dense_to_sparse_batch_dataset_op.cc b/tensorflow/core/kernels/data/dense_to_sparse_batch_dataset_op.cc
index 132808a5f1..28fa77ce06 100644
--- a/tensorflow/core/kernels/data/dense_to_sparse_batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/dense_to_sparse_batch_dataset_op.cc
@@ -94,7 +94,7 @@ class DenseToSparseBatchDatasetOp : public UnaryDatasetOpKernel {
 
     ~Dataset() override { input_->Unref(); }
 
-    std::unique_ptr<IteratorBase> MakeIterator(
+    std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
       return std::unique_ptr<IteratorBase>(new Iterator(
           {this, strings::StrCat(prefix, "::DenseToSparseBatch")}));
@@ -137,8 +137,12 @@ class DenseToSparseBatchDatasetOp : public UnaryDatasetOpKernel {
     class Iterator : public DatasetIterator<Dataset<T>> {
      public:
       explicit Iterator(const typename Iterator::Params& params)
-          : DatasetIterator<Dataset<T>>(params),
-            input_impl_(params.dataset->input_->MakeIterator(params.prefix)) {}
+          : DatasetIterator<Dataset<T>>(params) {}
+
+      Status Initialize(IteratorContext* ctx) override {
+        return DatasetIterator<Dataset<T>>::dataset()->input_->MakeIterator(
+            ctx, DatasetIterator<Dataset<T>>::prefix(), &input_impl_);
+      }
 
       Status GetNextInternal(IteratorContext* ctx,
                              std::vector<Tensor>* out_tensors,
diff --git a/tensorflow/core/kernels/data/filter_dataset_op.cc b/tensorflow/core/kernels/data/filter_dataset_op.cc
index 186b1e1c6c..5760e55e06 100644
--- a/tensorflow/core/kernels/data/filter_dataset_op.cc
+++ b/tensorflow/core/kernels/data/filter_dataset_op.cc
@@ -93,7 +93,7 @@ class FilterDatasetOp : public UnaryDatasetOpKernel {
 
     ~FilterDatasetBase() override { input_->Unref(); }
 
-    std::unique_ptr<IteratorBase> MakeIterator(
+    std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
       return std::unique_ptr<IteratorBase>(
           new Iterator({this, strings::StrCat(prefix, "::Filter")}));
@@ -145,8 +145,11 @@ class FilterDatasetOp : public UnaryDatasetOpKernel {
     class Iterator : public DatasetIterator<FilterDatasetBase> {
      public:
       explicit Iterator(const Params& params)
-          : DatasetIterator<FilterDatasetBase>(params),
-            input_impl_(params.dataset->input_->MakeIterator(params.prefix)) {}
+          : DatasetIterator<FilterDatasetBase>(params) {}
+
+      Status Initialize(IteratorContext* ctx) override {
+        return dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_);
+      }
 
       Status GetNextInternal(IteratorContext* ctx,
                              std::vector<Tensor>* out_tensors,
diff --git a/tensorflow/core/kernels/data/flat_map_dataset_op.cc b/tensorflow/core/kernels/data/flat_map_dataset_op.cc
index 77a48a2aa9..e2edda012a 100644
--- a/tensorflow/core/kernels/data/flat_map_dataset_op.cc
+++ b/tensorflow/core/kernels/data/flat_map_dataset_op.cc
@@ -74,7 +74,7 @@ class FlatMapDatasetOp : public UnaryDatasetOpKernel {
 
     ~Dataset() override { input_->Unref(); }
 
-    std::unique_ptr<IteratorBase> MakeIterator(
+    std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
       return std::unique_ptr<IteratorBase>(
           new Iterator({this, strings::StrCat(prefix, "::FlatMap")}));
@@ -125,8 +125,11 @@ class FlatMapDatasetOp : public UnaryDatasetOpKernel {
     class Iterator : public DatasetIterator<Dataset> {
      public:
       explicit Iterator(const Params& params)
-          : DatasetIterator<Dataset>(params),
-            input_impl_(params.dataset->input_->MakeIterator(params.prefix)) {}
+          : DatasetIterator<Dataset>(params) {}
+
+      Status Initialize(IteratorContext* ctx) override {
+        return dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_);
+      }
 
       Status GetNextInternal(IteratorContext* ctx,
                              std::vector<Tensor>* out_tensors,
@@ -202,7 +205,8 @@ class FlatMapDatasetOp : public UnaryDatasetOpKernel {
         current_element_iterator_.reset();
         captured_func_inputs_.clear();
         if (!reader->Contains(full_name("exhausted"))) {
-          input_impl_ = dataset()->input_->MakeIterator(prefix());
+          TF_RETURN_IF_ERROR(
+              dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_));
           TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
           {
             int64 temp;
diff --git a/tensorflow/core/kernels/data/generator_dataset_op.cc b/tensorflow/core/kernels/data/generator_dataset_op.cc
index 3f1e441b91..d298389f21 100644
--- a/tensorflow/core/kernels/data/generator_dataset_op.cc
+++ b/tensorflow/core/kernels/data/generator_dataset_op.cc
@@ -99,7 +99,7 @@ class GeneratorDatasetOp : public DatasetOpKernel {
           output_types_(output_types),
           output_shapes_(output_shapes) {}
 
-    std::unique_ptr<IteratorBase> MakeIterator(
+    std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
       return std::unique_ptr<IteratorBase>(
           new Iterator({this, strings::StrCat(prefix, "::Generator")}));
diff --git a/tensorflow/core/kernels/data/group_by_reducer_dataset_op.cc b/tensorflow/core/kernels/data/group_by_reducer_dataset_op.cc
index c8aeaab9cb..7bbadffc48 100644
--- a/tensorflow/core/kernels/data/group_by_reducer_dataset_op.cc
+++ b/tensorflow/core/kernels/data/group_by_reducer_dataset_op.cc
@@ -88,7 +88,7 @@ class GroupByReducerDatasetOp : public UnaryDatasetOpKernel {
 
     ~Dataset() override { input_->Unref(); }
 
-    std::unique_ptr<IteratorBase> MakeIterator(
+    std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
       return std::unique_ptr<IteratorBase>(
           new Iterator({this, strings::StrCat(prefix, "::GroupByReducer")}));
@@ -183,8 +183,11 @@ class GroupByReducerDatasetOp : public UnaryDatasetOpKernel {
     class Iterator : public DatasetIterator<Dataset> {
      public:
       explicit Iterator(const Params& params)
-          : DatasetIterator<Dataset>(params),
-            input_impl_(params.dataset->input_->MakeIterator(params.prefix)) {}
+          : DatasetIterator<Dataset>(params) {}
+
+      Status Initialize(IteratorContext* ctx) override {
+        return dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_);
+      }
 
       Status GetNextInternal(IteratorContext* ctx,
                              std::vector<Tensor>* out_tensors,
diff --git a/tensorflow/core/kernels/data/group_by_window_dataset_op.cc b/tensorflow/core/kernels/data/group_by_window_dataset_op.cc
index 03f847ce9c..f9cc5d26b0 100644
--- a/tensorflow/core/kernels/data/group_by_window_dataset_op.cc
+++ b/tensorflow/core/kernels/data/group_by_window_dataset_op.cc
@@ -118,7 +118,7 @@ class GroupByWindowDatasetOp : public UnaryDatasetOpKernel {
 
     ~Dataset() override { input_->Unref(); }
 
-    std::unique_ptr<IteratorBase> MakeIterator(
+    std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
       return std::unique_ptr<IteratorBase>(
           new Iterator({this, strings::StrCat(prefix, "::GroupByWindow")}));
@@ -198,8 +198,11 @@ class GroupByWindowDatasetOp : public UnaryDatasetOpKernel {
     class Iterator : public DatasetIterator<Dataset> {
      public:
       explicit Iterator(const Params& params)
-          : DatasetIterator<Dataset>(params),
-            input_impl_(params.dataset->input_->MakeIterator(params.prefix)) {}
+          : DatasetIterator<Dataset>(params) {}
+
+      Status Initialize(IteratorContext* ctx) override {
+        return dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_);
+      }
 
       Status GetNextInternal(IteratorContext* ctx,
                              std::vector<Tensor>* out_tensors,
@@ -484,8 +487,8 @@ class GroupByWindowDatasetOp : public UnaryDatasetOpKernel {
             GetDatasetFromVariantTensor(return_values[0], &returned_dataset));
 
         // Create an iterator for the dataset that was returned by `f`.
-        current_group_iterator_ = returned_dataset->MakeIterator(prefix());
-        return Status::OK();
+        return returned_dataset->MakeIterator(ctx, prefix(),
+                                              &current_group_iterator_);
       }
 
       mutex mu_;
diff --git a/tensorflow/core/kernels/data/interleave_dataset_op.cc b/tensorflow/core/kernels/data/interleave_dataset_op.cc
index bce3f28d62..723648b886 100644
--- a/tensorflow/core/kernels/data/interleave_dataset_op.cc
+++ b/tensorflow/core/kernels/data/interleave_dataset_op.cc
@@ -96,7 +96,7 @@ class InterleaveDatasetOp : public UnaryDatasetOpKernel {
 
     ~Dataset() override { input_->Unref(); }
 
-    std::unique_ptr<IteratorBase> MakeIterator(
+    std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
       return std::unique_ptr<IteratorBase>(
           new Iterator({this, strings::StrCat(prefix, "::Interleave")}));
@@ -149,10 +149,13 @@ class InterleaveDatasetOp : public UnaryDatasetOpKernel {
      public:
       explicit Iterator(const Params& params)
           : DatasetIterator<Dataset>(params),
-            input_impl_(params.dataset->input_->MakeIterator(params.prefix)),
             current_elements_(params.dataset->cycle_length_),
             args_list_(params.dataset->cycle_length_) {}
 
+      Status Initialize(IteratorContext* ctx) override {
+        return dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_);
+      }
+
       void AdvanceToNextInCycle() EXCLUSIVE_LOCKS_REQUIRED(mu_) {
         block_index_ = 0;
         cycle_index_ = (cycle_index_ + 1) % dataset()->cycle_length_;
@@ -294,7 +297,7 @@ class InterleaveDatasetOp : public UnaryDatasetOpKernel {
       }
 
       mutex mu_;
-      const std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
+      std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
       std::vector<std::unique_ptr<IteratorBase>> current_elements_
           GUARDED_BY(mu_);
       std::vector<std::vector<Tensor>> args_list_ GUARDED_BY(mu_);
diff --git a/tensorflow/core/kernels/data/iterator_ops.cc b/tensorflow/core/kernels/data/iterator_ops.cc
index 87bc8ebefe..9d9e74adba 100644
--- a/tensorflow/core/kernels/data/iterator_ops.cc
+++ b/tensorflow/core/kernels/data/iterator_ops.cc
@@ -158,7 +158,10 @@ class IteratorResource : public ResourceBase {
         graph_runner.Run(&graph, lib, {}, {output_node}, &outputs));
     TF_RETURN_IF_ERROR(GetDatasetFromVariantTensor(outputs[0], &dataset));
 
-    TF_RETURN_IF_ERROR(set_iterator(dataset->MakeIterator("Iterator")));
+    IteratorContext iter_ctx = dataset::MakeIteratorContext(ctx);
+    std::unique_ptr<IteratorBase> iterator;
+    TF_RETURN_IF_ERROR(dataset->MakeIterator(&iter_ctx, "Iterator", &iterator));
+    TF_RETURN_IF_ERROR(set_iterator(std::move(iterator)));
     std::shared_ptr<IteratorBase> captured_iterator(iterator_);
 
     if (captured_iterator) {
@@ -657,8 +660,12 @@ class MakeIteratorOp : public OpKernel {
     OP_REQUIRES_OK(
         ctx, LookupResource(ctx, HandleFromInput(ctx, 1), &iterator_resource));
     core::ScopedUnref unref(iterator_resource);
-    OP_REQUIRES_OK(ctx, iterator_resource->set_iterator(
-                            dataset->MakeIterator("Iterator")));
+
+    IteratorContext iter_ctx = dataset::MakeIteratorContext(ctx);
+    std::unique_ptr<IteratorBase> iterator;
+    OP_REQUIRES_OK(ctx,
+                   dataset->MakeIterator(&iter_ctx, "Iterator", &iterator));
+    OP_REQUIRES_OK(ctx, iterator_resource->set_iterator(std::move(iterator)));
   }
 };
 
@@ -680,9 +687,12 @@ class ToSingleElementOp : public AsyncOpKernel {
       DatasetBase* dataset;
       OP_REQUIRES_OK_ASYNC(
           ctx, GetDatasetFromVariantTensor(ctx->input(0), &dataset), done);
-      auto iterator = dataset->MakeIterator("SingleElementIterator");
-
       IteratorContext iter_ctx = dataset::MakeIteratorContext(ctx);
+      std::unique_ptr<IteratorBase> iterator;
+      OP_REQUIRES_OK_ASYNC(
+          ctx,
+          dataset->MakeIterator(&iter_ctx, "SingleElementIterator", &iterator),
+          done);
       std::vector<Tensor> components;
       components.reserve(dataset->output_dtypes().size());
       bool end_of_sequence;
@@ -866,8 +876,10 @@ class OneShotIteratorOp : public AsyncOpKernel {
     // factory function.
     DatasetBase* dataset;
     TF_RETURN_IF_ERROR(GetDatasetFromVariantTensor(return_values[0], &dataset));
-    TF_RETURN_IF_ERROR(
-        (*iterator)->set_iterator(dataset->MakeIterator("Iterator")));
+    IteratorContext iter_ctx = dataset::MakeIteratorContext(ctx);
+    std::unique_ptr<IteratorBase> iter;
+    TF_RETURN_IF_ERROR(dataset->MakeIterator(&iter_ctx, "Iterator", &iter));
+    TF_RETURN_IF_ERROR((*iterator)->set_iterator(std::move(iter)));
 
     (*iterator)->Ref();
     return Status::OK();
diff --git a/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc b/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc
index f41a810b07..f55a66524a 100644
--- a/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc
@@ -125,7 +125,7 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
 
     ~Dataset() override { input_->Unref(); }
 
-    std::unique_ptr<IteratorBase> MakeIterator(
+    std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
       return std::unique_ptr<IteratorBase>(
           new Iterator({this, strings::StrCat(prefix, "::MapAndBatch")}));
@@ -188,7 +188,6 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
      public:
       explicit Iterator(const Params& params)
           : DatasetIterator<Dataset>(params),
-            input_impl_(params.dataset->input_->MakeIterator(params.prefix)),
             batch_results_((params.dataset->num_parallel_calls_ +
                             params.dataset->batch_size_ - 1) /
                            params.dataset->batch_size_) {
@@ -208,6 +207,10 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
         }
       }
 
+      Status Initialize(IteratorContext* ctx) override {
+        return dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_);
+      }
+
       Status GetNextInternal(IteratorContext* ctx,
                              std::vector<Tensor>* out_tensors,
                              bool* end_of_sequence) override {
@@ -647,7 +650,7 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
       int64 num_calls_ GUARDED_BY(mu_) = 0;
       // Counts the total number of calls.
       int64 call_counter_ GUARDED_BY(mu_) = 0;
-      const std::unique_ptr<IteratorBase> input_impl_;
+      std::unique_ptr<IteratorBase> input_impl_;
       // Identifies the next batch to be read by the caller.
       int64 input_batch_ GUARDED_BY(mu_) = 0;
       // Identifies the next batch to create.
diff --git a/tensorflow/core/kernels/data/map_dataset_op.cc b/tensorflow/core/kernels/data/map_dataset_op.cc
index 89360d1cd9..40063c8ba9 100644
--- a/tensorflow/core/kernels/data/map_dataset_op.cc
+++ b/tensorflow/core/kernels/data/map_dataset_op.cc
@@ -73,7 +73,7 @@ class MapDatasetOp : public UnaryDatasetOpKernel {
 
     ~Dataset() override { input_->Unref(); }
 
-    std::unique_ptr<IteratorBase> MakeIterator(
+    std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
       return std::unique_ptr<IteratorBase>(
           new Iterator({this, strings::StrCat(prefix, "::Map")}));
@@ -123,8 +123,11 @@ class MapDatasetOp : public UnaryDatasetOpKernel {
     class Iterator : public DatasetIterator<Dataset> {
      public:
       explicit Iterator(const Params& params)
-          : DatasetIterator<Dataset>(params),
-            input_impl_(params.dataset->input_->MakeIterator(params.prefix)) {}
+          : DatasetIterator<Dataset>(params) {}
+
+      Status Initialize(IteratorContext* ctx) override {
+        return dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_);
+      }
 
       Status GetNextInternal(IteratorContext* ctx,
                              std::vector<Tensor>* out_tensors,
@@ -167,7 +170,7 @@ class MapDatasetOp : public UnaryDatasetOpKernel {
       }
 
      private:
-      const std::unique_ptr<IteratorBase> input_impl_;
+      std::unique_ptr<IteratorBase> input_impl_;
     };
 
     const DatasetBase* const input_;
diff --git a/tensorflow/core/kernels/data/padded_batch_dataset_op.cc b/tensorflow/core/kernels/data/padded_batch_dataset_op.cc
index e41800a806..f60b5472d6 100644
--- a/tensorflow/core/kernels/data/padded_batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/padded_batch_dataset_op.cc
@@ -119,7 +119,7 @@ class PaddedBatchDatasetOp : public UnaryDatasetOpKernel {
 
     ~Dataset() override { input_->Unref(); }
 
-    std::unique_ptr<IteratorBase> MakeIterator(
+    std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
       return std::unique_ptr<IteratorBase>(
           new Iterator({this, strings::StrCat(prefix, "::PaddedBatch")}));
@@ -186,8 +186,11 @@ class PaddedBatchDatasetOp : public UnaryDatasetOpKernel {
     class Iterator : public DatasetIterator<Dataset> {
      public:
       explicit Iterator(const Params& params)
-          : DatasetIterator<Dataset>(params),
-            input_impl_(params.dataset->input_->MakeIterator(params.prefix)) {}
+          : DatasetIterator<Dataset>(params) {}
+
+      Status Initialize(IteratorContext* ctx) override {
+        return dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_);
+      }
 
       Status GetNextInternal(IteratorContext* ctx,
                              std::vector<Tensor>* out_tensors,
@@ -325,7 +328,8 @@ class PaddedBatchDatasetOp : public UnaryDatasetOpKernel {
         if (reader->Contains(full_name("exhausted"))) {
           input_impl_.reset();
         } else {
-          input_impl_ = dataset()->input_->MakeIterator(prefix());
+          TF_RETURN_IF_ERROR(
+              dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_));
           TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
         }
         return Status::OK();
diff --git a/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc b/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc
index fa33867ec1..8da6b331a3 100644
--- a/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc
+++ b/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc
@@ -116,7 +116,7 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
 
     ~Dataset() override { input_->Unref(); }
 
-    std::unique_ptr<IteratorBase> MakeIterator(
+    std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
       return std::unique_ptr<IteratorBase>(new Iterator(
           {this, strings::StrCat(prefix, "::ParallelInterleave")}));
@@ -236,7 +236,6 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
      public:
       explicit Iterator(const Params& params)
           : DatasetIterator<Dataset>(params),
-            input_impl_(params.dataset->input_->MakeIterator(params.prefix)),
             workers_(dataset()->num_threads()),
             worker_thread_states_(dataset()->num_threads()) {}
 
@@ -249,6 +248,10 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
         }
       }
 
+      Status Initialize(IteratorContext* ctx) override {
+        return dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_);
+      }
+
       // It is implemented so that it matches the deterministic interleave
       // unless getting the next element would block and we are allowed to be
       // sloppy.
diff --git a/tensorflow/core/kernels/data/parallel_map_dataset_op.cc b/tensorflow/core/kernels/data/parallel_map_dataset_op.cc
index 7e373f2568..cf55067e2c 100644
--- a/tensorflow/core/kernels/data/parallel_map_dataset_op.cc
+++ b/tensorflow/core/kernels/data/parallel_map_dataset_op.cc
@@ -85,7 +85,7 @@ class ParallelMapDatasetOp : public UnaryDatasetOpKernel {
 
     ~Dataset() override { input_->Unref(); }
 
-    std::unique_ptr<IteratorBase> MakeIterator(
+    std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
       return std::unique_ptr<IteratorBase>(
           new Iterator({this, strings::StrCat(prefix, "::ParallelMap")}));
@@ -150,7 +150,6 @@ class ParallelMapDatasetOp : public UnaryDatasetOpKernel {
      public:
       explicit Iterator(const Params& params)
           : DatasetIterator<Dataset>(params),
-            input_impl_(params.dataset->input_->MakeIterator(params.prefix)),
             invocation_results_(params.dataset->num_parallel_calls_) {}
 
       ~Iterator() override {
@@ -169,6 +168,10 @@ class ParallelMapDatasetOp : public UnaryDatasetOpKernel {
         }
       }
 
+      Status Initialize(IteratorContext* ctx) override {
+        return dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_);
+      }
+
       Status GetNextInternal(IteratorContext* ctx,
                              std::vector<Tensor>* out_tensors,
                              bool* end_of_sequence) override {
diff --git a/tensorflow/core/kernels/data/prefetch_dataset_op.cc b/tensorflow/core/kernels/data/prefetch_dataset_op.cc
index 536de81fd8..140983805a 100644
--- a/tensorflow/core/kernels/data/prefetch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/prefetch_dataset_op.cc
@@ -55,7 +55,7 @@ class PrefetchDatasetOp : public UnaryDatasetOpKernel {
 
     ~Dataset() override { input_->Unref(); }
 
-    std::unique_ptr<IteratorBase> MakeIterator(
+    std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
       return std::unique_ptr<IteratorBase>(
           new Iterator({this, strings::StrCat(prefix, "::Prefetch")}));
@@ -87,7 +87,6 @@ class PrefetchDatasetOp : public UnaryDatasetOpKernel {
      public:
       explicit Iterator(const Params& params)
           : DatasetIterator<Dataset>(params),
-            input_impl_(params.dataset->input_->MakeIterator(params.prefix)),
             auto_tuner_(params.dataset->buffer_size_) {}
 
       ~Iterator() override {
@@ -106,6 +105,10 @@ class PrefetchDatasetOp : public UnaryDatasetOpKernel {
         }
       }
 
+      Status Initialize(IteratorContext* ctx) override {
+        return dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_);
+      }
+
       Status GetNextInternal(IteratorContext* ctx,
                              std::vector<Tensor>* out_tensors,
                              bool* end_of_sequence) override {
@@ -327,7 +330,7 @@ class PrefetchDatasetOp : public UnaryDatasetOpKernel {
       // accessing the parent iterator. We keep this separate from `mu_` to
       // allow prefetching to run in parallel with GetNext calls.
       mutex parent_mu_ ACQUIRED_BEFORE(mu_);
-      const std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(parent_mu_);
+      std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(parent_mu_);
       condition_variable cond_var_;
       PrefetchAutotuner auto_tuner_ GUARDED_BY(mu_);
       std::deque<BufferElement> buffer_ GUARDED_BY(mu_);
diff --git a/tensorflow/core/kernels/data/random_dataset_op.cc b/tensorflow/core/kernels/data/random_dataset_op.cc
index 210b9ad1b8..40bd95e4e7 100644
--- a/tensorflow/core/kernels/data/random_dataset_op.cc
+++ b/tensorflow/core/kernels/data/random_dataset_op.cc
@@ -54,7 +54,7 @@ class RandomDatasetOp : public DatasetOpKernel {
     Dataset(OpKernelContext* ctx, int64 seed, int64 seed2)
         : GraphDatasetBase(ctx), seed_(seed), seed2_(seed2) {}
 
-    std::unique_ptr<IteratorBase> MakeIterator(
+    std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
       return std::unique_ptr<IteratorBase>(
           new Iterator({this, strings::StrCat(prefix, "::Random")}));
diff --git a/tensorflow/core/kernels/data/range_dataset_op.cc b/tensorflow/core/kernels/data/range_dataset_op.cc
index b57518e678..b18263b613 100644
--- a/tensorflow/core/kernels/data/range_dataset_op.cc
+++ b/tensorflow/core/kernels/data/range_dataset_op.cc
@@ -48,7 +48,7 @@ class RangeDatasetOp : public DatasetOpKernel {
     Dataset(OpKernelContext* ctx, int64 start, int64 stop, int64 step)
         : GraphDatasetBase(ctx), start_(start), stop_(stop), step_(step) {}
 
-    std::unique_ptr<IteratorBase> MakeIterator(
+    std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
       return std::unique_ptr<IteratorBase>(
           new Iterator({this, strings::StrCat(prefix, "::Range")}));
diff --git a/tensorflow/core/kernels/data/reader_dataset_ops.cc b/tensorflow/core/kernels/data/reader_dataset_ops.cc
index 34d7d9f914..28d38d49eb 100644
--- a/tensorflow/core/kernels/data/reader_dataset_ops.cc
+++ b/tensorflow/core/kernels/data/reader_dataset_ops.cc
@@ -89,7 +89,7 @@ class TextLineDatasetOp : public DatasetOpKernel {
           use_compression_(!compression_type.empty()),
           options_(options) {}
 
-    std::unique_ptr<IteratorBase> MakeIterator(
+    std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
       return std::unique_ptr<IteratorBase>(
           new Iterator({this, strings::StrCat(prefix, "::TextLine")}));
@@ -323,7 +323,7 @@ class FixedLengthRecordDatasetOp : public DatasetOpKernel {
           footer_bytes_(footer_bytes),
           buffer_size_(buffer_size) {}
 
-    std::unique_ptr<IteratorBase> MakeIterator(
+    std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
       return std::unique_ptr<IteratorBase>(
           new Iterator({this, strings::StrCat(prefix, "::FixedLengthRecord")}));
@@ -543,7 +543,7 @@ class TFRecordDatasetOp : public DatasetOpKernel {
       }
     }
 
-    std::unique_ptr<IteratorBase> MakeIterator(
+    std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
       return std::unique_ptr<IteratorBase>(
           new Iterator({this, strings::StrCat(prefix, "::TFRecord")}));
diff --git a/tensorflow/core/kernels/data/repeat_dataset_op.cc b/tensorflow/core/kernels/data/repeat_dataset_op.cc
index d37086541d..fcd9820785 100644
--- a/tensorflow/core/kernels/data/repeat_dataset_op.cc
+++ b/tensorflow/core/kernels/data/repeat_dataset_op.cc
@@ -48,7 +48,7 @@ class RepeatDatasetOp : public UnaryDatasetOpKernel {
 
     ~Dataset() override { input_->Unref(); }
 
-    std::unique_ptr<IteratorBase> MakeIterator(
+    std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
       if (count_ < 0) {
         return std::unique_ptr<IteratorBase>(new ForeverIterator(
@@ -108,9 +108,11 @@ class RepeatDatasetOp : public UnaryDatasetOpKernel {
     class FiniteIterator : public DatasetIterator<Dataset> {
      public:
       explicit FiniteIterator(const Params& params)
-          : DatasetIterator<Dataset>(params),
-            i_(0),
-            input_impl_(params.dataset->input_->MakeIterator(params.prefix)) {}
+          : DatasetIterator<Dataset>(params), i_(0) {}
+
+      Status Initialize(IteratorContext* ctx) override {
+        return dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_);
+      }
 
       Status GetNextInternal(IteratorContext* ctx,
                              std::vector<Tensor>* out_tensors,
@@ -127,7 +129,8 @@ class RepeatDatasetOp : public UnaryDatasetOpKernel {
             return Status::OK();
           }
           ++i_;
-          input_impl_ = dataset()->input_->MakeIterator(prefix());
+          TF_RETURN_IF_ERROR(
+              dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_));
         }
         *end_of_sequence = true;
         input_impl_.reset();
@@ -178,7 +181,8 @@ class RepeatDatasetOp : public UnaryDatasetOpKernel {
           bool first_call = false;
           if (!input_impl_) {
             first_call = true;
-            input_impl_ = dataset()->input_->MakeIterator(prefix());
+            TF_RETURN_IF_ERROR(
+                dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_));
           }
           TF_RETURN_IF_ERROR(
               input_impl_->GetNext(ctx, out_tensors, end_of_sequence));
@@ -214,7 +218,8 @@ class RepeatDatasetOp : public UnaryDatasetOpKernel {
         if (reader->Contains(full_name("uninitialized"))) {
           input_impl_.reset();
         } else {
-          input_impl_ = dataset()->input_->MakeIterator(prefix());
+          TF_RETURN_IF_ERROR(
+              dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_));
           TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
         }
         return Status::OK();
diff --git a/tensorflow/core/kernels/data/scan_dataset_op.cc b/tensorflow/core/kernels/data/scan_dataset_op.cc
index 5dd6ff848e..972ed8fb00 100644
--- a/tensorflow/core/kernels/data/scan_dataset_op.cc
+++ b/tensorflow/core/kernels/data/scan_dataset_op.cc
@@ -90,7 +90,7 @@ class ScanDatasetOp : public UnaryDatasetOpKernel {
 
     ~Dataset() override { input_->Unref(); }
 
-    std::unique_ptr<IteratorBase> MakeIterator(
+    std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
       return std::unique_ptr<IteratorBase>(
           new Iterator({this, strings::StrCat(prefix, "::Scan")}));
@@ -149,9 +149,12 @@ class ScanDatasetOp : public UnaryDatasetOpKernel {
      public:
       explicit Iterator(const Params& params)
           : DatasetIterator<Dataset>(params),
-            input_impl_(params.dataset->input_->MakeIterator(params.prefix)),
             state_(params.dataset->initial_state_) {}
 
+      Status Initialize(IteratorContext* ctx) override {
+        return dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_);
+      }
+
       Status GetNextInternal(IteratorContext* ctx,
                              std::vector<Tensor>* out_tensors,
                              bool* end_of_sequence) override {
@@ -250,7 +253,7 @@ class ScanDatasetOp : public UnaryDatasetOpKernel {
 
      private:
       mutex mu_;
-      const std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
+      std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
       std::vector<Tensor> state_ GUARDED_BY(mu_);
     };
 
diff --git a/tensorflow/core/kernels/data/shuffle_dataset_op.cc b/tensorflow/core/kernels/data/shuffle_dataset_op.cc
index 2f6bf83da5..dad58efe73 100644
--- a/tensorflow/core/kernels/data/shuffle_dataset_op.cc
+++ b/tensorflow/core/kernels/data/shuffle_dataset_op.cc
@@ -85,7 +85,8 @@ class ShuffleDatasetOpBase : public UnaryDatasetOpKernel {
         bool first_call = false;
         if (!input_impl_ && epoch_ == 0) {
           first_call = true;
-          input_impl_ = dataset()->input_->MakeIterator(prefix());
+          TF_RETURN_IF_ERROR(
+              dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_));
         }
         while (input_impl_ && num_elements_ < dataset()->buffer_size_) {
           if (ctx->env()->NowMicros() >
@@ -114,7 +115,8 @@ class ShuffleDatasetOpBase : public UnaryDatasetOpKernel {
             epoch_++;
             int64 n = slices_.back()->end;
             slices_.emplace_back(new Slice{n, n});
-            input_impl_ = dataset()->input_->MakeIterator(prefix());
+            TF_RETURN_IF_ERROR(
+                dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_));
           }
           if (!end_of_input_sequence) {
             buffer_[slices_.back()->end % dataset()->buffer_size_] =
@@ -211,7 +213,8 @@ class ShuffleDatasetOpBase : public UnaryDatasetOpKernel {
 
         // Restore the input iterator if it wasn't already exhausted.
         if (!reader->Contains(full_name("end_of_input_sequence"))) {
-          input_impl_ = dataset()->input_->MakeIterator(prefix());
+          TF_RETURN_IF_ERROR(
+              dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_));
           TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
         } else {
           input_impl_.reset();
@@ -361,7 +364,7 @@ class ShuffleDatasetOp : public ShuffleDatasetOpBase {
                              ", ", seed2_, ")::ReshufflingDataset");
     }
 
-    std::unique_ptr<IteratorBase> MakeIterator(
+    std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
       int64 iterator_seed;
       int64 iterator_seed2;
@@ -399,7 +402,7 @@ class ShuffleDatasetOp : public ShuffleDatasetOpBase {
                              ", ", seed2_, ")::FixedSeedDataset");
     }
 
-    std::unique_ptr<IteratorBase> MakeIterator(
+    std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
       return std::unique_ptr<IteratorBase>(new ShuffleDatasetBase::Iterator(
           {this, strings::StrCat(prefix, "::Shuffle")}, seed_, seed2_));
@@ -482,7 +485,7 @@ class ShuffleAndRepeatDatasetOp : public ShuffleDatasetOpBase {
                              seed_, ", ", seed2_, ", ", count_, ")::Dataset");
     }
 
-    std::unique_ptr<IteratorBase> MakeIterator(
+    std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
       return std::unique_ptr<IteratorBase>(new ShuffleDatasetBase::Iterator(
           {this, strings::StrCat(prefix, "::ShuffleAndRepeat")}, seed_,
diff --git a/tensorflow/core/kernels/data/skip_dataset_op.cc b/tensorflow/core/kernels/data/skip_dataset_op.cc
index d636c37afe..0177839707 100644
--- a/tensorflow/core/kernels/data/skip_dataset_op.cc
+++ b/tensorflow/core/kernels/data/skip_dataset_op.cc
@@ -47,14 +47,11 @@ class SkipDatasetOp : public UnaryDatasetOpKernel {
 
     ~Dataset() override { input_->Unref(); }
 
-    std::unique_ptr<IteratorBase> MakeIterator(
+    std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
       if (count_ < 0) {
         return std::unique_ptr<IteratorBase>(
             new EmptyIterator({this, strings::StrCat(prefix, "::EmptySkip")}));
-      } else if (count_ == 0) {
-        // Pass through.
-        return input_->MakeIterator(prefix);
       } else {
         return std::unique_ptr<IteratorBase>(new FiniteIterator(
             {this, strings::StrCat(prefix, "::FiniteSkip")}));
@@ -108,9 +105,11 @@ class SkipDatasetOp : public UnaryDatasetOpKernel {
     class FiniteIterator : public DatasetIterator<Dataset> {
      public:
       explicit FiniteIterator(const Params& params)
-          : DatasetIterator<Dataset>(params),
-            i_(0),
-            input_impl_(params.dataset->input_->MakeIterator(params.prefix)) {}
+          : DatasetIterator<Dataset>(params), i_(0) {}
+
+      Status Initialize(IteratorContext* ctx) override {
+        return dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_);
+      }
 
       Status GetNextInternal(IteratorContext* ctx,
                              std::vector<Tensor>* out_tensors,
diff --git a/tensorflow/core/kernels/data/slide_dataset_op.cc b/tensorflow/core/kernels/data/slide_dataset_op.cc
index 78c8363f91..e4b2820445 100644
--- a/tensorflow/core/kernels/data/slide_dataset_op.cc
+++ b/tensorflow/core/kernels/data/slide_dataset_op.cc
@@ -33,10 +33,9 @@ class SlideDatasetOp : public UnaryDatasetOpKernel {
                    DatasetBase** output) override {
     int64 window_size = 0;
     int64 stride = 1;
-    OP_REQUIRES_OK(ctx,
-                   ParseScalarArgument<int64>(ctx, "window_size", &window_size));
-    OP_REQUIRES_OK(ctx,
-                   ParseScalarArgument<int64>(ctx, "stride", &stride));
+    OP_REQUIRES_OK(
+        ctx, ParseScalarArgument<int64>(ctx, "window_size", &window_size));
+    OP_REQUIRES_OK(ctx, ParseScalarArgument<int64>(ctx, "stride", &stride));
     OP_REQUIRES(
         ctx, window_size > 0,
         errors::InvalidArgument("Window size must be greater than zero."));
@@ -50,8 +49,12 @@ class SlideDatasetOp : public UnaryDatasetOpKernel {
  private:
   class Dataset : public GraphDatasetBase {
    public:
-    Dataset(OpKernelContext* ctx, int64 window_size, int64 stride, const DatasetBase* input)
-        : GraphDatasetBase(ctx), window_size_(window_size), stride_(stride), input_(input) {
+    Dataset(OpKernelContext* ctx, int64 window_size, int64 stride,
+            const DatasetBase* input)
+        : GraphDatasetBase(ctx),
+          window_size_(window_size),
+          stride_(stride),
+          input_(input) {
       input_->Ref();
 
       const auto& input_shapes = input_->output_shapes();
@@ -64,7 +67,7 @@ class SlideDatasetOp : public UnaryDatasetOpKernel {
 
     ~Dataset() override { input_->Unref(); }
 
-    std::unique_ptr<IteratorBase> MakeIterator(
+    std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
       return std::unique_ptr<IteratorBase>(new Iterator(
           Iterator::Params{this, strings::StrCat(prefix, "::Slide")}));
@@ -79,7 +82,8 @@ class SlideDatasetOp : public UnaryDatasetOpKernel {
     }
 
     string DebugString() override {
-      return strings::StrCat("SlideDatasetOp(", window_size_, ", ", stride_, ")::Dataset");
+      return strings::StrCat("SlideDatasetOp(", window_size_, ", ", stride_,
+                             ")::Dataset");
     }
 
    protected:
@@ -101,8 +105,11 @@ class SlideDatasetOp : public UnaryDatasetOpKernel {
     class Iterator : public DatasetIterator<Dataset> {
      public:
       explicit Iterator(const Params& params)
-          : DatasetIterator<Dataset>(params),
-            input_impl_(params.dataset->input_->MakeIterator(params.prefix)) {}
+          : DatasetIterator<Dataset>(params) {}
+
+      Status Initialize(IteratorContext* ctx) override {
+        return dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_);
+      }
 
       Status GetNextInternal(IteratorContext* ctx,
                              std::vector<Tensor>* out_tensors,
diff --git a/tensorflow/core/kernels/data/sparse_tensor_slice_dataset_op.cc b/tensorflow/core/kernels/data/sparse_tensor_slice_dataset_op.cc
index fcf17ad68b..4cc638b4cf 100644
--- a/tensorflow/core/kernels/data/sparse_tensor_slice_dataset_op.cc
+++ b/tensorflow/core/kernels/data/sparse_tensor_slice_dataset_op.cc
@@ -39,7 +39,7 @@ class Dataset : public GraphDatasetBase {
                  {-1},
                  {sparse_tensor.dims() - 1}}) {}
 
-  std::unique_ptr<IteratorBase> MakeIterator(
+  std::unique_ptr<IteratorBase> MakeIteratorInternal(
       const string& prefix) const override {
     return std::unique_ptr<IteratorBase>(
         new Iterator({this, strings::StrCat(prefix, "::SparseTensorSlice")}));
diff --git a/tensorflow/core/kernels/data/sql_dataset_ops.cc b/tensorflow/core/kernels/data/sql_dataset_ops.cc
index 634b3c280f..4742ed30cf 100644
--- a/tensorflow/core/kernels/data/sql_dataset_ops.cc
+++ b/tensorflow/core/kernels/data/sql_dataset_ops.cc
@@ -88,7 +88,7 @@ class SqlDatasetOp : public DatasetOpKernel {
           output_types_(output_types),
           output_shapes_(output_shapes) {}
 
-    std::unique_ptr<IteratorBase> MakeIterator(
+    std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
       return std::unique_ptr<IteratorBase>(
           new Iterator({this, strings::StrCat(prefix, "::Sql")}));
diff --git a/tensorflow/core/kernels/data/stats_aggregator_dataset_op.cc b/tensorflow/core/kernels/data/stats_aggregator_dataset_op.cc
index eb96b8a872..fd490c7c17 100644
--- a/tensorflow/core/kernels/data/stats_aggregator_dataset_op.cc
+++ b/tensorflow/core/kernels/data/stats_aggregator_dataset_op.cc
@@ -53,7 +53,7 @@ class SetStatsAggregatorDatasetOp : public UnaryDatasetOpKernel {
       stats_aggregator_resource_->Unref();
     }
 
-    std::unique_ptr<IteratorBase> MakeIterator(
+    std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
       return std::unique_ptr<IteratorBase>(new Iterator(
           {this, strings::StrCat(prefix, "::SetStatsAggregator")}));
@@ -82,8 +82,11 @@ class SetStatsAggregatorDatasetOp : public UnaryDatasetOpKernel {
     class Iterator : public DatasetIterator<Dataset> {
      public:
       explicit Iterator(const Params& params)
-          : DatasetIterator<Dataset>(params),
-            input_impl_(params.dataset->input_->MakeIterator(params.prefix)) {}
+          : DatasetIterator<Dataset>(params) {}
+
+      Status Initialize(IteratorContext* ctx) override {
+        return dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_);
+      }
 
       Status GetNextInternal(IteratorContext* ctx,
                              std::vector<Tensor>* out_tensors,
diff --git a/tensorflow/core/kernels/data/stats_dataset_ops.cc b/tensorflow/core/kernels/data/stats_dataset_ops.cc
index 633cd85451..8dc76185bc 100644
--- a/tensorflow/core/kernels/data/stats_dataset_ops.cc
+++ b/tensorflow/core/kernels/data/stats_dataset_ops.cc
@@ -56,7 +56,7 @@ class LatencyStatsDatasetOp : public UnaryDatasetOpKernel {
 
     ~Dataset() override { input_->Unref(); }
 
-    std::unique_ptr<IteratorBase> MakeIterator(
+    std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
       return std::unique_ptr<IteratorBase>(
           new Iterator({this, strings::StrCat(prefix, "::LatencyStats")}));
@@ -86,8 +86,11 @@ class LatencyStatsDatasetOp : public UnaryDatasetOpKernel {
     class Iterator : public DatasetIterator<Dataset> {
      public:
       explicit Iterator(const Params& params)
-          : DatasetIterator<Dataset>(params),
-            input_impl_(params.dataset->input_->MakeIterator(params.prefix)) {}
+          : DatasetIterator<Dataset>(params) {}
+
+      Status Initialize(IteratorContext* ctx) override {
+        return dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_);
+      }
 
       Status GetNextInternal(IteratorContext* ctx,
                              std::vector<Tensor>* out_tensors,
@@ -150,7 +153,7 @@ class BytesProducedStatsDatasetOp : public UnaryDatasetOpKernel {
 
     ~Dataset() override { input_->Unref(); }
 
-    std::unique_ptr<IteratorBase> MakeIterator(
+    std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
       return std::unique_ptr<IteratorBase>(new Iterator(
           {this, strings::StrCat(prefix, "::BytesProducedStats")}));
@@ -182,8 +185,11 @@ class BytesProducedStatsDatasetOp : public UnaryDatasetOpKernel {
     class Iterator : public DatasetIterator<Dataset> {
      public:
       explicit Iterator(const Params& params)
-          : DatasetIterator<Dataset>(params),
-            input_impl_(params.dataset->input_->MakeIterator(params.prefix)) {}
+          : DatasetIterator<Dataset>(params) {}
+
+      Status Initialize(IteratorContext* ctx) override {
+        return dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_);
+      }
 
       Status GetNextInternal(IteratorContext* ctx,
                              std::vector<Tensor>* out_tensors,
diff --git a/tensorflow/core/kernels/data/take_dataset_op.cc b/tensorflow/core/kernels/data/take_dataset_op.cc
index 3bea46a747..209207d742 100644
--- a/tensorflow/core/kernels/data/take_dataset_op.cc
+++ b/tensorflow/core/kernels/data/take_dataset_op.cc
@@ -47,12 +47,9 @@ class TakeDatasetOp : public UnaryDatasetOpKernel {
 
     ~Dataset() override { input_->Unref(); }
 
-    std::unique_ptr<IteratorBase> MakeIterator(
+    std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      if (count_ < 0) {
-        // Pass through
-        return input_->MakeIterator(prefix);
-      } else if (count_ == 0) {
+      if (count_ == 0) {
         return std::unique_ptr<IteratorBase>(
             new EmptyIterator({this, strings::StrCat(prefix, "::EmptyTake")}));
       } else {
@@ -109,9 +106,11 @@ class TakeDatasetOp : public UnaryDatasetOpKernel {
     class FiniteIterator : public DatasetIterator<Dataset> {
      public:
       explicit FiniteIterator(const Params& params)
-          : DatasetIterator<Dataset>(params),
-            i_(0),
-            input_impl_(params.dataset->input_->MakeIterator(params.prefix)) {}
+          : DatasetIterator<Dataset>(params), i_(0) {}
+
+      Status Initialize(IteratorContext* ctx) override {
+        return dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_);
+      }
 
       Status GetNextInternal(IteratorContext* ctx,
                              std::vector<Tensor>* out_tensors,
@@ -121,7 +120,7 @@ class TakeDatasetOp : public UnaryDatasetOpKernel {
           *end_of_sequence = true;
           return Status::OK();
         }
-        while (i_ < dataset()->count_) {
+        while (dataset()->count_ < 0 || i_ < dataset()->count_) {
           TF_RETURN_IF_ERROR(
               input_impl_->GetNext(ctx, out_tensors, end_of_sequence));
           if (!*end_of_sequence) {
diff --git a/tensorflow/core/kernels/data/tensor_dataset_op.cc b/tensorflow/core/kernels/data/tensor_dataset_op.cc
index 8c8994b1c3..8f4586b5b6 100644
--- a/tensorflow/core/kernels/data/tensor_dataset_op.cc
+++ b/tensorflow/core/kernels/data/tensor_dataset_op.cc
@@ -53,7 +53,7 @@ class TensorDatasetOp : public DatasetOpKernel {
       }
     }
 
-    std::unique_ptr<IteratorBase> MakeIterator(
+    std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
       return std::unique_ptr<IteratorBase>(
           new Iterator({this, strings::StrCat(prefix, "::FromTensor")}));
diff --git a/tensorflow/core/kernels/data/tensor_queue_dataset_op.cc b/tensorflow/core/kernels/data/tensor_queue_dataset_op.cc
index e271a42b2a..e9f486d867 100644
--- a/tensorflow/core/kernels/data/tensor_queue_dataset_op.cc
+++ b/tensorflow/core/kernels/data/tensor_queue_dataset_op.cc
@@ -81,7 +81,7 @@ class PrependFromQueueAndPaddedBatchDataset : public GraphDatasetBase {
 
   ~PrependFromQueueAndPaddedBatchDataset() override { input_->Unref(); }
 
-  std::unique_ptr<IteratorBase> MakeIterator(
+  std::unique_ptr<IteratorBase> MakeIteratorInternal(
       const string& prefix) const override {
     return std::unique_ptr<IteratorBase>(new Iterator(
         {this, strings::StrCat(prefix, "::PrependFromQueueAndPaddedBatch")}));
@@ -152,15 +152,19 @@ class PrependFromQueueAndPaddedBatchDataset : public GraphDatasetBase {
       : public DatasetIterator<PrependFromQueueAndPaddedBatchDataset> {
    public:
     explicit Iterator(const Params& params)
-        : DatasetIterator<PrependFromQueueAndPaddedBatchDataset>(params),
-          queue_(new TensorQueue(/*input_impl*/
-                                 params.dataset->input_->MakeIterator(
-                                     params.prefix),
-                                 params.dataset->dtypes_,
-                                 params.dataset->shapes_)) {}
+        : DatasetIterator<PrependFromQueueAndPaddedBatchDataset>(params) {}
 
     ~Iterator() override { queue_->Unref(); }
 
+    Status Initialize(IteratorContext* ctx) override {
+      std::unique_ptr<IteratorBase> iterator;
+      TF_RETURN_IF_ERROR(
+          dataset()->input_->MakeIterator(ctx, prefix(), &iterator));
+      queue_ = new TensorQueue(std::move(iterator), dataset()->dtypes_,
+                               dataset()->shapes_);
+      return Status::OK();
+    }
+
     Status GetNextInternal(IteratorContext* ctx,
                            std::vector<Tensor>* out_tensors,
                            bool* end_of_sequence) override {
@@ -372,7 +376,8 @@ class PrependFromQueueAndPaddedBatchDataset : public GraphDatasetBase {
         if (reader->Contains(iter->full_name("input_exhausted"))) {
           input_impl_.reset();
         } else {
-          input_impl_ = iter->dataset_input()->MakeIterator(iter->prefix());
+          TF_RETURN_IF_ERROR(iter->dataset_input()->MakeIterator(
+              ctx, iter->prefix(), &input_impl_));
           TF_RETURN_IF_ERROR(iter->RestoreParent(ctx, reader, input_impl_));
         }
         entries_.clear();
@@ -469,7 +474,7 @@ class PrependFromQueueAndPaddedBatchDataset : public GraphDatasetBase {
     };
 
    private:
-    TensorQueue* const queue_;
+    TensorQueue* queue_;
   };
 
  private:
diff --git a/tensorflow/core/kernels/data/tensor_slice_dataset_op.cc b/tensorflow/core/kernels/data/tensor_slice_dataset_op.cc
index 95708cc01c..fd8780391c 100644
--- a/tensorflow/core/kernels/data/tensor_slice_dataset_op.cc
+++ b/tensorflow/core/kernels/data/tensor_slice_dataset_op.cc
@@ -70,7 +70,7 @@ class TensorSliceDatasetOp : public DatasetOpKernel {
       }
     }
 
-    std::unique_ptr<IteratorBase> MakeIterator(
+    std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
       return std::unique_ptr<IteratorBase>(
           new Iterator({this, strings::StrCat(prefix, "::TensorSlice")}));
diff --git a/tensorflow/core/kernels/data/unbatch_dataset_op.cc b/tensorflow/core/kernels/data/unbatch_dataset_op.cc
index 2b383e5097..28f2350d6b 100644
--- a/tensorflow/core/kernels/data/unbatch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/unbatch_dataset_op.cc
@@ -49,7 +49,7 @@ class UnbatchDatasetOp : public UnaryDatasetOpKernel {
       }
     }
 
-    std::unique_ptr<IteratorBase> MakeIterator(
+    std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
       return std::unique_ptr<IteratorBase>(
           new Iterator({this, strings::StrCat(prefix, "::Unbatch")}));
@@ -80,9 +80,12 @@ class UnbatchDatasetOp : public UnaryDatasetOpKernel {
           : DatasetIterator<Dataset>(params),
             current_index_(0),
             current_batch_size_(0),
-            input_impl_(params.dataset->input_->MakeIterator(params.prefix)),
             shapes_(params.dataset->output_shapes().size()) {}
 
+      Status Initialize(IteratorContext* ctx) override {
+        return dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_);
+      }
+
       Status GetNextInternal(IteratorContext* ctx,
                              std::vector<Tensor>* out_tensors,
                              bool* end_of_sequence) override {
diff --git a/tensorflow/core/kernels/data/window_dataset.cc b/tensorflow/core/kernels/data/window_dataset.cc
index e24bdea4ac..e7470f880f 100644
--- a/tensorflow/core/kernels/data/window_dataset.cc
+++ b/tensorflow/core/kernels/data/window_dataset.cc
@@ -26,7 +26,7 @@ class WindowDataset : public DatasetBase {
         output_types_(std::move(output_types)),
         output_shapes_(std::move(output_shapes)) {}
 
-  std::unique_ptr<IteratorBase> MakeIterator(
+  std::unique_ptr<IteratorBase> MakeIteratorInternal(
       const string& prefix) const override {
     return std::unique_ptr<IteratorBase>(
         new Iterator({this, strings::StrCat(prefix, "::Window")}));
diff --git a/tensorflow/core/kernels/data/writer_ops.cc b/tensorflow/core/kernels/data/writer_ops.cc
index 656fee1e85..80d9a5b867 100644
--- a/tensorflow/core/kernels/data/writer_ops.cc
+++ b/tensorflow/core/kernels/data/writer_ops.cc
@@ -70,9 +70,13 @@ class ToTFRecordOp : public AsyncOpKernel {
       DatasetBase* dataset;
       OP_REQUIRES_OK_ASYNC(
           ctx, GetDatasetFromVariantTensor(ctx->input(0), &dataset), done);
-      auto iterator = dataset->MakeIterator("ToTFRecordOpIterator");
-
       IteratorContext iter_ctx = dataset::MakeIteratorContext(ctx);
+      std::unique_ptr<IteratorBase> iterator;
+      OP_REQUIRES_OK_ASYNC(
+          ctx,
+          dataset->MakeIterator(&iter_ctx, "ToTFRecordOpIterator", &iterator),
+          done);
+
       std::vector<Tensor> components;
       components.reserve(dataset->output_dtypes().size());
       bool end_of_sequence;
diff --git a/tensorflow/core/kernels/data/zip_dataset_op.cc b/tensorflow/core/kernels/data/zip_dataset_op.cc
index 0f79eac947..d5343cdf22 100644
--- a/tensorflow/core/kernels/data/zip_dataset_op.cc
+++ b/tensorflow/core/kernels/data/zip_dataset_op.cc
@@ -60,7 +60,7 @@ class ZipDatasetOp : public DatasetOpKernel {
       }
     }
 
-    std::unique_ptr<IteratorBase> MakeIterator(
+    std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
       return std::unique_ptr<IteratorBase>(
           new Iterator({this, strings::StrCat(prefix, "::Zip")}));
@@ -95,13 +95,16 @@ class ZipDatasetOp : public DatasetOpKernel {
     class Iterator : public DatasetIterator<Dataset> {
      public:
       explicit Iterator(const Params& params)
-          : DatasetIterator<Dataset>(params) {
-        input_impls_.reserve(params.dataset->inputs_.size());
-        size_t idx = 0;
-        for (const auto& input : params.dataset->inputs_) {
-          input_impls_.emplace_back(input->MakeIterator(
-              strings::StrCat(params.prefix, "[", idx++, "]")));
+          : DatasetIterator<Dataset>(params) {}
+
+      Status Initialize(IteratorContext* ctx) override {
+        mutex_lock l(mu_);
+        input_impls_.resize(dataset()->inputs_.size());
+        for (size_t i = 0; i < input_impls_.size(); ++i) {
+          TF_RETURN_IF_ERROR(dataset()->inputs_[i]->MakeIterator(
+              ctx, strings::StrCat(prefix(), "[", i, "]"), &input_impls_[i]));
         }
+        return Status::OK();
       }
 
       Status GetNextInternal(IteratorContext* ctx,
-- 
GitLab


From d3b5b07e7810782c3760468312f9cace10b89073 Mon Sep 17 00:00:00 2001
From: Nupur Garg <nupurgarg@google.com>
Date: Thu, 31 May 2018 13:58:32 -0700
Subject: [PATCH 114/610] Add attributes to TFLite Python API.

PiperOrigin-RevId: 198774775
---
 tensorflow/contrib/lite/python/convert.py     | 63 ++++++++++++---
 tensorflow/contrib/lite/python/lite.py        | 37 +++++++--
 tensorflow/contrib/lite/python/lite_test.py   | 61 ++++++++++++++
 .../contrib/lite/python/tflite_convert.py     | 81 +++++++++++++++----
 4 files changed, 208 insertions(+), 34 deletions(-)

diff --git a/tensorflow/contrib/lite/python/convert.py b/tensorflow/contrib/lite/python/convert.py
index c0926d2f33..0819475240 100644
--- a/tensorflow/contrib/lite/python/convert.py
+++ b/tensorflow/contrib/lite/python/convert.py
@@ -115,11 +115,15 @@ def toco_convert(input_data,
                  input_tensors,
                  output_tensors,
                  inference_type=lite_constants.FLOAT,
+                 inference_input_type=None,
                  input_format=lite_constants.TENSORFLOW_GRAPHDEF,
                  output_format=lite_constants.TFLITE,
                  quantized_input_stats=None,
+                 default_ranges_stats=None,
                  drop_control_dependency=True,
-                 allow_custom_ops=False):
+                 reorder_across_fake_quant=False,
+                 allow_custom_ops=False,
+                 change_concat_input_ranges=False):
   """Convert a model using TOCO from `input_format` to `output_format`.
 
   Typically this is to convert from TensorFlow GraphDef to TFLite, in which
@@ -130,18 +134,41 @@ def toco_convert(input_data,
     input_tensors: List of input tensors. Type and shape are computed using
       `foo.get_shape()` and `foo.dtype`.
     output_tensors: List of output tensors (only .name is used from this).
-    inference_type: Currently must be `{FLOAT, QUANTIZED_UINT8}`.
-    input_format: Type of data to read (currently must be TENSORFLOW_GRAPHDEF).
-    output_format: Type of data to write (currently must be TFLITE or
-      GRAPHVIZ_DOT)
-    quantized_input_stats: For each member of input_tensors the mean and
-      std deviation of training data. Only needed if `inference_type` is
-      `QUANTIZED_UINT8`.
-    drop_control_dependency: Drops control dependencies silently. This is due
-      to tf lite not supporting control dependencies.
+    inference_type: Target data type of arrays in the output file. Currently
+      must be `{FLOAT, QUANTIZED_UINT8}`.  (default FLOAT)
+    inference_input_type: Target data type of input arrays. Allows for a
+      different type for input arrays in the case of quantization. Currently
+      must be `{FLOAT, QUANTIZED_UINT8}`. (default `inference_type`)
+    input_format: Type of data to read. Currently must be
+      `{TENSORFLOW_GRAPHDEF}`. (default TENSORFLOW_GRAPHDEF)
+    output_format: Output file format. Currently must be `{TFLITE,
+      GRAPHVIZ_DOT}`. (default TFLITE)
+    quantized_input_stats: Dict of strings representing input tensor names
+      mapped to tuple of integers representing the mean and standard deviation
+      of the training data (e.g., {"foo" : (0., 1.)}). Only needed if
+      `inference_type` is `QUANTIZED_UINT8`. (default None)
+    default_ranges_stats: Tuple of integers representing (min, max) range values
+      for all arrays without a specified range. Intended for experimenting with
+      quantization via "dummy quantization". (default None)
+    drop_control_dependency: Boolean indicating whether to drop control
+      dependencies silently. This is due to TFLite not supporting control
+      dependencies. (default True)
+    reorder_across_fake_quant: Boolean indicating whether to reorder FakeQuant
+      nodes in unexpected locations. Used when the location of the FakeQuant
+      nodes is preventing graph transformations necessary to convert the graph.
+      Results in a graph that differs from the quantized training graph,
+      potentially causing differing arithmetic behavior. (default False)
+    change_concat_input_ranges: Boolean to change behavior of min/max ranges for
+      inputs and outputs of the concat operator for quantized models. Changes
+      the ranges of concat operator overlap when true. (default False)
+    allow_custom_ops: Boolean indicating whether to allow custom operations.
+      When false any unknown operation is an error. When true, custom ops are
+      created for any op that is unknown. The developer will need to provide
+      these to the TensorFlow Lite runtime with a custom resolver.
+      (default False)
 
   Returns:
-    The converted data. For example if tflite was the destination, then
+    The converted data. For example if TFLite was the destination, then
     this will be a tflite flatbuffer in a bytes array.
 
   Raises:
@@ -152,10 +179,18 @@ def toco_convert(input_data,
   toco = _toco_flags_pb2.TocoFlags()
   toco.input_format = input_format
   toco.output_format = output_format
-  toco.drop_control_dependency = drop_control_dependency
-  model = _model_flags_pb2.ModelFlags()
   toco.inference_type = inference_type
+  if inference_input_type:
+    toco.inference_input_type = inference_input_type
+  toco.drop_control_dependency = drop_control_dependency
+  toco.reorder_across_fake_quant = reorder_across_fake_quant
   toco.allow_custom_ops = allow_custom_ops
+  if default_ranges_stats:
+    toco.default_ranges_min = default_ranges_stats[0]
+    toco.default_ranges_max = default_ranges_stats[1]
+
+  model = _model_flags_pb2.ModelFlags()
+  model.change_concat_input_ranges = change_concat_input_ranges
   for idx, input_tensor in enumerate(input_tensors):
     if input_tensor.dtype == _dtypes.float32:
       tflite_input_type = lite_constants.FLOAT
@@ -163,6 +198,8 @@ def toco_convert(input_data,
       tflite_input_type = lite_constants.INT32
     elif input_tensor.dtype == _dtypes.int64:
       tflite_input_type = lite_constants.INT64
+    elif input_tensor.dtype == _dtypes.uint8:
+      tflite_input_type = lite_constants.QUANTIZED_UINT8
     # TODO(aselle): Insert strings when they are available
     else:
       raise ValueError("Tensors %s not known type %r" % (input_tensor.name,
diff --git a/tensorflow/contrib/lite/python/lite.py b/tensorflow/contrib/lite/python/lite.py
index 6510d74177..d55d8a6f6c 100644
--- a/tensorflow/contrib/lite/python/lite.py
+++ b/tensorflow/contrib/lite/python/lite.py
@@ -64,17 +64,33 @@ class TocoConverter(object):
 
     inference_type: Target data type of arrays in the output file. Currently
       must be `{FLOAT, QUANTIZED_UINT8}`.  (default FLOAT)
+    inference_input_type: Target data type of input arrays. Allows for a
+      different type for input arrays in the case of quantization. Currently
+      must be `{FLOAT, QUANTIZED_UINT8}`. (default `inference_type`)
     output_format: Output file format. Currently must be `{TFLITE,
       GRAPHVIZ_DOT}`. (default TFLITE)
-    quantized_input_stats: The mean and std deviation of training data for each
-      input tensor. Only needed if `inference_type` is `QUANTIZED_UINT8`.
-      Dict of strings representing input tensor names to a tuple of integers
-      representing the quantization stats (e.g., {"foo" : (0., 1.)}).
-      (default {})
+    quantized_input_stats: Dict of strings representing input tensor names
+      mapped to tuple of integers representing the mean and standard deviation
+      of the training data (e.g., {"foo" : (0., 1.)}). Only needed if
+      `inference_type` is `QUANTIZED_UINT8`. (default {})
+    default_ranges_stats: Tuple of integers representing (min, max) range values
+      for all arrays without a specified range. Intended for experimenting with
+      quantization via "dummy quantization". (default None)
     drop_control_dependency: Boolean indicating whether to drop control
       dependencies silently. This is due to TFLite not supporting control
       dependencies. (default True)
+    reorder_across_fake_quant: Boolean indicating whether to reorder FakeQuant
+      nodes in unexpected locations. Used when the location of the FakeQuant
+      nodes is preventing graph transformations necessary to convert the graph.
+      Results in a graph that differs from the quantized training graph,
+      potentially causing differing arithmetic behavior. (default False)
+    change_concat_input_ranges: Boolean to change behavior of min/max ranges for
+      inputs and outputs of the concat operator for quantized models. Changes
+      the ranges of concat operator overlap when true. (default False)
     allow_custom_ops: Boolean indicating whether to allow custom operations.
+      When false any unknown operation is an error. When true, custom ops are
+      created for any op that is unknown. The developer will need to provide
+      these to the TensorFlow Lite runtime with a custom resolver.
       (default False)
 
   Example usage:
@@ -109,9 +125,13 @@ class TocoConverter(object):
     self._input_tensors = input_tensors
     self._output_tensors = output_tensors
     self.inference_type = constants.FLOAT
+    self.inference_input_type = None
     self.output_format = constants.TFLITE
     self.quantized_input_stats = {}
+    self.default_ranges_stats = None
     self.drop_control_dependency = True
+    self.reorder_across_fake_quant = False
+    self.change_concat_input_ranges = False
     self.allow_custom_ops = False
 
   @classmethod
@@ -270,10 +290,15 @@ class TocoConverter(object):
         input_tensors=self._input_tensors,
         output_tensors=self._output_tensors,
         inference_type=self.inference_type,
+        inference_input_type=self.inference_input_type,
         input_format=constants.TENSORFLOW_GRAPHDEF,
         output_format=self.output_format,
         quantized_input_stats=quantized_stats,
-        drop_control_dependency=self.drop_control_dependency)
+        default_ranges_stats=self.default_ranges_stats,
+        drop_control_dependency=self.drop_control_dependency,
+        reorder_across_fake_quant=self.reorder_across_fake_quant,
+        change_concat_input_ranges=self.change_concat_input_ranges,
+        allow_custom_ops=self.allow_custom_ops)
     return result
 
   def _set_batch_size(self, batch_size):
diff --git a/tensorflow/contrib/lite/python/lite_test.py b/tensorflow/contrib/lite/python/lite_test.py
index 28386ecb1a..1b0cdb90ce 100644
--- a/tensorflow/contrib/lite/python/lite_test.py
+++ b/tensorflow/contrib/lite/python/lite_test.py
@@ -220,6 +220,67 @@ class FromSessionTest(test_util.TensorFlowTestCase):
     graphviz_output = converter.convert()
     self.assertTrue(graphviz_output)
 
+  def testInferenceInputType(self):
+    in_tensor = array_ops.placeholder(shape=[1, 16, 16, 3], dtype=dtypes.uint8)
+    out_tensor = in_tensor + in_tensor
+    sess = session.Session()
+
+    # Convert model and ensure model is not None.
+    converter = lite.TocoConverter.from_session(sess, [in_tensor], [out_tensor])
+    converter.inference_input_type = lite_constants.QUANTIZED_UINT8
+    tflite_model = converter.convert()
+    self.assertTrue(tflite_model)
+
+    # Check values from converted model.
+    interpreter = Interpreter(model_content=tflite_model)
+    interpreter.allocate_tensors()
+
+    input_details = interpreter.get_input_details()
+    self.assertEqual(1, len(input_details))
+    self.assertEqual('Placeholder', input_details[0]['name'])
+    self.assertEqual(np.uint8, input_details[0]['dtype'])
+    self.assertTrue(([1, 16, 16, 3] == input_details[0]['shape']).all())
+    self.assertEqual((0., 0.), input_details[0]['quantization'])
+
+    output_details = interpreter.get_output_details()
+    self.assertEqual(1, len(output_details))
+    self.assertEqual('add', output_details[0]['name'])
+    self.assertEqual(np.uint8, output_details[0]['dtype'])
+    self.assertTrue(([1, 16, 16, 3] == output_details[0]['shape']).all())
+    self.assertEqual((0., 0.), input_details[0]['quantization'])
+
+  def testDefaultRangesStats(self):
+    in_tensor = array_ops.placeholder(
+        shape=[1, 16, 16, 3], dtype=dtypes.float32)
+    out_tensor = in_tensor + in_tensor
+    sess = session.Session()
+
+    # Convert model and ensure model is not None.
+    converter = lite.TocoConverter.from_session(sess, [in_tensor], [out_tensor])
+    converter.inference_type = lite_constants.QUANTIZED_UINT8
+    converter.quantized_input_stats = {'Placeholder': (0., 1.)}  # mean, std_dev
+    converter.default_ranges_stats = (0, 6)  # min, max
+    tflite_model = converter.convert()
+    self.assertTrue(tflite_model)
+
+    # Check values from converted model.
+    interpreter = Interpreter(model_content=tflite_model)
+    interpreter.allocate_tensors()
+
+    input_details = interpreter.get_input_details()
+    self.assertEqual(1, len(input_details))
+    self.assertEqual('Placeholder', input_details[0]['name'])
+    self.assertEqual(np.uint8, input_details[0]['dtype'])
+    self.assertTrue(([1, 16, 16, 3] == input_details[0]['shape']).all())
+    self.assertEqual((1., 0.), input_details[0]['quantization'])
+
+    output_details = interpreter.get_output_details()
+    self.assertEqual(1, len(output_details))
+    self.assertEqual('add', output_details[0]['name'])
+    self.assertEqual(np.uint8, output_details[0]['dtype'])
+    self.assertTrue(([1, 16, 16, 3] == output_details[0]['shape']).all())
+    self.assertTrue(output_details[0]['quantization'][0] > 0)  # scale
+
 
 class FromFlatbufferFile(test_util.TensorFlowTestCase):
 
diff --git a/tensorflow/contrib/lite/python/tflite_convert.py b/tensorflow/contrib/lite/python/tflite_convert.py
index 79be5cdc56..38068bee08 100644
--- a/tensorflow/contrib/lite/python/tflite_convert.py
+++ b/tensorflow/contrib/lite/python/tflite_convert.py
@@ -91,6 +91,9 @@ def _convert_model(flags):
   converter = _get_toco_converter(flags)
   if flags.inference_type:
     converter.inference_type = _types_pb2.IODataType.Value(flags.inference_type)
+  if flags.inference_input_type:
+    converter.inference_input_type = _types_pb2.IODataType.Value(
+        flags.inference_input_type)
   if flags.output_format:
     converter.output_format = _toco_flags_pb2.FileFormat.Value(
         flags.output_format)
@@ -101,9 +104,16 @@ def _convert_model(flags):
     mean_values = _parse_int_array(flags.mean_values)
     quant_stats = zip(mean_values, std_dev_values)
     converter.quantized_input_stats = dict(zip(input_arrays, quant_stats))
+  if flags.default_ranges_min and flags.default_ranges_max:
+    converter.default_ranges_stats = (flags.default_ranges_min,
+                                      flags.default_ranges_max)
 
   if flags.drop_control_dependency:
     converter.drop_control_dependency = flags.drop_control_dependency
+  if flags.reorder_across_fake_quant:
+    converter.reorder_across_fake_quant = flags.reorder_across_fake_quant
+  if flags.change_concat_input_ranges:
+    converter.change_concat_input_ranges = flags.change_concat_input_ranges
   if flags.allow_custom_ops:
     converter.allow_custom_ops = flags.allow_custom_ops
 
@@ -116,8 +126,8 @@ def _convert_model(flags):
 def _check_flags(flags, unparsed):
   """Checks the parsed and unparsed flags to ensure they are valid.
 
-  Displays warnings for unparsed flags. Raises an error for parsed flags that
-  don't meet the required conditions.
+  Raises an error if previously supported unparsed flags are found. Raises an
+  error for parsed flags that don't meet the required conditions.
 
   Args:
     flags: argparse.Namespace object containing TFLite flags.
@@ -126,17 +136,20 @@ def _check_flags(flags, unparsed):
   Raises:
     ValueError: Invalid flags.
   """
+
   # Check unparsed flags for common mistakes based on previous TOCO.
+  def _get_message_unparsed(flag, orig_flag, new_flag):
+    if flag.startswith(orig_flag):
+      return "\n  Use {0} instead of {1}".format(new_flag, orig_flag)
+    return ""
+
   if unparsed:
-    print("tflite_convert: warning: Unable to parse following flags "
-          "'{}'".format(",".join(unparsed)))
+    output = ""
     for flag in unparsed:
-      if "--input_file=" in flag:
-        print("tflite_convert: warning: Use --graph_def_file instead of "
-              "--input_file")
-      if "--std_values=" in flag:
-        print("tflite_convert: warning: Use --std_dev_values instead of "
-              "--std_values")
+      output += _get_message_unparsed(flag, "--input_file", "--graph_def_file")
+      output += _get_message_unparsed(flag, "--std_value", "--std_dev_values")
+      output += _get_message_unparsed(flag, "--batch_size", "--input_shapes")
+    raise ValueError(output)
 
   # Check that flags are valid.
   if flags.graph_def_file and (not flags.input_arrays or
@@ -163,6 +176,10 @@ def _check_flags(flags, unparsed):
       raise ValueError("--std_dev_values, --mean_values, and --input_arrays "
                        "must have the same number of items")
 
+  if bool(flags.default_ranges_min) != bool(flags.default_ranges_max):
+    raise ValueError("--default_ranges_min and --default_ranges_max must be "
+                     "used together")
+
 
 def run_main(_):
   """Main in toco_convert.py."""
@@ -199,6 +216,12 @@ def run_main(_):
       type=str,
       choices=["FLOAT", "QUANTIZED_UINT8"],
       help="Target data type of arrays in the output file.")
+  parser.add_argument(
+      "--inference_input_type",
+      type=str,
+      choices=["FLOAT", "QUANTIZED_UINT8"],
+      help=("Target data type of input arrays. Allows for a different type for "
+            "input arrays in the case of quantization."))
 
   # Input and output arrays flags.
   parser.add_argument(
@@ -218,12 +241,13 @@ def run_main(_):
   parser.add_argument(
       "--saved_model_tag_set",
       type=str,
-      help=("Set of tags identifying the MetaGraphDef within the SavedModel "
-            "to analyze. All tags must be present. (default \"serve\")"))
+      help=("Comma-separated set of tags identifying the MetaGraphDef within "
+            "the SavedModel to analyze. All tags must be present. "
+            "(default \"serve\")"))
   parser.add_argument(
       "--saved_model_signature_key",
       type=str,
-      help=("Key identifying SignatureDef containing inputs and outputs. "
+      help=("Key identifying the SignatureDef containing inputs and outputs. "
             "(default DEFAULT_SERVING_SIGNATURE_DEF_KEY)"))
 
   # Quantization flags.
@@ -237,14 +261,41 @@ def run_main(_):
       type=str,
       help=("Mean of training data for each input tensor, comma-separated. "
             "Used for quantization. (default None)"))
+  parser.add_argument(
+      "--default_ranges_min",
+      type=int,
+      help=("Default value for min bound of min/max range values used for all "
+            "arrays without a specified range, Intended for experimenting with "
+            "quantization via \"dummy quantization\". (default None)"))
+  parser.add_argument(
+      "--default_ranges_max",
+      type=int,
+      help=("Default value for max bound of min/max range values used for all "
+            "arrays without a specified range, Intended for experimenting with "
+            "quantization via \"dummy quantization\". (default None)"))
 
   # Graph manipulation flags.
   parser.add_argument(
       "--drop_control_dependency",
       type=bool,
       help=("Boolean indicating whether to drop control dependencies silently. "
-            "This is due to TensorFlow Lite not supporting control "
-            "dependencies. (default True)"))
+            "This is due to TensorFlow not supporting control dependencies. "
+            "(default True)"))
+  parser.add_argument(
+      "--reorder_across_fake_quant",
+      type=bool,
+      help=("Boolean indicating whether to reorder FakeQuant nodes in "
+            "unexpected locations. Used when the location of the FakeQuant "
+            "nodes is preventing graph transformations necessary to convert "
+            "the graph. Results in a graph that differs from the quantized "
+            "training graph, potentially causing differing arithmetic "
+            "behavior. (default False)"))
+  parser.add_argument(
+      "--change_concat_input_ranges",
+      type=bool,
+      help=("Boolean to change behavior of min/max ranges for inputs and "
+            "outputs of the concat operator for quantized models. Changes the "
+            "ranges of concat operator overlap when true. (default False)"))
   parser.add_argument(
       "--allow_custom_ops",
       type=bool,
-- 
GitLab


From 395428bcaf02c9a9e8067083993d7e6b5afdc0a6 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 31 May 2018 14:01:45 -0700
Subject: [PATCH 115/610] Move RemoveRedundantReshape optimization to a
 separate stage.

PiperOrigin-RevId: 198775276
---
 .../optimizers/arithmetic_optimizer.cc        | 114 ++++++++++--------
 .../optimizers/arithmetic_optimizer.h         |   1 +
 .../optimizers/arithmetic_optimizer_test.cc   |  90 +++++++-------
 3 files changed, 111 insertions(+), 94 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
index e7f385cbd6..0edea16aac 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
@@ -196,22 +196,6 @@ void SetSourceDataType(DataType dtype, NodeDef* node) {
 
 bool IsNumberType(DataType dtype) { return kNumberTypes.Contains(dtype); }
 
-// Returns whether `reshape` is an identity op. The tensor that `reshape`
-// reshapes is the `output_pos`-th output of node `input`.
-bool ReshapeIsIdentity(const NodeDef& reshape, const NodeDef& input,
-                       const int output_pos,
-                       const GraphProperties& graph_properties) {
-  const std::vector<OpInfo::TensorProperties>& reshape_props =
-      graph_properties.GetOutputProperties(reshape.name());
-  const std::vector<OpInfo::TensorProperties>& input_props =
-      graph_properties.GetOutputProperties(input.name());
-  if (reshape_props.empty() || input_props.size() <= output_pos) {
-    return false;
-  }
-
-  return ShapesSymbolicallyEqual(input_props[output_pos], reshape_props[0]);
-}
-
 NodeDef* GetTailOfValuePreservingChain(
     const NodeDef& node, const NodeMap& node_map,
     const std::unordered_set<string>& nodes_to_preserve) {
@@ -1823,6 +1807,65 @@ class SqrtDivToRsqrtMulStage : public ArithmeticOptimizerStage {
   }
 };
 
+// Bypass redundant reshape nodes:
+//
+//   Reshape                    Reshape  <-+
+//      ^                                  |
+//      |                                  |
+//   Reshape       becomes      Reshape    |
+//      ^                                  |
+//      |                                  |
+//    input                      input  ---+
+class RemoveRedundantReshape : public ArithmeticOptimizerStage {
+ public:
+  explicit RemoveRedundantReshape(const GraphOptimizerContext& ctx,
+                                  const ArithmeticOptimizerContext& ctx_ext)
+      : ArithmeticOptimizerStage("RemoveRedundantReshape", ctx, ctx_ext) {}
+  ~RemoveRedundantReshape() override = default;
+
+  bool IsSupported(const NodeDef* node) const override {
+    return IsReshape(*node);
+  }
+
+  Status TrySimplify(NodeDef* node, string* simplified_node_name) override {
+    NodeDef* input;
+    TF_RETURN_IF_ERROR(GetInputNode(node->input(0), &input));
+
+    // 1. Bypass reshape followed by reshape.
+    if (IsReshape(*input) && !HasControlInputs(*input)) {
+      node->set_input(0, input->input(0));
+      ctx().node_map->UpdateInput(node->name(), input->name(), input->input(0));
+      *simplified_node_name = node->name();
+      AddToOptimizationQueue(node);
+      return Status::OK();
+    }
+
+    // 2. If the reshape is a no-op, forward its input to its consumers, unless
+    // it anchors a control dependency since we want to make sure that control
+    // dependency is triggered.
+    if (ReshapeIsIdentity(*node) && !HasControlInputs(*node)) {
+      *simplified_node_name = node->input(0);
+      return Status::OK();
+    }
+
+    return Status::OK();
+  }
+
+ private:
+  // Returns whether `reshape` is an identity op.
+  bool ReshapeIsIdentity(const NodeDef& reshape) {
+    OpInfo::TensorProperties reshape_props;
+    OpInfo::TensorProperties input_props;
+
+    if (!GetTensorProperties(reshape.name(), &reshape_props).ok() ||
+        !GetTensorProperties(reshape.input(0), &input_props).ok()) {
+      return false;
+    }
+
+    return ShapesSymbolicallyEqual(input_props.shape(), reshape_props.shape());
+  }
+};
+
 }  // namespace
 
 class UniqueNodes {
@@ -2076,43 +2119,6 @@ void ArithmeticOptimizer::ForwardControlDependencies(
 string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
     const NodeDef* node, SetVector<NodeDef*>* nodes_to_simplify) {
 
-  if (node->op() == "Reshape") {
-    //   Reshape
-    //      ^
-    //      |
-    //   Reshape
-    //      ^
-    //      |
-    //    input
-    //
-    // becomes
-    //
-    //   Reshape <-+
-    //             |
-    //   Reshape   |
-    //      ^      |
-    //      |      |
-    //    input ---+
-    NodeDef* reshape = const_cast<NodeDef*>(node);
-    int output_pos = 0;
-    string input_node_name = ParseNodeName(reshape->input(0), &output_pos);
-    const NodeDef* input = node_map_->GetNode(input_node_name);
-    if (input->op() == "Reshape" && !HasControlInputs(*input)) {
-      reshape->set_input(0, input->input(0));
-      node_map_->UpdateInput(reshape->name(), input->name(), input->input(0));
-      nodes_to_simplify->PushBack(reshape);
-      return reshape->name();
-    }
-
-    // If the reshape is a no-op, forward its input to its consumers, unless it
-    // anchors a control dependency since we want to make sure that control
-    // dependency is triggered.
-    if (ReshapeIsIdentity(*reshape, *input, output_pos, *graph_properties_) &&
-        !HasControlInputs(*reshape)) {
-      return reshape->input(0);
-    }
-  }
-
   if (node->op() == "Transpose") {
     // Reorder Cast and Transpose if beneficial.
     //
@@ -2450,6 +2456,8 @@ Status ArithmeticOptimizer::SimplifyArithmeticOps(bool can_use_shapes) {
     pipeline.AddStage<RemoveRedundantBitcastStage>(ctx, ctx_ext);
   if (options_.remove_redundant_cast)
     pipeline.AddStage<RemoveRedundantCastStage>(ctx, ctx_ext);
+  if (options_.remove_redundant_reshape)
+    pipeline.AddStage<RemoveRedundantReshape>(ctx, ctx_ext);
   if (options_.remove_negation)
     pipeline.AddStage<RemoveNegationStage>(ctx, ctx_ext);
   if (options_.remove_logical_not)
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
index 962399119d..9f8ec85e77 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
@@ -71,6 +71,7 @@ class ArithmeticOptimizer : public GraphOptimizer {
     bool remove_negation = true;
     bool remove_redundant_bitcast = true;
     bool remove_redundant_cast = true;
+    bool remove_redundant_reshape = true;
 
     // Choose which arithmetic optimizer stages will be enabled for a given
     // optimization level by default.
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
index f678ea7227..43355ef945 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
@@ -124,6 +124,7 @@ class ArithmeticOptimizerTest : public GrapplerTest {
     options.remove_idempotent = false;
     options.remove_redundant_bitcast = false;
     options.remove_redundant_cast = false;
+    options.remove_redundant_reshape = false;
     options.remove_negation = false;
     options.remove_logical_not = false;
     optimizer->options_ = options;
@@ -168,6 +169,11 @@ class ArithmeticOptimizerTest : public GrapplerTest {
     optimizer->options_.remove_redundant_cast = true;
   }
 
+  void EnableOnlyRemoveRedundantReshape(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.remove_redundant_reshape = true;
+  }
+
   void EnableOnlyRemoveNegation(ArithmeticOptimizer* optimizer) {
     DisableAllStages(optimizer);
     optimizer->options_.remove_negation = true;
@@ -955,7 +961,7 @@ TEST_F(ArithmeticOptimizerTest, FoldConjugateTransposeIntoBatchMatMul) {
   test::ExpectTensorNear<complex64>(tensors_expected[0], tensors[0], 1e-6);
 }
 
-TEST_F(ArithmeticOptimizerTest, IdentityReshape) {
+TEST_F(ArithmeticOptimizerTest, RemoveRedundantReshape_IdentityReshape) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   Output inputs =
       ops::Placeholder(s, DT_FLOAT, ops::Placeholder::Shape({-1, 3, 28, 28}));
@@ -977,11 +983,11 @@ TEST_F(ArithmeticOptimizerTest, IdentityReshape) {
   auto tensors_expected =
       EvaluateNodes(item.graph, item.fetch, {{"Placeholder", x_t}});
   EXPECT_EQ(1, tensors_expected.size());
-  GraphDef output;
-  TF_EXPECT_OK(ArithmeticOptimizer().Optimize(nullptr, item, &output));
 
-  item.graph.Swap(&output);
-  TF_EXPECT_OK(ModelPruner().Optimize(nullptr, item, &output));
+  GraphDef output;
+  ArithmeticOptimizer optimizer;
+  EnableOnlyRemoveRedundantReshape(&optimizer);
+  OptimizeTwiceAndPrune(&optimizer, &item, &output);
 
   EXPECT_EQ(0, CountOpNodes(output, "Reshape"));
   auto tensors = EvaluateNodes(output, item.fetch, {{"Placeholder", x_t}});
@@ -989,7 +995,8 @@ TEST_F(ArithmeticOptimizerTest, IdentityReshape) {
   test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
 }
 
-TEST_F(ArithmeticOptimizerTest, IdentityReshapeBetweenSymbolicShapes) {
+TEST_F(ArithmeticOptimizerTest,
+       RemoveRedundantReshape_IdentityReshapeBetweenSymbolicShapes) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   Output inputs =
       ops::Placeholder(s, DT_FLOAT, ops::Placeholder::Shape({-1, 3, -1, -1}));
@@ -1009,27 +1016,28 @@ TEST_F(ArithmeticOptimizerTest, IdentityReshapeBetweenSymbolicShapes) {
   Output reshape = ops::Reshape(s, inputs, target_shape);
   Output outputs = ops::Identity(s.WithOpName("outputs"), reshape);
 
+  auto x_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({3, 3, 28, 28}));
   GrapplerItem item;
   item.fetch = {"outputs"};
+  item.feed = {{"Placeholder", x_t}};
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
-  auto x_t = GenerateRandomTensor<DT_FLOAT>(TensorShape({3, 3, 28, 28}));
-  auto tensors_expected =
-      EvaluateNodes(item.graph, item.fetch, {{"Placeholder", x_t}});
+
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch, item.feed);
   EXPECT_EQ(1, tensors_expected.size());
-  GraphDef output;
-  TF_EXPECT_OK(ArithmeticOptimizer(RewriterConfig::AGGRESSIVE)
-                   .Optimize(nullptr, item, &output));
 
-  item.graph.Swap(&output);
-  TF_EXPECT_OK(ModelPruner().Optimize(nullptr, item, &output));
+  GraphDef output;
+  // Assume valid feed shape in aggressive mode.
+  ArithmeticOptimizer optimizer(RewriterConfig::AGGRESSIVE);
+  EnableOnlyRemoveRedundantReshape(&optimizer);
+  OptimizeTwiceAndPrune(&optimizer, &item, &output);
 
   EXPECT_EQ(0, CountOpNodes(output, "Reshape"));
-  auto tensors = EvaluateNodes(output, item.fetch, {{"Placeholder", x_t}});
+  auto tensors = EvaluateNodes(output, item.fetch, item.feed);
   EXPECT_EQ(1, tensors.size());
   test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
 }
 
-TEST_F(ArithmeticOptimizerTest, NotAssumeValidFeeds) {
+TEST_F(ArithmeticOptimizerTest, RemoveRedundantReshape_NotAssumeValidFeeds) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   Output inputs =
       ops::Placeholder(s, DT_FLOAT, ops::Placeholder::Shape({4, 3, 28, 28}));
@@ -1047,10 +1055,9 @@ TEST_F(ArithmeticOptimizerTest, NotAssumeValidFeeds) {
   EXPECT_EQ(1, tensors_expected.size());
 
   GraphDef output;
-  TF_EXPECT_OK(ArithmeticOptimizer().Optimize(nullptr, item, &output));
-
-  item.graph.Swap(&output);
-  TF_EXPECT_OK(ModelPruner().Optimize(nullptr, item, &output));
+  ArithmeticOptimizer optimizer;
+  EnableOnlyRemoveRedundantReshape(&optimizer);
+  OptimizeTwiceAndPrune(&optimizer, &item, &output);
 
   // The reshape is preserved because the shape of the placeholder can be
   // different from the shape of the actual feed.
@@ -1061,7 +1068,8 @@ TEST_F(ArithmeticOptimizerTest, NotAssumeValidFeeds) {
   test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
 }
 
-TEST_F(ArithmeticOptimizerTest, AssumeValidFeedsInAggressiveMode) {
+TEST_F(ArithmeticOptimizerTest,
+       RemoveRedundantReshape_AssumeValidFeedsInAggressiveMode) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   Output inputs =
       ops::Placeholder(s, DT_FLOAT, ops::Placeholder::Shape({4, 3, 28, 28}));
@@ -1077,12 +1085,11 @@ TEST_F(ArithmeticOptimizerTest, AssumeValidFeedsInAggressiveMode) {
 
   auto tensors_expected = EvaluateNodes(item.graph, item.fetch, item.feed);
   EXPECT_EQ(1, tensors_expected.size());
-  GraphDef output;
-  TF_EXPECT_OK(ArithmeticOptimizer(RewriterConfig::AGGRESSIVE)
-                   .Optimize(nullptr, item, &output));
 
-  item.graph.Swap(&output);
-  TF_EXPECT_OK(ModelPruner().Optimize(nullptr, item, &output));
+  GraphDef output;
+  ArithmeticOptimizer optimizer(RewriterConfig::AGGRESSIVE);
+  EnableOnlyRemoveRedundantReshape(&optimizer);
+  OptimizeTwiceAndPrune(&optimizer, &item, &output);
 
   EXPECT_EQ(0, CountOpNodes(output, "Reshape"));
   auto tensors = EvaluateNodes(output, item.fetch, item.feed);
@@ -1090,7 +1097,7 @@ TEST_F(ArithmeticOptimizerTest, AssumeValidFeedsInAggressiveMode) {
   test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
 }
 
-TEST_F(ArithmeticOptimizerTest, NotIdentityReshape) {
+TEST_F(ArithmeticOptimizerTest, RemoveRedundantReshape_NotIdentityReshape) {
   // Reshape from [-1,3,28,28] to [8,-1,28,28] is not identity, because it can
   // be from [4,3,28,28] to [8,6,28,28].
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
@@ -1106,11 +1113,11 @@ TEST_F(ArithmeticOptimizerTest, NotIdentityReshape) {
   item.feed = {{"Placeholder", x_t}};
   auto tensors_expected = EvaluateNodes(item.graph, item.fetch, item.feed);
   EXPECT_EQ(1, tensors_expected.size());
-  GraphDef output;
-  TF_EXPECT_OK(ArithmeticOptimizer().Optimize(nullptr, item, &output));
 
-  item.graph.Swap(&output);
-  TF_EXPECT_OK(ModelPruner().Optimize(nullptr, item, &output));
+  GraphDef output;
+  ArithmeticOptimizer optimizer;
+  EnableOnlyRemoveRedundantReshape(&optimizer);
+  OptimizeTwiceAndPrune(&optimizer, &item, &output);
 
   EXPECT_EQ(1, CountOpNodes(output, "Reshape"));
   auto tensors = EvaluateNodes(output, item.fetch, item.feed);
@@ -1118,7 +1125,8 @@ TEST_F(ArithmeticOptimizerTest, NotIdentityReshape) {
   test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
 }
 
-TEST_F(ArithmeticOptimizerTest, NotIdentityReshapeTooManyUnknownDimSizes) {
+TEST_F(ArithmeticOptimizerTest,
+       RemoveRedundantReshape_NotIdentityReshapeTooManyUnknownDimSizes) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   Output inputs =
       ops::Placeholder(s, DT_FLOAT, ops::Placeholder::Shape({4, 3}));
@@ -1128,16 +1136,16 @@ TEST_F(ArithmeticOptimizerTest, NotIdentityReshapeTooManyUnknownDimSizes) {
   GrapplerItem item;
   item.fetch = {"outputs"};
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
-  GraphDef output;
-  TF_EXPECT_OK(ArithmeticOptimizer().Optimize(nullptr, item, &output));
 
-  item.graph.Swap(&output);
-  TF_EXPECT_OK(ModelPruner().Optimize(nullptr, item, &output));
+  GraphDef output;
+  ArithmeticOptimizer optimizer;
+  EnableOnlyRemoveRedundantReshape(&optimizer);
+  OptimizeTwiceAndPrune(&optimizer, &item, &output);
 
   EXPECT_EQ(1, CountOpNodes(output, "Reshape"));
 }
 
-TEST_F(ArithmeticOptimizerTest, CombineReshapes) {
+TEST_F(ArithmeticOptimizerTest, RemoveRedundantReshape_CombineReshapes) {
   // Converts an NCHW_VECT_C tensor to NHWC and then flattens it to 2D. The two
   // reshapes should be combined.
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
@@ -1162,11 +1170,11 @@ TEST_F(ArithmeticOptimizerTest, CombineReshapes) {
   item.feed = {{"nchw_vect_c", x_t}};
   auto tensors_expected = EvaluateNodes(item.graph, item.fetch, item.feed);
   EXPECT_EQ(1, tensors_expected.size());
-  GraphDef output;
-  TF_EXPECT_OK(ArithmeticOptimizer().Optimize(nullptr, item, &output));
 
-  item.graph.Swap(&output);
-  TF_EXPECT_OK(ModelPruner().Optimize(nullptr, item, &output));
+  GraphDef output;
+  ArithmeticOptimizer optimizer;
+  EnableOnlyRemoveRedundantReshape(&optimizer);
+  OptimizeTwiceAndPrune(&optimizer, &item, &output);
 
   EXPECT_EQ(1, CountOpNodes(output, "Reshape"));
   auto tensors = EvaluateNodes(output, item.fetch, item.feed);
-- 
GitLab


From a18cb8741048e888ca854576f4ef352004344e0b Mon Sep 17 00:00:00 2001
From: Jacques Pienaar <jpienaar@google.com>
Date: Thu, 31 May 2018 14:24:13 -0700
Subject: [PATCH 116/610] Mark XLAShapeForArgument as const.

PiperOrigin-RevId: 198778945
---
 tensorflow/compiler/tf2xla/xla_compiler.cc | 2 +-
 tensorflow/compiler/tf2xla/xla_compiler.h  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/compiler/tf2xla/xla_compiler.cc b/tensorflow/compiler/tf2xla/xla_compiler.cc
index 2fce6166d4..a8bd199675 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler.cc
+++ b/tensorflow/compiler/tf2xla/xla_compiler.cc
@@ -225,7 +225,7 @@ Status XlaCompiler::CompileFunction(const XlaCompiler::CompileOptions& options,
 // Computes the XLA shape for argument 'arg'.
 Status XlaCompiler::XLAShapeForArgument(const XlaCompiler::Argument& arg,
                                         bool is_entry_computation,
-                                        xla::Shape* xla_shape) {
+                                        xla::Shape* xla_shape) const {
   switch (arg.kind) {
     case XlaCompiler::Argument::kConstant:
       LOG(FATAL) << "Unreachable case";
diff --git a/tensorflow/compiler/tf2xla/xla_compiler.h b/tensorflow/compiler/tf2xla/xla_compiler.h
index 76f4c4c1ea..c93850ce27 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler.h
+++ b/tensorflow/compiler/tf2xla/xla_compiler.h
@@ -314,7 +314,7 @@ class XlaCompiler {
   // See the class comment for more details about the argument passing
   // convention.
   Status XLAShapeForArgument(const Argument& arg, bool is_entry_computation,
-                             xla::Shape* xla_shape);
+                             xla::Shape* xla_shape) const;
 
   // Retrieves the channel handle associated with `key`. Allocates
   // a new channel handle if none exists.
-- 
GitLab


From 15ef74e6b733604a417a1e19435e1d8b08f67b7d Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 31 May 2018 14:42:07 -0700
Subject: [PATCH 117/610] Expose the ExponentialMovingAverage name as a public
 property.

PiperOrigin-RevId: 198782348
---
 tensorflow/python/training/moving_averages.py       | 13 +++++++++----
 tensorflow/python/training/moving_averages_test.py  |  1 +
 ...nsorflow.train.-exponential-moving-average.pbtxt |  4 ++++
 3 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/tensorflow/python/training/moving_averages.py b/tensorflow/python/training/moving_averages.py
index 61fc828a84..60cc54c264 100644
--- a/tensorflow/python/training/moving_averages.py
+++ b/tensorflow/python/training/moving_averages.py
@@ -344,6 +344,11 @@ class ExponentialMovingAverage(object):
     self._name = name
     self._averages = {}
 
+  @property
+  def name(self):
+    """The name of this ExponentialMovingAverage object."""
+    return self._name
+
   def apply(self, var_list=None):
     """Maintains moving averages of variables.
 
@@ -394,7 +399,7 @@ class ExponentialMovingAverage(object):
         if isinstance(var, variables.Variable):
           avg = slot_creator.create_slot(var,
                                          var.initialized_value(),
-                                         self._name,
+                                         self.name,
                                          colocate_with_primary=True)
           # NOTE(mrry): We only add `tf.Variable` objects to the
           # `MOVING_AVERAGE_VARIABLES` collection.
@@ -402,7 +407,7 @@ class ExponentialMovingAverage(object):
         else:
           avg = slot_creator.create_zeros_slot(
               var,
-              self._name,
+              self.name,
               colocate_with_primary=(var.op.type in ["Variable",
                                                      "VariableV2",
                                                      "VarHandleOp"]))
@@ -410,7 +415,7 @@ class ExponentialMovingAverage(object):
             zero_debias_true.add(avg)
       self._averages[var] = avg
 
-    with ops.name_scope(self._name) as scope:
+    with ops.name_scope(self.name) as scope:
       decay = ops.convert_to_tensor(self._decay, name="decay")
       if self._num_updates is not None:
         num_updates = math_ops.cast(self._num_updates,
@@ -462,7 +467,7 @@ class ExponentialMovingAverage(object):
     if var in self._averages:
       return self._averages[var].op.name
     return ops.get_default_graph().unique_name(
-        var.op.name + "/" + self._name, mark_as_used=False)
+        var.op.name + "/" + self.name, mark_as_used=False)
 
   def variables_to_restore(self, moving_avg_variables=None):
     """Returns a map of names to `Variables` to restore.
diff --git a/tensorflow/python/training/moving_averages_test.py b/tensorflow/python/training/moving_averages_test.py
index 6717811bbb..3e85e6bfa7 100644
--- a/tensorflow/python/training/moving_averages_test.py
+++ b/tensorflow/python/training/moving_averages_test.py
@@ -263,6 +263,7 @@ class ExponentialMovingAverageTest(test.TestCase):
       tensor2 = v0 + v1
       ema = moving_averages.ExponentialMovingAverage(
           0.25, zero_debias=zero_debias, name="foo")
+      self.assertEqual("foo", ema.name)
       self.assertEqual("v0/foo", ema.average_name(v0))
       self.assertEqual("v1/foo", ema.average_name(v1))
       self.assertEqual("add/foo", ema.average_name(tensor2))
diff --git a/tensorflow/tools/api/golden/tensorflow.train.-exponential-moving-average.pbtxt b/tensorflow/tools/api/golden/tensorflow.train.-exponential-moving-average.pbtxt
index 737acbe07c..c9fe136e68 100644
--- a/tensorflow/tools/api/golden/tensorflow.train.-exponential-moving-average.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.train.-exponential-moving-average.pbtxt
@@ -2,6 +2,10 @@ path: "tensorflow.train.ExponentialMovingAverage"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.moving_averages.ExponentialMovingAverage\'>"
   is_instance: "<type \'object\'>"
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
   member_method {
     name: "__init__"
     argspec: "args=[\'self\', \'decay\', \'num_updates\', \'zero_debias\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'ExponentialMovingAverage\'], "
-- 
GitLab


From b183563d0bfed9fce2b623b3bff3fa3bdeccad54 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 31 May 2018 14:48:51 -0700
Subject: [PATCH 118/610] Write checkpoint path of evaluated checkpoint to the
 event file.

PiperOrigin-RevId: 198783364
---
 tensorflow/python/estimator/estimator.py      | 36 ++++++++++++++++++-
 tensorflow/python/estimator/estimator_test.py | 33 ++++++++++++-----
 2 files changed, 59 insertions(+), 10 deletions(-)

diff --git a/tensorflow/python/estimator/estimator.py b/tensorflow/python/estimator/estimator.py
index cfbf7e2ce5..4f57a4ef79 100644
--- a/tensorflow/python/estimator/estimator.py
+++ b/tensorflow/python/estimator/estimator.py
@@ -38,9 +38,11 @@ from tensorflow.python.estimator import run_config
 from tensorflow.python.estimator import util as estimator_util
 from tensorflow.python.estimator.export import export as export_helpers
 from tensorflow.python.estimator.export import export_output
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
+from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import metrics as metrics_lib
@@ -1383,10 +1385,18 @@ class Estimator(object):
         hooks=all_hooks,
         config=self._session_config)
 
+    current_global_step = eval_results[ops.GraphKeys.GLOBAL_STEP]
+
     _write_dict_to_summary(
         output_dir=output_dir,
         dictionary=eval_results,
-        current_global_step=eval_results[ops.GraphKeys.GLOBAL_STEP])
+        current_global_step=current_global_step)
+
+    if checkpoint_path:
+      _write_checkpoint_path_to_summary(
+          output_dir=output_dir,
+          checkpoint_path=checkpoint_path,
+          current_global_step=current_global_step)
 
     return eval_results
 
@@ -1585,6 +1595,30 @@ def _write_dict_to_summary(output_dir,
   summary_writer.flush()
 
 
+def _write_checkpoint_path_to_summary(output_dir, checkpoint_path,
+                                      current_global_step):
+  """Writes `checkpoint_path` into summary file in the given output directory.
+
+  Args:
+    output_dir: `str`, directory to write the summary file in.
+    checkpoint_path: `str`, checkpoint file path to be written to summary file.
+    current_global_step: `int`, the current global step.
+  """
+
+  checkpoint_path_tag = 'checkpoint_path'
+
+  logging.info('Saving \'%s\' summary for global step %d: %s',
+               checkpoint_path_tag, current_global_step, checkpoint_path)
+  summary_proto = summary_pb2.Summary()
+  summary_proto.value.add(
+      tag=checkpoint_path_tag,
+      tensor=tensor_util.make_tensor_proto(
+          checkpoint_path, dtype=dtypes.string))
+  summary_writer = writer_cache.FileWriterCache.get(output_dir)
+  summary_writer.add_summary(summary_proto, current_global_step)
+  summary_writer.flush()
+
+
 def _has_dataset_or_queue_runner(maybe_tensor):
   """Returns True if TF dataset or QueueRunner has been used."""
   # Check TF dataset first. Here, we use a simple algorithm to check the top
diff --git a/tensorflow/python/estimator/estimator_test.py b/tensorflow/python/estimator/estimator_test.py
index a9f20f7fa4..9c0d0f7390 100644
--- a/tensorflow/python/estimator/estimator_test.py
+++ b/tensorflow/python/estimator/estimator_test.py
@@ -39,6 +39,7 @@ from tensorflow.python.estimator.inputs import numpy_io
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
 from tensorflow.python.framework import test_util
 from tensorflow.python.layers import layers
 from tensorflow.python.lib.io import file_io
@@ -81,21 +82,22 @@ def dummy_model_fn(features, labels, params):
   _, _, _ = features, labels, params
 
 
-def check_eventfile_for_keyword(keyword, dir_):
-  """Checks event files for the keyword."""
+def summaries_with_matching_keyword(keyword, dir_):
+  """Yields summary protos matching given keyword from event file."""
 
   writer_cache.FileWriterCache.clear()
 
-  # Get last Event written.
   event_paths = glob.glob(os.path.join(dir_, 'events*'))
-  last_event = None
-  for last_event in summary_iterator.summary_iterator(event_paths[-1]):
-    if last_event.summary is not None:
-      for value in last_event.summary.value:
+  for event in summary_iterator.summary_iterator(event_paths[-1]):
+    if event.summary is not None:
+      for value in event.summary.value:
         if keyword in value.tag:
-          return True
+          yield event.summary
+
 
-  return False
+def check_eventfile_for_keyword(keyword, dir_):
+  """Checks event files for the keyword."""
+  return any(summaries_with_matching_keyword(keyword, dir_))
 
 
 class EstimatorInheritanceConstraintTest(test.TestCase):
@@ -1398,6 +1400,19 @@ class EstimatorEvaluateTest(test.TestCase):
           check_eventfile_for_keyword(key, est.eval_dir()),
           '{} should be part of reported summaries.'.format(key))
 
+    # Verify that evaluated checkpoint path is written to event file.
+    checkpoint_path_tag = 'checkpoint_path'
+    self.assertTrue(
+        check_eventfile_for_keyword(checkpoint_path_tag, est.eval_dir()),
+        '{} should be part of reported summaries.'.format(checkpoint_path_tag))
+
+    expected_tensor_proto = tensor_util.make_tensor_proto(
+        est.latest_checkpoint(), dtype=dtypes.string)
+    summaries = summaries_with_matching_keyword(checkpoint_path_tag,
+                                                est.eval_dir())
+    self.assertProtoEquals(expected_tensor_proto,
+                           next(summaries).value[0].tensor)
+
 
 class EstimatorPredictTest(test.TestCase):
 
-- 
GitLab


From f21816ecefe3f6e554d3b7daae3bb7f7a03bad20 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 31 May 2018 15:05:23 -0700
Subject: [PATCH 119/610] Similar to cr/188652533, specify the
 `maximum_iterations` to tf.while_loop in tf.map_fn to be compatible with XLA.

PiperOrigin-RevId: 198786266
---
 tensorflow/python/ops/functional_ops.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/ops/functional_ops.py b/tensorflow/python/ops/functional_ops.py
index 394ad0b1a2..30413f289a 100644
--- a/tensorflow/python/ops/functional_ops.py
+++ b/tensorflow/python/ops/functional_ops.py
@@ -455,7 +455,8 @@ def map_fn(fn, elems, dtype=None, parallel_iterations=10, back_prop=True,
         lambda i, _: i < n, compute, (i, accs_ta),
         parallel_iterations=parallel_iterations,
         back_prop=back_prop,
-        swap_memory=swap_memory)
+        swap_memory=swap_memory,
+        maximum_iterations=n)
     results_flat = [r.stack() for r in r_a]
 
     n_static = elems_flat[0].get_shape().with_rank_at_least(1)[0]
-- 
GitLab


From 269a4ed1c27251b55cffe578b7bd969ec5975487 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 31 May 2018 15:11:26 -0700
Subject: [PATCH 120/610] Internal change.

PiperOrigin-RevId: 198787391
---
 tensorflow/contrib/lite/kernels/basic_rnn.cc  | 41 ++++++++++++-------
 .../lite/kernels/internal/kernel_utils.cc     |  7 +---
 .../lite/kernels/internal/kernel_utils.h      |  6 ++-
 .../kernels/unidirectional_sequence_rnn.cc    | 41 ++++++++++++-------
 4 files changed, 60 insertions(+), 35 deletions(-)

diff --git a/tensorflow/contrib/lite/kernels/basic_rnn.cc b/tensorflow/contrib/lite/kernels/basic_rnn.cc
index 7dc0c5656d..c09b15b3d2 100644
--- a/tensorflow/contrib/lite/kernels/basic_rnn.cc
+++ b/tensorflow/contrib/lite/kernels/basic_rnn.cc
@@ -36,7 +36,7 @@ constexpr int kOutputTensor = 1;
 
 void* Init(TfLiteContext* context, const char* buffer, size_t length) {
   auto* scratch_tensor_index = new int;
-  context->AddTensors(context, /*tensors_to_add=*/2, scratch_tensor_index);
+  context->AddTensors(context, /*tensors_to_add=*/3, scratch_tensor_index);
   return scratch_tensor_index;
 }
 
@@ -91,7 +91,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   if (input->type == kTfLiteFloat32 && input_weights->type == kTfLiteUInt8) {
     int* scratch_tensor_index = reinterpret_cast<int*>(node->user_data);
     TfLiteIntArrayFree(node->temporaries);
-    node->temporaries = TfLiteIntArrayCreate(2);
+    node->temporaries = TfLiteIntArrayCreate(3);
     node->temporaries->data[0] = *scratch_tensor_index;
     TfLiteTensor* input_quantized = GetTemporary(context, node, /*index=*/0);
     input_quantized->type = kTfLiteUInt8;
@@ -114,6 +114,16 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
                         context->ResizeTensor(context, hidden_state_quantized,
                                               hidden_state_quantized_size));
     }
+    node->temporaries->data[2] = *scratch_tensor_index + 2;
+    TfLiteTensor* scaling_factors = GetTemporary(context, node, /*index=*/2);
+    scaling_factors->type = kTfLiteFloat32;
+    scaling_factors->allocation_type = kTfLiteArenaRw;
+    TfLiteIntArray* scaling_factors_size = TfLiteIntArrayCreate(1);
+    scaling_factors_size->data[0] = batch_size;
+    if (!TfLiteIntArrayEqual(scaling_factors->dims, scaling_factors_size)) {
+      TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, scaling_factors,
+                                                       scaling_factors_size));
+    }
   }
 
   return kTfLiteOk;
@@ -145,14 +155,14 @@ TfLiteStatus EvalFloat(const TfLiteTensor* input,
   return kTfLiteOk;
 }
 
-TfLiteStatus EvalQuantized(const TfLiteTensor* input,
-                           const TfLiteTensor* input_weights,
-                           const TfLiteTensor* recurrent_weights,
-                           const TfLiteTensor* bias,
-                           const TfLiteRNNParams* params,
-                           TfLiteTensor* input_scratch,
-                           TfLiteTensor* hidden_state_scratch,
-                           TfLiteTensor* hidden_state, TfLiteTensor* output) {
+TfLiteStatus EvalHybrid(const TfLiteTensor* input,
+                        const TfLiteTensor* input_weights,
+                        const TfLiteTensor* recurrent_weights,
+                        const TfLiteTensor* bias, const TfLiteRNNParams* params,
+                        TfLiteTensor* input_scratch,
+                        TfLiteTensor* hidden_state_scratch,
+                        TfLiteTensor* scaling_factors,
+                        TfLiteTensor* hidden_state, TfLiteTensor* output) {
   const int batch_size = input->dims->data[0];
   const int num_units = input_weights->dims->data[0];
   const int input_size = input->dims->data[1];
@@ -176,12 +186,14 @@ TfLiteStatus EvalQuantized(const TfLiteTensor* input,
       reinterpret_cast<int8_t*>(input_scratch->data.uint8);
   int8_t* quantized_hidden_state_ptr =
       reinterpret_cast<int8_t*>(hidden_state_scratch->data.uint8);
+  float* scaling_factors_ptr = scaling_factors->data.f;
 
   kernel_utils::RnnBatchStep(
       input_ptr_batch, input_weights_ptr, input_weights_scale,
       recurrent_weights_ptr, recurrent_weights_scale, bias_ptr, input_size,
       num_units, batch_size, params->activation, quantized_input_ptr,
-      quantized_hidden_state_ptr, hidden_state_ptr_batch, output_ptr_batch);
+      quantized_hidden_state_ptr, scaling_factors_ptr, hidden_state_ptr_batch,
+      output_ptr_batch);
   return kTfLiteOk;
 }
 
@@ -205,9 +217,10 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       // TODO(mirkov): implement eval with quantized inputs as well.
       TfLiteTensor* input_quantized = GetTemporary(context, node, 0);
       TfLiteTensor* hidden_state_quantized = GetTemporary(context, node, 1);
-      return EvalQuantized(input, input_weights, recurrent_weights, bias,
-                           params, input_quantized, hidden_state_quantized,
-                           hidden_state, output);
+      TfLiteTensor* scaling_factors = GetTemporary(context, node, 2);
+      return EvalHybrid(input, input_weights, recurrent_weights, bias, params,
+                        input_quantized, hidden_state_quantized,
+                        scaling_factors, hidden_state, output);
     }
     default:
       context->ReportError(context, "Type %d not currently supported.",
diff --git a/tensorflow/contrib/lite/kernels/internal/kernel_utils.cc b/tensorflow/contrib/lite/kernels/internal/kernel_utils.cc
index 3bbaaa6a9d..67e3810479 100644
--- a/tensorflow/contrib/lite/kernels/internal/kernel_utils.cc
+++ b/tensorflow/contrib/lite/kernels/internal/kernel_utils.cc
@@ -52,7 +52,8 @@ void RnnBatchStep(const float* input_ptr_batch, const int8_t* input_weights_ptr,
                   TfLiteFusedActivation activation,
                   int8_t* quantized_input_ptr_batch,
                   int8_t* quantized_hidden_state_ptr_batch,
-                  float* hidden_state_ptr_batch, float* output_ptr_batch) {
+                  float* scaling_factors, float* hidden_state_ptr_batch,
+                  float* output_ptr_batch) {
   // Output = bias
   tensor_utils::VectorBatchVectorAssign(bias_ptr, num_units, batch_size,
                                         output_ptr_batch);
@@ -62,7 +63,6 @@ void RnnBatchStep(const float* input_ptr_batch, const int8_t* input_weights_ptr,
     // Quantize input from float to uint8 + quantization params (scaling
     // factor).
     float unused_min, unused_max;
-    float* scaling_factors = new float[batch_size];
     for (int b = 0; b < batch_size; ++b) {
       const int offset = b * input_size;
       tensor_utils::SymmetricQuantizeFloats(
@@ -76,7 +76,6 @@ void RnnBatchStep(const float* input_ptr_batch, const int8_t* input_weights_ptr,
     tensor_utils::MatrixBatchVectorMultiplyAccumulate(
         input_weights_ptr, num_units, input_size, quantized_input_ptr_batch,
         scaling_factors, batch_size, output_ptr_batch, /*result_stride=*/1);
-    delete[] scaling_factors;
   }
 
   // Save quantization and matmul computation for all zero input.
@@ -84,7 +83,6 @@ void RnnBatchStep(const float* input_ptr_batch, const int8_t* input_weights_ptr,
                                   batch_size * num_units)) {
     // Quantize hidden_state
     float unused_min, unused_max;
-    float* scaling_factors = new float[batch_size];
     for (int b = 0; b < batch_size; ++b) {
       const int offset = b * num_units;
       tensor_utils::SymmetricQuantizeFloats(
@@ -99,7 +97,6 @@ void RnnBatchStep(const float* input_ptr_batch, const int8_t* input_weights_ptr,
         recurrent_weights_ptr, num_units, num_units,
         quantized_hidden_state_ptr_batch, scaling_factors, batch_size,
         output_ptr_batch, /*result_stride=*/1);
-    delete[] scaling_factors;
   }
 
   // Output = activation(Output) and update hidden_state
diff --git a/tensorflow/contrib/lite/kernels/internal/kernel_utils.h b/tensorflow/contrib/lite/kernels/internal/kernel_utils.h
index cbfbcbeefc..f3f42f0840 100644
--- a/tensorflow/contrib/lite/kernels/internal/kernel_utils.h
+++ b/tensorflow/contrib/lite/kernels/internal/kernel_utils.h
@@ -41,6 +41,9 @@ void RnnBatchStep(const float* input_ptr_batch, const float* input_weights_ptr,
 // values of hidden_state_ptr_batch and input_ptr_batch, respectively.
 // These temporary storages are expected to be preallocated to the same size as
 // the respective pointers.
+// An additional preallocated temporary storage 'scaling_factors' (of size
+// batch_size) is used to store the scaling factors of the quantization (used
+// for recovery).
 // {input,recurrent}_weights_scale params are used for dequantization/recovery.
 void RnnBatchStep(const float* input_ptr_batch, const int8_t* input_weights_ptr,
                   float input_weights_scale,
@@ -50,7 +53,8 @@ void RnnBatchStep(const float* input_ptr_batch, const int8_t* input_weights_ptr,
                   TfLiteFusedActivation activation,
                   int8_t* quantized_input_ptr_batch,
                   int8_t* quantized_hidden_state_ptr_batch,
-                  float* hidden_state_ptr_batch, float* output_ptr_batch);
+                  float* scaling_factors, float* hidden_state_ptr_batch,
+                  float* output_ptr_batch);
 
 // Performs an LSTM batch inference step for input specified by input_ptr_batch.
 // The LSTM cell is specified by the pointers to its weights (*_weights_ptr) and
diff --git a/tensorflow/contrib/lite/kernels/unidirectional_sequence_rnn.cc b/tensorflow/contrib/lite/kernels/unidirectional_sequence_rnn.cc
index 8429dba54b..164a0cbd08 100644
--- a/tensorflow/contrib/lite/kernels/unidirectional_sequence_rnn.cc
+++ b/tensorflow/contrib/lite/kernels/unidirectional_sequence_rnn.cc
@@ -41,7 +41,7 @@ constexpr int kOutputTensor = 1;
 
 void* Init(TfLiteContext* context, const char* buffer, size_t length) {
   auto* scratch_tensor_index = new int;
-  context->AddTensors(context, /*tensors_to_add=*/2, scratch_tensor_index);
+  context->AddTensors(context, /*tensors_to_add=*/3, scratch_tensor_index);
   return scratch_tensor_index;
 }
 
@@ -102,7 +102,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   if (input->type == kTfLiteFloat32 && input_weights->type == kTfLiteUInt8) {
     int* scratch_tensor_index = reinterpret_cast<int*>(node->user_data);
     TfLiteIntArrayFree(node->temporaries);
-    node->temporaries = TfLiteIntArrayCreate(2);
+    node->temporaries = TfLiteIntArrayCreate(3);
     node->temporaries->data[0] = *scratch_tensor_index;
     TfLiteTensor* input_quantized = GetTemporary(context, node, /*index=*/0);
     input_quantized->type = kTfLiteUInt8;
@@ -125,6 +125,16 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
                         context->ResizeTensor(context, hidden_state_quantized,
                                               hidden_state_quantized_size));
     }
+    node->temporaries->data[2] = *scratch_tensor_index + 2;
+    TfLiteTensor* scaling_factors = GetTemporary(context, node, /*index=*/2);
+    scaling_factors->type = kTfLiteFloat32;
+    scaling_factors->allocation_type = kTfLiteArenaRw;
+    TfLiteIntArray* scaling_factors_size = TfLiteIntArrayCreate(1);
+    scaling_factors_size->data[0] = batch_size;
+    if (!TfLiteIntArrayEqual(scaling_factors->dims, scaling_factors_size)) {
+      TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, scaling_factors,
+                                                       scaling_factors_size));
+    }
   }
   return kTfLiteOk;
 }
@@ -187,14 +197,12 @@ TfLiteStatus EvalFloat(const TfLiteTensor* input,
   return kTfLiteOk;
 }
 
-TfLiteStatus EvalQuantized(const TfLiteTensor* input,
-                           const TfLiteTensor* input_weights,
-                           const TfLiteTensor* recurrent_weights,
-                           const TfLiteTensor* bias,
-                           const TfLiteSequenceRNNParams* params,
-                           TfLiteTensor* input_scratch,
-                           TfLiteTensor* hidden_state_scratch,
-                           TfLiteTensor* hidden_state, TfLiteTensor* output) {
+TfLiteStatus EvalHybrid(
+    const TfLiteTensor* input, const TfLiteTensor* input_weights,
+    const TfLiteTensor* recurrent_weights, const TfLiteTensor* bias,
+    const TfLiteSequenceRNNParams* params, TfLiteTensor* input_scratch,
+    TfLiteTensor* hidden_state_scratch, TfLiteTensor* scaling_factors,
+    TfLiteTensor* hidden_state, TfLiteTensor* output) {
   const bool time_major = params->time_major;
   const int batch_size =
       (time_major) ? input->dims->data[1] : input->dims->data[0];
@@ -218,6 +226,7 @@ TfLiteStatus EvalQuantized(const TfLiteTensor* input,
       reinterpret_cast<int8_t*>(input_scratch->data.uint8);
   int8_t* quantized_hidden_state_ptr =
       reinterpret_cast<int8_t*>(hidden_state_scratch->data.uint8);
+  float* scaling_factors_ptr = scaling_factors->data.f;
 
   if (time_major) {
     // Initialize the pointer to hidden state.
@@ -233,7 +242,8 @@ TfLiteStatus EvalQuantized(const TfLiteTensor* input,
           input_ptr_batch, input_weights_ptr, input_weights_scale,
           recurrent_weights_ptr, recurrent_weights_scale, bias_ptr, input_size,
           num_units, batch_size, params->activation, quantized_input_ptr,
-          quantized_hidden_state_ptr, hidden_state_ptr_batch, output_ptr_batch);
+          quantized_hidden_state_ptr, scaling_factors_ptr,
+          hidden_state_ptr_batch, output_ptr_batch);
     }
   } else {
     // For each batch
@@ -252,7 +262,7 @@ TfLiteStatus EvalQuantized(const TfLiteTensor* input,
             recurrent_weights_ptr, recurrent_weights_scale, bias_ptr,
             input_size, num_units, /*batch_size=*/1, params->activation,
             quantized_input_ptr, quantized_hidden_state_ptr,
-            hidden_state_ptr_batch, output_ptr_batch);
+            scaling_factors_ptr, hidden_state_ptr_batch, output_ptr_batch);
       }
     }
   }
@@ -278,9 +288,10 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       // TODO(mirkov): implement eval with quantized inputs as well.
       TfLiteTensor* input_quantized = GetTemporary(context, node, 0);
       TfLiteTensor* hidden_state_quantized = GetTemporary(context, node, 1);
-      return EvalQuantized(input, input_weights, recurrent_weights, bias,
-                           params, input_quantized, hidden_state_quantized,
-                           hidden_state, output);
+      TfLiteTensor* scaling_factors = GetTemporary(context, node, 2);
+      return EvalHybrid(input, input_weights, recurrent_weights, bias, params,
+                        input_quantized, hidden_state_quantized,
+                        scaling_factors, hidden_state, output);
     }
     default:
       context->ReportError(context, "Type %d not currently supported.",
-- 
GitLab


From 4f6074494d4bf77daac5749224017615bfca239f Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 31 May 2018 15:17:52 -0700
Subject: [PATCH 121/610] Move reorder-cast-and-transpose optimization to
 optimization stage.

PiperOrigin-RevId: 198788352
---
 .../optimizers/arithmetic_optimizer.cc        | 154 +++++++++++-------
 .../optimizers/arithmetic_optimizer.h         |   1 +
 .../optimizers/arithmetic_optimizer_test.cc   |  55 ++++---
 3 files changed, 133 insertions(+), 77 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
index 0edea16aac..ca3f84a81d 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
@@ -194,8 +194,6 @@ void SetSourceDataType(DataType dtype, NodeDef* node) {
   SetDataTypeToAttr(dtype, SourceDataTypeAttrName(*node), node);
 }
 
-bool IsNumberType(DataType dtype) { return kNumberTypes.Contains(dtype); }
-
 NodeDef* GetTailOfValuePreservingChain(
     const NodeDef& node, const NodeMap& node_map,
     const std::unordered_set<string>& nodes_to_preserve) {
@@ -1866,6 +1864,100 @@ class RemoveRedundantReshape : public ArithmeticOptimizerStage {
   }
 };
 
+// Reorder Cast and Transpose if beneficial.
+//
+// A common pattern after the layout optimizer is casting an uint8 NHWC
+// image to float before transposing it to NCHW. It is beneficial to reorder
+// the cast and the transpose to make the transpose process smaller amount
+// of data. This optimization converts
+//   Transpose(Cast(image, dst_type), perm)
+// to
+//   Cast(Transpose(image, perm), dst_type)
+// when sizeof(image.type) < sizeof(dst_type).
+//
+// TODO(jingyue): This optimization can be generalized to a cast followed by
+// a chain of ops that merely reorder elements (e.g. Reshape and
+// DepthToSpace).
+class ReorderCastAndTranspose : public ArithmeticOptimizerStage {
+ public:
+  explicit ReorderCastAndTranspose(const GraphOptimizerContext& ctx,
+                                   const ArithmeticOptimizerContext& ctx_ext)
+      : ArithmeticOptimizerStage("ReorderCastAndTranspose", ctx, ctx_ext) {}
+  ~ReorderCastAndTranspose() override = default;
+
+  bool IsSupported(const NodeDef* node) const override {
+    return IsTranspose(*node) && NodeIsOnCpuOrGpu(node);
+  }
+
+  Status TrySimplify(NodeDef* node, string* simplified_node_name) override {
+    const NodeDef* transpose = node;
+
+    // Verify that input to Transpose is the Cast op.
+    NodeDef* cast;
+    TF_RETURN_IF_ERROR(GetInputNode(transpose->input(0), &cast));
+    if (!IsCast(*cast)) return Status::OK();
+
+    // Input to the Cast-Transpose chain.
+    NodeDef* input;
+    TF_RETURN_IF_ERROR(GetInputNode(cast->input(0), &input));
+
+    const DataType src_type = GetSourceDataType(*cast);
+    const DataType dst_type = GetDestinationDataType(*cast);
+
+    const string src_type_name = DataTypeString(src_type);
+    const string dst_type_name = DataTypeString(dst_type);
+
+    // Check if nodes were not already optimized.
+    const string optimized_cast_name =
+        OptimizedNodeName(ParseNodeScopeAndName(cast->name()), dst_type_name);
+    const string optimized_transpose_name = OptimizedNodeName(
+        ParseNodeScopeAndName(transpose->name()), src_type_name);
+
+    bool is_already_optimized =
+        ctx().node_map->NodeExists(optimized_transpose_name) ||
+        ctx().node_map->NodeExists(optimized_cast_name);
+
+    if (IsNumberType(src_type) && IsNumberType(dst_type) &&
+        DataTypeSize(src_type) < DataTypeSize(dst_type) &&
+        !is_already_optimized) {
+      NodeDef* new_transpose = AddCopyNode(optimized_transpose_name, transpose);
+      (*new_transpose->mutable_attr())["T"].set_type(src_type);
+      new_transpose->set_input(0, cast->input(0));
+
+      ctx().node_map->AddOutput(input->name(), new_transpose->name());
+      ctx().node_map->AddOutput(NodeName(new_transpose->input(1)),
+                                new_transpose->name());
+
+      NodeDef* new_cast = AddCopyNode(optimized_cast_name, cast);
+      new_cast->set_input(0, new_transpose->name());
+      ctx().node_map->AddOutput(new_transpose->name(), new_cast->name());
+
+      AddToOptimizationQueue(new_transpose);
+      ForwardControlDependencies(new_transpose, {cast, node});
+
+      *simplified_node_name = new_cast->name();
+    }
+
+    return Status::OK();
+  }
+
+ private:
+  // This optimization can be dangerous on devices other than CPU and
+  // GPU. The transpose might not be implemented for image.type, or
+  // might be slower with image.type than with dst_type.
+  bool NodeIsOnCpuOrGpu(const NodeDef* node) const {
+    using str_util::StrContains;
+
+    string task;
+    string device;
+
+    return DeviceNameUtils::SplitDeviceName(node->device(), &task, &device) &&
+           (StrContains(device, DEVICE_CPU) || StrContains(device, DEVICE_GPU));
+  }
+
+  bool IsNumberType(DataType dtype) { return kNumberTypes.Contains(dtype); }
+};
+
 }  // namespace
 
 class UniqueNodes {
@@ -2118,62 +2210,6 @@ void ArithmeticOptimizer::ForwardControlDependencies(
 // ArithmeticOptimizerStage
 string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
     const NodeDef* node, SetVector<NodeDef*>* nodes_to_simplify) {
-
-  if (node->op() == "Transpose") {
-    // Reorder Cast and Transpose if beneficial.
-    //
-    // A common pattern after the layout optimizer is casting an uint8 NHWC
-    // image to float before transposing it to NCHW. It is beneficial to reorder
-    // the cast and the transpose to make the transpose process smaller amount
-    // of data. This optimization converts
-    //   Transpose(Cast(image, dst_type), perm)
-    // to
-    //   Cast(Transpose(image, perm), dst_type)
-    // when sizeof(image.type) < sizeof(dst_type).
-    //
-    // TODO(jingyue): This optimization can be generalized to a cast followed by
-    // a chain of ops that merely reorder elements (e.g. Reshape and
-    // DepthToSpace).
-    const NodeDef* transpose = node;
-    string dontcare;
-    string device;
-    // This optimization can be dangerous on devices other than CPU and GPU. The
-    // transpose might not be implemented for image.type, or might be slower
-    // with image.type than with dst_type.
-    if (DeviceNameUtils::SplitDeviceName(transpose->device(), &dontcare,
-                                         &device) &&
-        (str_util::StrContains(device, DEVICE_CPU) ||
-         str_util::StrContains(device, DEVICE_GPU))) {
-      const NodeDef* cast = node_map_->GetNode(transpose->input(0));
-      if (cast->op() == "Cast") {
-        const NodeDef* input = node_map_->GetNode(cast->input(0));
-        const DataType src_type = GetSourceDataType(*cast);
-        const DataType dst_type = GetDestinationDataType(*cast);
-        if (IsNumberType(src_type) && IsNumberType(dst_type) &&
-            DataTypeSize(src_type) < DataTypeSize(dst_type) &&
-            !OptimizedNodeExists(*cast, DataTypeString(dst_type)) &&
-            !OptimizedNodeExists(*transpose, DataTypeString(src_type))) {
-          NodeDef* new_transpose = AddNode(*transpose, DataTypeString(src_type),
-                                           /*copy_node=*/true);
-          (*new_transpose->mutable_attr())["T"].set_type(src_type);
-          new_transpose->set_input(0, cast->input(0));
-          node_map_->AddOutput(input->name(), new_transpose->name());
-          node_map_->AddOutput(NodeName(new_transpose->input(1)),
-                               new_transpose->name());
-
-          NodeDef* new_cast =
-              AddNode(*cast, DataTypeString(dst_type), /*copy_node=*/true);
-          new_cast->set_input(0, new_transpose->name());
-          node_map_->AddOutput(new_transpose->name(), new_cast->name());
-
-          nodes_to_simplify->PushBack(new_transpose);
-          ForwardControlDependencies(new_transpose, {cast, node});
-          return new_cast->name();
-        }
-      }
-    }
-  }
-
   // Fold a multiply of a scalar into the following convolution. This folding
   // can jump across nodes that merely reorders data (such as reshape and
   // transpose). For example, we can optimize
@@ -2462,6 +2498,8 @@ Status ArithmeticOptimizer::SimplifyArithmeticOps(bool can_use_shapes) {
     pipeline.AddStage<RemoveNegationStage>(ctx, ctx_ext);
   if (options_.remove_logical_not)
     pipeline.AddStage<RemoveLogicalNotStage>(ctx, ctx_ext);
+  if (options_.reorder_cast_and_transpose)
+    pipeline.AddStage<ReorderCastAndTranspose>(ctx, ctx_ext);
   if (options_.hoist_cwise_unary_chains)
     pipeline.AddStage<HoistCWiseUnaryChainsStage>(ctx, ctx_ext);
   if (options_.convert_sqrt_div_to_rsqrt_mul)
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
index 9f8ec85e77..0fce23a40a 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
@@ -72,6 +72,7 @@ class ArithmeticOptimizer : public GraphOptimizer {
     bool remove_redundant_bitcast = true;
     bool remove_redundant_cast = true;
     bool remove_redundant_reshape = true;
+    bool reorder_cast_and_transpose = true;
 
     // Choose which arithmetic optimizer stages will be enabled for a given
     // optimization level by default.
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
index 43355ef945..02f76df025 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
@@ -97,12 +97,22 @@ class ArithmeticOptimizerTest : public GrapplerTest {
   }
 
   // Run ArithmeticOptimizer twice to make sure the rewrite is idempotent.
+  // Optionally run a constant folding pass before pruning.
   void OptimizeTwiceAndPrune(ArithmeticOptimizer* optimizer, GrapplerItem* item,
-                             GraphDef* output) {
+                             GraphDef* output, bool const_folding = false) {
     TF_EXPECT_OK(optimizer->Optimize(nullptr, *item, output));
+
     item->graph.Swap(output);
     output->Clear();
     TF_EXPECT_OK(optimizer->Optimize(nullptr, *item, output));
+
+    if (const_folding) {
+      item->graph.Swap(output);
+      output->Clear();
+      TF_EXPECT_OK(ConstantFolding(/*cpu_device=*/nullptr)
+                       .Optimize(nullptr, *item, output));
+    }
+
     item->graph.Swap(output);
     output->Clear();
     TF_EXPECT_OK(ModelPruner().Optimize(nullptr, *item, output));
@@ -127,6 +137,7 @@ class ArithmeticOptimizerTest : public GrapplerTest {
     options.remove_redundant_reshape = false;
     options.remove_negation = false;
     options.remove_logical_not = false;
+    options.reorder_cast_and_transpose = false;
     optimizer->options_ = options;
   }
 
@@ -179,6 +190,11 @@ class ArithmeticOptimizerTest : public GrapplerTest {
     optimizer->options_.remove_negation = true;
   }
 
+  void EnableOnlyReorderCastAndTranspose(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.reorder_cast_and_transpose = true;
+  }
+
   void EnableOnlyHoistCWiseUnaryChains(ArithmeticOptimizer* optimizer) {
     DisableAllStages(optimizer);
     optimizer->options_.hoist_cwise_unary_chains = true;
@@ -1540,6 +1556,7 @@ TEST_F(ArithmeticOptimizerTest, OptimizeCastMulTransposeConv) {
   //     =>
   //   Conv2D(Cast(Transpose(I)), W*S)
   tensorflow::Scope s = tensorflow::Scope::NewRootScope().WithDevice("/gpu:0");
+
   Output inputs =
       ops::Placeholder(s, DT_UINT8, ops::Placeholder::Shape({8, 28, 28, 3}));
   Output cast = ops::Cast(s, inputs, DT_FLOAT);
@@ -1557,28 +1574,28 @@ TEST_F(ArithmeticOptimizerTest, OptimizeCastMulTransposeConv) {
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
   GraphDef output;
-  TF_EXPECT_OK(ArithmeticOptimizer().Optimize(nullptr, item, &output));
-
-  // Run the optimizer twice to make sure the rewrite is idempotent.
-  item.graph.Swap(&output);
-  TF_EXPECT_OK(ArithmeticOptimizer().Optimize(nullptr, item, &output));
+  ArithmeticOptimizer optimizer;
+  OptimizeTwiceAndPrune(&optimizer, &item, &output, /*const_folding=*/true);
 
-  item.graph.Swap(&output);
-  TF_EXPECT_OK(
-      ConstantFolding(/*cpu_device=*/nullptr).Optimize(nullptr, item, &output));
+  NodeMap node_map(&output);
 
-  item.graph.Swap(&output);
-  TF_EXPECT_OK(ModelPruner().Optimize(nullptr, item, &output));
+  // Expected names for the optimized nodes.
+  const string p = "ArithmeticOptimizer/ReorderCastAndTranspose_";
+  const string optimized_cast_name = strings::StrCat(p, "float_Cast");
+  const string optimized_transpose_name = strings::StrCat(p, "uint8_Transpose");
 
-  NodeMap node_map(&output);
-  const NodeDef* inputs_node = CHECK_NOTNULL(node_map.GetNode("Placeholder"));
-  const NodeDef* transpose_node =
-      CHECK_NOTNULL(node_map.GetNode(OptimizedName("Transpose_uint8")));
-  const NodeDef* cast_node =
-      CHECK_NOTNULL(node_map.GetNode(OptimizedName("Cast_float")));
+  const NodeDef* inputs_node = node_map.GetNode("Placeholder");
+  const NodeDef* transpose_node = node_map.GetNode(optimized_transpose_name);
+  const NodeDef* cast_node = node_map.GetNode(optimized_cast_name);
   const NodeDef* weights_node =
-      CHECK_NOTNULL(node_map.GetNode(OptimizedName("weights_scaled_Conv2D")));
-  const NodeDef* conv_node = CHECK_NOTNULL(node_map.GetNode("Conv2D"));
+      node_map.GetNode(OptimizedName("weights_scaled_Conv2D"));
+  const NodeDef* conv_node = node_map.GetNode("Conv2D");
+
+  ASSERT_TRUE(inputs_node != nullptr);
+  ASSERT_TRUE(transpose_node != nullptr);
+  ASSERT_TRUE(cast_node != nullptr);
+  ASSERT_TRUE(weights_node != nullptr);
+  ASSERT_TRUE(conv_node != nullptr);
 
   EXPECT_EQ(output.node_size(), 7);
   EXPECT_EQ(transpose_node->input(0), inputs_node->name());
-- 
GitLab


From 28f8cf5cf2281682f70f4674192f9f31d68c5ee1 Mon Sep 17 00:00:00 2001
From: Justin Lebar <jlebar@google.com>
Date: Thu, 31 May 2018 15:25:10 -0700
Subject: [PATCH 122/610] [XLA] Check for identical backend configs in
 HloInstruction::Identical.

PiperOrigin-RevId: 198789495
---
 .../compiler/xla/service/hlo_instruction.h       |  4 ++++
 .../compiler/xla/service/hlo_instruction_test.cc | 16 ++++++++++++++++
 2 files changed, 20 insertions(+)

diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h
index 72b9d545ae..d47af6c018 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.h
+++ b/tensorflow/compiler/xla/service/hlo_instruction.h
@@ -776,6 +776,10 @@ class HloInstruction {
       }
     }
 
+    if (backend_config_ != other.backend_config_) {
+      return false;
+    }
+
     return IdenticalSlowPath(other, eq_computations);
   }
 
diff --git a/tensorflow/compiler/xla/service/hlo_instruction_test.cc b/tensorflow/compiler/xla/service/hlo_instruction_test.cc
index e91cf2076f..d1b6bc726d 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction_test.cc
@@ -1542,5 +1542,21 @@ ENTRY entry (param: s32[]) -> s32[] {
   }
 }
 
+TEST_F(HloInstructionTest, IdenticalAccountsForBackendConfig) {
+  const Shape shape = ShapeUtil::MakeShape(F32, {42});
+  HloComputation::Builder builder("test");
+  HloInstruction* p =
+      builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "p"));
+
+  HloInstruction* add1 = builder.AddInstruction(
+      HloInstruction::CreateBinary(shape, HloOpcode::kAdd, p, p));
+  HloInstruction* add2 = builder.AddInstruction(
+      HloInstruction::CreateBinary(shape, HloOpcode::kAdd, p, p));
+
+  EXPECT_TRUE(add1->Identical(*add2));
+  add1->set_raw_backend_config_string("abc");
+  EXPECT_FALSE(add1->Identical(*add2));
+}
+
 }  // namespace
 }  // namespace xla
-- 
GitLab


From 6ca9a881ebd9bd3c7d4432dbddd779dafc8f936b Mon Sep 17 00:00:00 2001
From: Yu-Cheng Ling <ycling@google.com>
Date: Thu, 31 May 2018 15:50:55 -0700
Subject: [PATCH 123/610] Refactoring: Extract CombineHashes function into a
 shared module

PiperOrigin-RevId: 198793295
---
 tensorflow/contrib/lite/op_resolver.h        |  4 ++--
 tensorflow/contrib/lite/toco/tflite/export.h | 21 +++++---------------
 tensorflow/contrib/lite/util.cc              | 10 ++++++++++
 tensorflow/contrib/lite/util.h               |  2 ++
 4 files changed, 19 insertions(+), 18 deletions(-)

diff --git a/tensorflow/contrib/lite/op_resolver.h b/tensorflow/contrib/lite/op_resolver.h
index 38a2706942..9d7e3f2085 100644
--- a/tensorflow/contrib/lite/op_resolver.h
+++ b/tensorflow/contrib/lite/op_resolver.h
@@ -18,6 +18,7 @@ limitations under the License.
 #include <unordered_map>
 #include "tensorflow/contrib/lite/context.h"
 #include "tensorflow/contrib/lite/schema/schema_generated.h"
+#include "tensorflow/contrib/lite/util.h"
 
 namespace tflite {
 
@@ -55,8 +56,7 @@ struct OperatorKeyHasher {
   size_t operator()(const T& x) const {
     size_t a = ValueHasher<typename T::first_type>()(x.first);
     size_t b = ValueHasher<typename T::second_type>()(x.second);
-    // Hash combinator used by TensorFlow core.
-    return a ^ (b + 0x9e3779b97f4a7800ULL + (a << 10) + (a >> 4));
+    return CombineHashes({a, b});
   }
 };
 }  // namespace op_resolver_hasher
diff --git a/tensorflow/contrib/lite/toco/tflite/export.h b/tensorflow/contrib/lite/toco/tflite/export.h
index 90abfb94d8..098d2163e6 100644
--- a/tensorflow/contrib/lite/toco/tflite/export.h
+++ b/tensorflow/contrib/lite/toco/tflite/export.h
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include "tensorflow/contrib/lite/toco/model.h"
 #include "tensorflow/contrib/lite/toco/tflite/operator.h"
+#include "tensorflow/contrib/lite/util.h"
 
 namespace toco {
 
@@ -72,22 +73,10 @@ struct OperatorKey {
 
   struct Hash {
     size_t operator()(const OperatorKey& key) const {
-      return CombineHashes({std::hash<size_t>()(static_cast<size_t>(key.type)),
-                            std::hash<std::string>()(key.custom_code),
-                            std::hash<int>()(key.version)});
-    }
-
-   private:
-    // TODO(ycling): Refactoring and extract this function into a common
-    // utility module.
-    static size_t CombineHashes(std::initializer_list<size_t> hashes) {
-      size_t result = 0;
-      // Hash combiner used by TensorFlow core.
-      for (size_t hash : hashes) {
-        result = result ^ (hash + 0x9e3779b97f4a7800ULL + (result << 10) +
-                           (result >> 4));
-      }
-      return result;
+      return ::tflite::CombineHashes(
+          {std::hash<size_t>()(static_cast<size_t>(key.type)),
+           std::hash<std::string>()(key.custom_code),
+           std::hash<int>()(key.version)});
     }
   };
 };
diff --git a/tensorflow/contrib/lite/util.cc b/tensorflow/contrib/lite/util.cc
index fb4af07d06..8ccb65c24f 100644
--- a/tensorflow/contrib/lite/util.cc
+++ b/tensorflow/contrib/lite/util.cc
@@ -38,4 +38,14 @@ bool EqualArrayAndTfLiteIntArray(const TfLiteIntArray* a, const int b_size,
   return true;
 }
 
+size_t CombineHashes(std::initializer_list<size_t> hashes) {
+  size_t result = 0;
+  // Hash combiner used by TensorFlow core.
+  for (size_t hash : hashes) {
+    result = result ^
+             (hash + 0x9e3779b97f4a7800ULL + (result << 10) + (result >> 4));
+  }
+  return result;
+}
+
 }  // namespace tflite
diff --git a/tensorflow/contrib/lite/util.h b/tensorflow/contrib/lite/util.h
index a34db35823..89d9b4f5cf 100644
--- a/tensorflow/contrib/lite/util.h
+++ b/tensorflow/contrib/lite/util.h
@@ -35,6 +35,8 @@ TfLiteIntArray* ConvertArrayToTfLiteIntArray(const int rank, const int* dims);
 bool EqualArrayAndTfLiteIntArray(const TfLiteIntArray* a, const int b_size,
                                  const int* b);
 
+size_t CombineHashes(std::initializer_list<size_t> hashes);
+
 }  // namespace tflite
 
 #endif  // TENSORFLOW_CONTRIB_LITE_UTIL_H_
-- 
GitLab


From b9b49d43e4d8a07b493416733f14214fb49e1e5d Mon Sep 17 00:00:00 2001
From: Frank Chen <frankchn@google.com>
Date: Thu, 31 May 2018 15:52:15 -0700
Subject: [PATCH 124/610] Add warning for gcs_config_ops

PiperOrigin-RevId: 198793502
---
 .../contrib/cloud/python/ops/gcs_config_ops.py       | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/tensorflow/contrib/cloud/python/ops/gcs_config_ops.py b/tensorflow/contrib/cloud/python/ops/gcs_config_ops.py
index 9ab124ae72..8c8c5acb31 100644
--- a/tensorflow/contrib/cloud/python/ops/gcs_config_ops.py
+++ b/tensorflow/contrib/cloud/python/ops/gcs_config_ops.py
@@ -53,6 +53,12 @@ class BlockCacheParams(object):
 class ConfigureGcsHook(training.SessionRunHook):
   """ConfigureGcsHook configures GCS when used with Estimator/TPUEstimator.
 
+  Warning: GCS `credentials` may be transmitted over the network unencrypted.
+  Please ensure that the network is trusted before using this function. For
+  users running code entirely within Google Cloud, your data is protected by
+  encryption in between data centers. For more information, please take a look
+  at https://cloud.google.com/security/encryption-in-transit/.
+
   Example:
 
   ```
@@ -135,6 +141,12 @@ class ConfigureGcsHook(training.SessionRunHook):
 def configure_gcs(session, credentials=None, block_cache=None, device=None):
   """Configures the GCS file system for a given a session.
 
+  Warning: GCS `credentials` may be transmitted over the network unencrypted.
+  Please ensure that the network is trusted before using this function. For
+  users running code entirely within Google Cloud, your data is protected by
+  encryption in between data centers. For more information, please take a look
+  at https://cloud.google.com/security/encryption-in-transit/.
+
   Args:
     session: A `tf.Session` session that should be used to configure the GCS
       file system.
-- 
GitLab


From 1316a49c3723d19e5312bbfd4eca237ea3c982c5 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 31 May 2018 16:01:23 -0700
Subject: [PATCH 125/610] Putting stubs for function shape inference interface

PiperOrigin-RevId: 198794845
---
 .../core/grappler/costs/graph_properties.cc   | 23 +++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/tensorflow/core/grappler/costs/graph_properties.cc b/tensorflow/core/grappler/costs/graph_properties.cc
index 203f7b09e3..5310c9ebdf 100644
--- a/tensorflow/core/grappler/costs/graph_properties.cc
+++ b/tensorflow/core/grappler/costs/graph_properties.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include <unordered_map>
 #include <unordered_set>
 #include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/function.pb.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/tensor_shape.pb.h"
 #include "tensorflow/core/framework/versions.pb.h"
@@ -425,6 +426,13 @@ class SymbolicShapeRefiner {
     return it->second.inference_context.get();
   }
 
+  // Forward the shapes from the function's fanin to the function body,
+  // then call PropagateShapes.
+  // Returns an error if 'node' is not a function node.
+  Status UpdateFunction(const NodeDef* node, bool* refined) {
+    return UpdateNode(node, refined);
+  }
+
   Status UpdateNode(const NodeDef* node, bool* refined) {
     NodeContext* node_context = GetNodeContext(node);
     if (node_context == nullptr) {
@@ -678,10 +686,16 @@ class SymbolicShapeRefiner {
     return true;
   }
 
+  Status AddFunction(const NodeDef* node) { return Status::OK(); }
+
   Status AddNode(const NodeDef* node) {
     NodeContext& node_ctx = node_to_context_[node];
     TF_RETURN_IF_ERROR(function_library_.LookUp(node->op(), &node_ctx.op_data));
 
+    if (node_ctx.op_data->is_function_op) {
+      TF_RETURN_IF_ERROR(AddFunction(node));
+    }
+
     TF_RETURN_IF_ERROR(InOutTypesForNode(*node, node_ctx.op_data->op_def,
                                          &node_ctx.input_types,
                                          &node_ctx.output_types));
@@ -1070,8 +1084,13 @@ Status GraphProperties::UpdateShapes(
     TF_RETURN_IF_ERROR(
         UpdateEnqueue(n, resource_handles, shape_refiner, new_shapes));
   } else {
-    // Rely on regular TF shape refinement for all the other nodes.
-    TF_RETURN_IF_ERROR(shape_refiner->UpdateNode(n, new_shapes));
+    auto c = shape_refiner->GetNodeContext(n);
+    if (c && c->op_data && c->op_data->is_function_op) {
+      TF_RETURN_IF_ERROR(shape_refiner->UpdateFunction(n, new_shapes));
+    } else {
+      // Rely on regular TF shape refinement for all the other nodes.
+      TF_RETURN_IF_ERROR(shape_refiner->UpdateNode(n, new_shapes));
+    }
   }
   return Status::OK();
 }
-- 
GitLab


From 922563620d7e1f50ffbceec027e6a7158d81c69f Mon Sep 17 00:00:00 2001
From: Ruoxin Sang <rxsang@google.com>
Date: Thu, 31 May 2018 16:01:50 -0700
Subject: [PATCH 126/610] Fix one comment in prefetch_autotuner_test.cc.

PiperOrigin-RevId: 198794897
---
 tensorflow/core/kernels/data/prefetch_autotuner_test.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/kernels/data/prefetch_autotuner_test.cc b/tensorflow/core/kernels/data/prefetch_autotuner_test.cc
index 2f573dfb35..29a8cc50cd 100644
--- a/tensorflow/core/kernels/data/prefetch_autotuner_test.cc
+++ b/tensorflow/core/kernels/data/prefetch_autotuner_test.cc
@@ -33,7 +33,7 @@ TEST(PrefetchAutotuner, Disabled) {
 TEST(PrefetchAutotuner, Enabled) {
   PrefetchAutotuner t(PrefetchAutotuner::kAutoTune);
   EXPECT_EQ(1, t.buffer_limit());
-  t.RecordConsumption(0);  // Expect buffer limit to increase.
+  t.RecordConsumption(0);  // Expect buffer limit to stay the same.
   EXPECT_EQ(1, t.buffer_limit());
   t.RecordConsumption(1);
   EXPECT_EQ(1, t.buffer_limit());
-- 
GitLab


From 05c050218b676227fbc0fd24e053f76380ac218e Mon Sep 17 00:00:00 2001
From: Justin Lebar <jlebar@google.com>
Date: Thu, 31 May 2018 16:02:26 -0700
Subject: [PATCH 127/610] [XLA:GPU] Specify cudnn conv algorithm via
 backend_config.

Gets rid of the tricky algorithm/use-tensor-cores operands to cudnn
convolution customcalls, using instead a backend_config.

PiperOrigin-RevId: 198794988
---
 tensorflow/compiler/xla/service/gpu/BUILD     | 10 +++++++
 .../xla/service/gpu/backend_configs.proto     | 27 +++++++++++++++++++
 .../gpu/cudnn_convolution_algorithm_picker.cc | 14 +++++-----
 .../xla/service/gpu/gpu_copy_insertion.cc     |  6 -----
 .../xla/service/gpu/ir_emission_utils.cc      | 15 ++---------
 .../xla/service/gpu/ir_emitter_unnested.cc    | 21 +++++++--------
 6 files changed, 55 insertions(+), 38 deletions(-)
 create mode 100644 tensorflow/compiler/xla/service/gpu/backend_configs.proto

diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD
index 2794930248..68297ad4ae 100644
--- a/tensorflow/compiler/xla/service/gpu/BUILD
+++ b/tensorflow/compiler/xla/service/gpu/BUILD
@@ -1,6 +1,8 @@
 # Description:
 #   GPU-specific components in XLA service implementation.
 
+load("//tensorflow/compiler/xla:xla.bzl", "xla_proto_library")
+
 licenses(["notice"])  # Apache 2.0
 
 package(default_visibility = [":friends"])
@@ -23,6 +25,11 @@ filegroup(
 
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
 
+xla_proto_library(
+    name = "backend_configs",
+    srcs = ["backend_configs.proto"],
+)
+
 cc_library(
     name = "gpu_constants",
     srcs = ["gpu_constants.cc"],
@@ -133,6 +140,7 @@ cc_library(
         "ir_emitter_unnested.h",
     ],
     deps = [
+        ":backend_configs",
         ":cudnn_convolution_runner",
         ":elemental_ir_emitter",
         ":gpu_constants",
@@ -266,6 +274,7 @@ cc_library(
         "while_thunk.h",
     ],
     deps = [
+        ":backend_configs",
         ":buffer_allocations",
         ":cudnn_convolution_runner",
         ":infeed_manager",
@@ -322,6 +331,7 @@ cc_library(
     srcs = ["cudnn_convolution_algorithm_picker.cc"],
     hdrs = ["cudnn_convolution_algorithm_picker.h"],
     deps = [
+        ":backend_configs",
         ":cudnn_convolution_runner",
         ":gpu_executable",
         ":ir_emission_utils",
diff --git a/tensorflow/compiler/xla/service/gpu/backend_configs.proto b/tensorflow/compiler/xla/service/gpu/backend_configs.proto
new file mode 100644
index 0000000000..640c6392b8
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/backend_configs.proto
@@ -0,0 +1,27 @@
+syntax = "proto3";
+
+package xla.gpu;
+
+// Backend configs for XLA:GPU.
+//
+// These are metadata that the GPU backend attaches to HloInstructions and later
+// uses during e.g. codegen.
+//
+// Remember that proto3 doesn't give clients a way to tell the difference
+// between a field not being present and a field having the default value.
+// Choose your defaults carefully.
+//
+// No guarantee is made about the stability of these protos.
+//
+// See HloInstruction::backend_config() for more info.
+
+// Backend config for a convolution that runs through cudnn.
+message CudnnConvBackendConfig {
+  // Opaque algorithm number of cudnn algorithm chosen for this conv.
+  int64 algorithm = 1;
+
+  // Whether we may use tensor cores when running this conv.  Even if this is
+  // true, cudnn may choose not to use tensor cores, e.g. because the GPU or
+  // selected algorithm doesn't support it.
+  bool tensor_ops_enabled = 2;
+}
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc
index 6a46bdb9b4..3dc98c4c93 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/xla/service/gpu/cudnn_convolution_algorithm_picker.h"
+#include "tensorflow/compiler/xla/service/gpu/backend_configs.pb.h"
 #include "tensorflow/compiler/xla/service/gpu/convolution_thunk.h"
 #include "tensorflow/compiler/xla/service/gpu/ir_emission_utils.h"
 #include "tensorflow/core/lib/gtl/optional.h"
@@ -316,21 +317,20 @@ StatusOr<bool> CudnnConvolutionAlgorithmPicker::RunOnInstruction(
   Shape new_call_shape =
       ShapeUtil::MakeTupleShape({instr->shape().tuple_shapes(0),
                                  ShapeUtil::MakeShape(U8, {scratch_bytes})});
-  HloInstruction* algorithm_hlo = computation->AddInstruction(
-      HloInstruction::CreateConstant(Literal::CreateR0<int64>(algorithm)));
-  HloInstruction* tensor_ops_enabled_hlo =
-      computation->AddInstruction(HloInstruction::CreateConstant(
-          Literal::CreateR0<bool>(tensor_ops_enabled)));
+
+  CudnnConvBackendConfig backend_config;
+  backend_config.set_algorithm(algorithm);
+  backend_config.set_tensor_ops_enabled(tensor_ops_enabled);
 
   HloInstruction* new_call =
       computation->AddInstruction(HloInstruction::CreateCustomCall(
           new_call_shape,
-          {instr->mutable_operand(0), instr->mutable_operand(1), algorithm_hlo,
-           tensor_ops_enabled_hlo},
+          {instr->mutable_operand(0), instr->mutable_operand(1)},
           instr->custom_call_target()));
   new_call->set_window(instr->window());
   new_call->set_convolution_dimension_numbers(
       instr->convolution_dimension_numbers());
+  TF_RETURN_IF_ERROR(new_call->set_backend_config(backend_config));
 
   // Repackage new_call so it has the same shape as the original call, namely
   // (conv_result, u8[0]).
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.cc b/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.cc
index d9560779f3..c5ccdd4a7d 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_copy_insertion.cc
@@ -78,12 +78,6 @@ StatusOr<bool> GpuCopyInsertion::Run(HloModule* module) {
       for (int64 i = 0; i < hlo->operand_count() - 2; ++i) {
         TF_RETURN_IF_ERROR(copy_operand_if_constant(i));
       }
-    } else if (IsCustomCallToDnnConvolution(*hlo)) {
-      // The last two arguments to a CUDNN convolution are two HLO constants for
-      // cudnn algorithm and tensor_ops_enabled flag, which shouldn't be copied.
-      for (int64 i = 0; i < hlo->operand_count() - 2; ++i) {
-        TF_RETURN_IF_ERROR(copy_operand_if_constant(i));
-      }
     } else if (ImplementedAsLibraryCall(*hlo) ||
                hlo->opcode() == HloOpcode::kCrossReplicaSum) {
       // For all other library calls and cross-replica-sum, materialize all the
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc
index 22e7150995..67890bfed1 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc
@@ -162,19 +162,8 @@ static HloInstruction* CreateCudnnConv(
   Shape call_shape =
       ShapeUtil::MakeTupleShape({shape, ShapeUtil::MakeShape(U8, {0})});
 
-  // Our CustomCall takes four arguments: The conv lhs and rhs, the cudnn
-  // algorithm to use, and a boolean indicating whether to use tensor cores.
-  //
-  // It's up to a later pass to choose the algorithm and decide whether to use
-  // tensor cores, so to indicate that we haven't yet made a choice, we speicfy
-  // -1 and false for those args.
-  HloInstruction* negative_one = computation->AddInstruction(
-      HloInstruction::CreateConstant(Literal::CreateR0<int64>(-1)));
-  HloInstruction* false_constant = computation->AddInstruction(
-      HloInstruction::CreateConstant(Literal::CreateR0<bool>(false)));
-  HloInstruction* custom_call =
-      computation->AddInstruction(HloInstruction::CreateCustomCall(
-          call_shape, {lhs, rhs, negative_one, false_constant}, call_target));
+  HloInstruction* custom_call = computation->AddInstruction(
+      HloInstruction::CreateCustomCall(call_shape, {lhs, rhs}, call_target));
   custom_call->set_window(window);
   custom_call->set_convolution_dimension_numbers(dnums);
   return custom_call;
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
index ae4e305b80..0f5c003341 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
@@ -32,6 +32,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/buffer_assignment.h"
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor.h"
+#include "tensorflow/compiler/xla/service/gpu/backend_configs.pb.h"
 #include "tensorflow/compiler/xla/service/gpu/conditional_thunk.h"
 #include "tensorflow/compiler/xla/service/gpu/convolution_thunk.h"
 #include "tensorflow/compiler/xla/service/gpu/copy_thunk.h"
@@ -423,15 +424,8 @@ Status IrEmitterUnnested::HandleCustomCall(HloInstruction* custom_call) {
     auto conv_result_slice = assn.GetUniqueSlice(custom_call, {0}).ValueOrDie();
     auto scratch_slice = assn.GetUniqueSlice(custom_call, {1}).ValueOrDie();
 
-    const HloInstruction* algorithm_inst = custom_call->operand(2);
-    CHECK(algorithm_inst->IsConstant()) << algorithm_inst->ToString();
-    int64 algorithm = algorithm_inst->literal().Get<int64>({});
-
-    const HloInstruction* tensor_ops_enabled_inst = custom_call->operand(3);
-    CHECK(tensor_ops_enabled_inst->IsConstant())
-        << tensor_ops_enabled_inst->ToString();
-    bool tensor_ops_enabled = tensor_ops_enabled_inst->literal().Get<bool>({});
-
+    TF_ASSIGN_OR_RETURN(CudnnConvBackendConfig backend_config,
+                        custom_call->backend_config<CudnnConvBackendConfig>());
     const auto& target = custom_call->custom_call_target();
     std::unique_ptr<ConvolutionThunk> thunk;
     if (target == kCudnnConvForwardCallTarget) {
@@ -446,7 +440,8 @@ Status IrEmitterUnnested::HandleCustomCall(HloInstruction* custom_call) {
           /*filter_shape=*/rhs_shape,
           /*output_shape=*/conv_result_shape,  //
           custom_call->window(), custom_call->convolution_dimension_numbers(),
-          algorithm, tensor_ops_enabled, custom_call);
+          backend_config.algorithm(), backend_config.tensor_ops_enabled(),
+          custom_call);
     } else if (target == kCudnnConvBackwardInputCallTarget) {
       thunk = MakeUnique<ConvolutionThunk>(
           CudnnConvKind::kBackwardInput,
@@ -459,7 +454,8 @@ Status IrEmitterUnnested::HandleCustomCall(HloInstruction* custom_call) {
           /*filter_shape=*/rhs_shape,
           /*output_shape=*/lhs_shape,  //
           custom_call->window(), custom_call->convolution_dimension_numbers(),
-          algorithm, tensor_ops_enabled, custom_call);
+          backend_config.algorithm(), backend_config.tensor_ops_enabled(),
+          custom_call);
     } else if (target == kCudnnConvBackwardFilterCallTarget) {
       thunk = MakeUnique<ConvolutionThunk>(
           CudnnConvKind::kBackwardFilter,
@@ -472,7 +468,8 @@ Status IrEmitterUnnested::HandleCustomCall(HloInstruction* custom_call) {
           /*filter_shape=*/conv_result_shape,
           /*output_shape=*/rhs_shape,  //
           custom_call->window(), custom_call->convolution_dimension_numbers(),
-          algorithm, tensor_ops_enabled, custom_call);
+          backend_config.algorithm(), backend_config.tensor_ops_enabled(),
+          custom_call);
     } else {
       LOG(FATAL) << "Unexpected custom call target: "
                  << custom_call->custom_call_target();
-- 
GitLab


From 2c38e7c770c3b4a32a123452ced31e24a0297342 Mon Sep 17 00:00:00 2001
From: Saurabh Saxena <srbs@google.com>
Date: Thu, 31 May 2018 16:06:15 -0700
Subject: [PATCH 128/610] Add utility for converting FunctionDef to GraphDef
 and _FuncGraph.

PiperOrigin-RevId: 198795625
---
 tensorflow/python/BUILD                       |  32 +++
 .../python/framework/function_def_to_graph.py | 189 ++++++++++++++++++
 .../framework/function_def_to_graph_test.py   | 184 +++++++++++++++++
 3 files changed, 405 insertions(+)
 create mode 100644 tensorflow/python/framework/function_def_to_graph.py
 create mode 100644 tensorflow/python/framework/function_def_to_graph_test.py

diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index b15c5291f5..569403fa9a 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -717,6 +717,38 @@ py_library(
     ],
 )
 
+py_library(
+    name = "function_def_to_graph",
+    srcs = ["framework/function_def_to_graph.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":framework",
+        ":function",
+        ":op_def_registry",
+        ":tensor_shape",
+        ":versions",
+        "//tensorflow/core:protos_all_py",
+    ],
+)
+
+py_test(
+    name = "function_def_to_graph_test",
+    size = "small",
+    srcs = ["framework/function_def_to_graph_test.py"],
+    srcs_version = "PY2AND3",
+    tags = ["no_pip"],
+    deps = [
+        ":array_ops",
+        ":client_testlib",
+        ":dtypes",
+        ":framework_ops",
+        ":function_def_to_graph",
+        ":graph_to_function_def",
+        ":math_ops",
+        ":test_ops",
+    ],
+)
+
 py_library(
     name = "graph_util",
     srcs = [
diff --git a/tensorflow/python/framework/function_def_to_graph.py b/tensorflow/python/framework/function_def_to_graph.py
new file mode 100644
index 0000000000..4fecc41343
--- /dev/null
+++ b/tensorflow/python/framework/function_def_to_graph.py
@@ -0,0 +1,189 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+"""Utility to convert FunctionDef to GraphDef and Graph."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.core.framework import graph_pb2
+from tensorflow.core.framework import types_pb2
+from tensorflow.core.framework import versions_pb2
+from tensorflow.python.framework import function
+from tensorflow.python.framework import importer
+from tensorflow.python.framework import op_def_registry
+from tensorflow.python.framework import versions
+
+
+def function_def_to_graph(fdef, input_shapes=None):
+  """Converts a FunctionDef to a function._FuncGraph (sub-class Graph).
+
+  The returned _FuncGraph's `name`, `inputs` and `outputs` fields will be set.
+  The input tensors are represented as placeholders.
+
+  Note: `_FuncGraph._captured` is not set here and may be set by the
+  caller.
+
+  Args:
+    fdef: FunctionDef.
+    input_shapes: Optional. A list of TensorShape objects of the shapes of
+      function inputs. If specified, its length must match length of
+      `fdef.signature.input_arg`. If a shape is None, the corresponding input
+      placeholder will have unknown shape.
+
+  Returns:
+    A _FuncGraph.
+  """
+  func_graph = function._FuncGraph(fdef.signature.name, capture_by_value=False)  # pylint: disable=protected-access
+  graph_def, nested_to_flat_tensor_name = function_def_to_graph_def(
+      fdef, input_shapes)
+
+  with func_graph.as_default():
+    # Add all function nodes to the graph.
+    importer.import_graph_def(graph_def, name="")
+
+    # Initialize fields specific to _FuncGraph.
+
+    # inputs
+    input_tensor_names = [
+        nested_to_flat_tensor_name[arg.name] for arg in fdef.signature.input_arg
+    ]
+    func_graph.inputs = [
+        func_graph.get_tensor_by_name(name) for name in input_tensor_names
+    ]
+
+    # outputs
+    output_tensor_names = [
+        nested_to_flat_tensor_name[fdef.ret[arg.name]]
+        for arg in fdef.signature.output_arg
+    ]
+    func_graph.outputs = [
+        func_graph.get_tensor_by_name(name) for name in output_tensor_names
+    ]
+
+  return func_graph
+
+
+def function_def_to_graph_def(fdef, input_shapes=None):
+  """Convert a FunctionDef to a GraphDef.
+
+  Steps:
+  1. Creates placeholder nodes corresponding to inputs in
+     `FunctionDef.signature.input_arg`.
+  2. Adds NodeDefs in `FunctionDef.node_def` to `GraphDef.node`.
+  3. Renames inputs of all nodes to use the convention of GraphDef instead of
+     FunctionDef. See comment on `FunctionDef.node_def` on how the tensor naming
+     in FunctionDefs is different from GraphDefs.
+
+  Args:
+    fdef: FunctionDef.
+    input_shapes: Optional. A list of TensorShape objects of the shapes of
+      function inputs. If specified, its length must match length of
+      `fdef.signature.input_arg`. If a shape is None, the corresponding input
+      placeholder will have unknown shape.
+
+  Returns:
+    A tuple of (GraphDef, dict<string, string>). The dict contains a mapping
+    from nested tensor names (in FunctionDef) to flattened names (in GraphDef).
+
+  Raises:
+    ValueError: If the length of input_shapes does not match the number of
+      input_args or if the FunctionDef is invalid.
+  """
+  graph_def = graph_pb2.GraphDef()
+  graph_def.versions.CopyFrom(
+      versions_pb2.VersionDef(
+          producer=versions.GRAPH_DEF_VERSION,
+          min_consumer=versions.GRAPH_DEF_VERSION_MIN_CONSUMER))
+
+  if input_shapes and len(input_shapes) != len(fdef.signature.input_arg):
+    raise ValueError("Length of input_shapes must match the number of " +
+                     "input_args. len(input_shapes): {} len(input_arg): {}".
+                     format(len(input_shapes), len(fdef.signature.input_arg)))
+
+  # 1. Create placeholders for input nodes.
+  for i, arg_def in enumerate(fdef.signature.input_arg):
+    node_def = graph_def.node.add()
+    node_def.name = arg_def.name
+    node_def.op = "Placeholder"
+    node_def.attr["dtype"].type = arg_def.type
+    if input_shapes and input_shapes[i] is not None:
+      node_def.attr["shape"].shape.CopyFrom(input_shapes[i].as_proto())
+
+  # 2. Copy all body NodeDefs to the GraphDef.
+  graph_def.node.extend(fdef.node_def)
+
+  # 3. Perform the renaming.
+
+  # Build the tensor name mapping then flatten the tensor names.
+  # See comment on `FunctionDef.node_def` on how the tensor naming in
+  # FunctionDefs is different from GraphDefs.
+  nested_to_flat_tensor_name = {}
+
+  for arg_def in fdef.signature.input_arg:
+    nested_to_flat_tensor_name[arg_def.name] = "{}:0".format(arg_def.name)
+
+  for node_def in fdef.node_def:
+    op_def = op_def_registry.get_registered_ops().get(node_def.op)
+    if not op_def:
+      # TODO(b/80470245): Support functions which refer to other functions.
+      raise NotImplementedError(
+          "No op registered for {},".format(node_def.op) +
+          " it may be a function. function_def_to_graph_def " +
+          "currently does not support converting functions with " +
+          "references to other graph functions.")
+
+    for attr in op_def.attr:
+      if attr.type in ("func", "list(func)"):
+        # TODO(b/80470245): Support functions which refer to other functions.
+        raise NotImplementedError("Unsupported attr {} ".format(attr.name) +
+                                  " with type {}".format(attr.type) +
+                                  " in op {}. ".format(op_def.name) +
+                                  "function_def_to_graph_def currently does " +
+                                  "not support converting functions with " +
+                                  "references to other graph functions.")
+
+    # Iterate over output_args in op_def to build the map.
+    # Index of the output tensor in the flattened list of *all* output
+    # tensors of the op.
+    flattened_index = 0
+    for arg_def in op_def.output_arg:
+      num_args = _get_num_args(arg_def, node_def)
+      for i in range(num_args):
+        # Map tensor names from "node_name:output_arg_name:index" to
+        # "node_name:flattened_index".
+        nested_name = "{}:{}:{}".format(node_def.name, arg_def.name, i)
+        flat_name = "{}:{}".format(node_def.name, flattened_index)
+        nested_to_flat_tensor_name[nested_name] = flat_name
+        flattened_index += 1
+
+  # Update inputs of all nodes in graph.
+  for node_def in graph_def.node:
+    for i in range(len(node_def.input)):
+      node_def.input[i] = nested_to_flat_tensor_name[node_def.input[i]]
+
+  return graph_def, nested_to_flat_tensor_name
+
+
+# Based on implementation in core/framework/node_def_util.cc::ComputeArgRange.
+def _get_num_args(arg_def, node_def):
+  if arg_def.number_attr:
+    return node_def.attr[arg_def.number_attr].i
+  elif arg_def.type_list_attr:
+    return len(node_def.attr[arg_def.type_list_attr].list.type)
+  elif arg_def.type_attr or arg_def.type != types_pb2.DT_INVALID:
+    return 1
+  else:
+    raise ValueError("Invalid arg_def:\n\n{}".format(str(arg_def)))
diff --git a/tensorflow/python/framework/function_def_to_graph_test.py b/tensorflow/python/framework/function_def_to_graph_test.py
new file mode 100644
index 0000000000..0f4e6ef54f
--- /dev/null
+++ b/tensorflow/python/framework/function_def_to_graph_test.py
@@ -0,0 +1,184 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for tensorflow.python.framework.function_def_to_graph."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import function_def_to_graph
+from tensorflow.python.framework import graph_to_function_def
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+
+class FunctionDefToGraphTest(test.TestCase):
+
+  def _build_function_def(self):
+    with ops.Graph().as_default() as g:
+      # Inputs
+      x = array_ops.placeholder(dtypes.float32, name="x")
+      y = array_ops.placeholder(dtypes.float32, name="y")
+
+      # Outputs
+      sum_squares = math_ops.add_n(
+          [math_ops.pow(x, 2), math_ops.pow(y, 2)], name="sum_squares")
+      sum_cubes = math_ops.add_n(
+          [math_ops.pow(x, 3), math_ops.pow(y, 3)], name="sum_cubes")
+    fdef = graph_to_function_def.graph_to_function_def(
+        g,
+        g.get_operations(),
+        [x, y],  # Inputs
+        [sum_squares, sum_cubes])  # Outputs.
+    fdef.signature.name = "_whats_in_a_name"
+    return fdef
+
+  def testInputsAndOutputs(self):
+    fdef = self._build_function_def()
+    g = function_def_to_graph.function_def_to_graph(fdef)
+    self.assertEqual(g.name, "_whats_in_a_name")
+    with self.test_session(graph=g) as sess:
+      inputs = sess.run(g.inputs, feed_dict={"x:0": 2, "y:0": 3})
+      self.assertSequenceEqual(inputs, [2.0, 3.0])
+      outputs = sess.run(g.outputs, feed_dict={"x:0": 2, "y:0": 3})
+      self.assertSequenceEqual(outputs, [13.0, 35.0])
+
+  def testShapes(self):
+    fdef = self._build_function_def()
+
+    g = function_def_to_graph.function_def_to_graph(fdef)
+    self.assertIsNone(g.inputs[0].shape.dims)  # Unknown dims.
+    self.assertIsNone(g.inputs[1].shape.dims)  # Unknown dims.
+    self.assertIsNone(g.outputs[0].shape.dims)  # Unknown dims.
+    self.assertIsNone(g.outputs[1].shape.dims)  # Unknown dims.
+
+    g = function_def_to_graph.function_def_to_graph(
+        fdef, input_shapes=[tensor_shape.vector(5),
+                            tensor_shape.vector(5)])
+    self.assertSequenceEqual(g.inputs[0].shape.dims, [5])
+    self.assertSequenceEqual(g.inputs[1].shape.dims, [5])
+    self.assertSequenceEqual(g.outputs[0].shape.dims, [5])
+    self.assertSequenceEqual(g.outputs[1].shape.dims, [5])
+
+    g = function_def_to_graph.function_def_to_graph(
+        fdef, input_shapes=[None, tensor_shape.matrix(5, 7)])
+    # A `None` entry leaves that input's shape unknown.
+    self.assertIsNone(g.inputs[0].shape.dims)
+    self.assertSequenceEqual(g.inputs[1].shape.dims, [5, 7])
+    self.assertSequenceEqual(g.outputs[0].shape.dims, [5, 7])
+    self.assertSequenceEqual(g.outputs[1].shape.dims, [5, 7])
+
+    # Should raise a ValueError if the length of input_shapes does not match
+    # the number of input args in FunctionDef.signature.input_arg.
+    with self.assertRaises(ValueError):
+      function_def_to_graph.function_def_to_graph(
+          fdef, input_shapes=[tensor_shape.matrix(5, 7)])
+
+
+class FunctionDefToGraphDefTest(test.TestCase):
+
+  def _build_function_def(self):
+    with ops.Graph().as_default() as g:
+      # Inputs:    x    y    z
+      #            |\   |   /
+      #            | \  |  /
+      #            |  foo_1     list_output
+      #            |   / \       /       \
+      #            | d_1 e_1  a:1        a:0
+      #            |  \   |   /           |
+      #            |   \  |  /            |
+      #            |    foo_2             |
+      #            |     / \              |
+      # Outputs:   x   d_2 e_2           a:0
+
+      x = array_ops.placeholder(dtypes.float32, name="x")
+      y = array_ops.placeholder(dtypes.int32, name="y")
+      z = array_ops.placeholder(dtypes.int32, name="z")
+
+      d_1, e_1 = test_ops._op_def_lib.apply_op(
+          "Foo1", name="foo_1", a=x, b=y, c=z)
+
+      list_output0, list_output1 = test_ops.list_output(
+          T=[dtypes.int32, dtypes.int32], name="list_output")
+
+      d_2, e_2 = test_ops.foo1(a=d_1, b=e_1, c=list_output1, name="foo_2")
+
+    fdef = graph_to_function_def.graph_to_function_def(
+        g,
+        g.get_operations(),
+        [x, y, z],  # Inputs
+        [x, d_2, e_2, list_output0])  # Outputs.
+
+    # Assert that the FunctionDef was correctly built.
+    assert len(fdef.node_def) == 3  # 2 Foo1 nodes and 1 ListOutput node.
+    assert fdef.node_def[0].op == "Foo1"
+    assert fdef.node_def[0].input == ["x", "y", "z"]
+    assert fdef.node_def[1].op == "ListOutput"
+    assert not fdef.node_def[1].input
+    assert fdef.node_def[2].op == "Foo1"
+    assert fdef.node_def[2].input == [
+        "foo_1:d:0", "foo_1:e:0", "list_output:a:1"
+    ]
+    return fdef
+
+  def testTensorNames(self):
+    fdef = self._build_function_def()
+    g, tensor_name_map = function_def_to_graph.function_def_to_graph_def(fdef)
+
+    # Verify that inputs of body nodes are correctly renamed.
+    # foo_1
+    self.assertSequenceEqual(g.node[3].input, ["x:0", "y:0", "z:0"])
+    # foo_2
+    self.assertSequenceEqual(g.node[5].input,
+                             ["foo_1:0", "foo_1:1", "list_output:1"])
+
+    # Verify that the `tensor_name_map` has the correct mapping.
+    self.assertDictEqual(
+        tensor_name_map, {
+            "x": "x:0",
+            "y": "y:0",
+            "z": "z:0",
+            "foo_1:d:0": "foo_1:0",
+            "foo_1:e:0": "foo_1:1",
+            "list_output:a:0": "list_output:0",
+            "list_output:a:1": "list_output:1",
+            "foo_2:d:0": "foo_2:0",
+            "foo_2:e:0": "foo_2:1",
+        })
+
+  def testShapes(self):
+    fdef = self._build_function_def()
+    g, _ = function_def_to_graph.function_def_to_graph_def(
+        fdef,
+        input_shapes=[tensor_shape.scalar(),
+                      tensor_shape.vector(5), None])
+    self.assertIn("shape", g.node[0].attr)  # Scalar shape expected.
+    self.assertSequenceEqual(
+        tensor_shape.TensorShape(g.node[0].attr["shape"].shape).as_list(), [])
+    self.assertFalse(g.node[0].attr["shape"].shape.unknown_rank)
+    self.assertIn("shape", g.node[1].attr)  # Vector-of-5 shape expected.
+    self.assertSequenceEqual(
+        tensor_shape.TensorShape(g.node[1].attr["shape"].shape).as_list(), [5])
+    self.assertFalse(g.node[1].attr["shape"].shape.unknown_rank)
+    self.assertNotIn("shape", g.node[2].attr)  # `None` shape: no attr set.
+
+
+if __name__ == "__main__":
+  test.main()
-- 
GitLab


From cd37c5277fa7cf1bb1e1c7ace3922109f6fc7fc2 Mon Sep 17 00:00:00 2001
From: Nupur Garg <nupurgarg@google.com>
Date: Thu, 31 May 2018 16:07:02 -0700
Subject: [PATCH 129/610] Fixed Python API.

PiperOrigin-RevId: 198795738
---
 tensorflow/contrib/lite/python/lite.py         | 14 +++++++-------
 tensorflow/contrib/lite/python/lite_test.py    | 18 +++++++++---------
 .../contrib/lite/python/tflite_convert.py      |  2 +-
 .../contrib/lite/toco/g3doc/python_api.md      |  2 +-
 4 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/tensorflow/contrib/lite/python/lite.py b/tensorflow/contrib/lite/python/lite.py
index d55d8a6f6c..253b5eadf3 100644
--- a/tensorflow/contrib/lite/python/lite.py
+++ b/tensorflow/contrib/lite/python/lite.py
@@ -101,7 +101,7 @@ class TocoConverter(object):
     open("converted_model.tflite", "wb").write(tflite_model)
 
     # Converting a GraphDef from file.
-    converter = lite.TocoConverter.from_flatbuffer_file(
+    converter = lite.TocoConverter.from_frozen_graph(
       graph_def_file, input_arrays, output_arrays)
     tflite_model = converter.convert()
     open("converted_model.tflite", "wb").write(tflite_model)
@@ -151,12 +151,12 @@ class TocoConverter(object):
     return cls(graph_def, input_tensors, output_tensors)
 
   @classmethod
-  def from_flatbuffer_file(cls,
-                           graph_def_file,
-                           input_arrays,
-                           output_arrays,
-                           input_shapes=None):
-    """Creates a TocoConverter class from a file containing a GraphDef.
+  def from_frozen_graph(cls,
+                        graph_def_file,
+                        input_arrays,
+                        output_arrays,
+                        input_shapes=None):
+    """Creates a TocoConverter class from a file containing a frozen GraphDef.
 
     Args:
       graph_def_file: Full filepath of file containing TensorFlow GraphDef.
diff --git a/tensorflow/contrib/lite/python/lite_test.py b/tensorflow/contrib/lite/python/lite_test.py
index 1b0cdb90ce..53d1878293 100644
--- a/tensorflow/contrib/lite/python/lite_test.py
+++ b/tensorflow/contrib/lite/python/lite_test.py
@@ -295,8 +295,8 @@ class FromFlatbufferFile(test_util.TensorFlowTestCase):
     write_graph(sess.graph_def, '', graph_def_file, False)
 
     # Convert model and ensure model is not None.
-    converter = lite.TocoConverter.from_flatbuffer_file(
-        graph_def_file, ['Placeholder'], ['add'])
+    converter = lite.TocoConverter.from_frozen_graph(graph_def_file,
+                                                     ['Placeholder'], ['add'])
     tflite_model = converter.convert()
     self.assertTrue(tflite_model)
 
@@ -329,7 +329,7 @@ class FromFlatbufferFile(test_util.TensorFlowTestCase):
     write_graph(sess.graph_def, '', graph_def_file, False)
 
     # Convert model and ensure model is not None.
-    converter = lite.TocoConverter.from_flatbuffer_file(
+    converter = lite.TocoConverter.from_frozen_graph(
         graph_def_file, ['Placeholder'], ['add'],
         input_shapes={'Placeholder': [1, 16, 16, 3]})
     tflite_model = converter.convert()
@@ -357,8 +357,8 @@ class FromFlatbufferFile(test_util.TensorFlowTestCase):
 
     # Ensure the graph with variables cannot be converted.
     with self.assertRaises(ValueError) as error:
-      lite.TocoConverter.from_flatbuffer_file(graph_def_file, ['Placeholder'],
-                                              ['add'])
+      lite.TocoConverter.from_frozen_graph(graph_def_file, ['Placeholder'],
+                                           ['add'])
     self.assertEqual('Please freeze the graph using freeze_graph.py',
                      str(error.exception))
 
@@ -373,8 +373,8 @@ class FromFlatbufferFile(test_util.TensorFlowTestCase):
     write_graph(sess.graph_def, '', graph_def_file, True)
 
     # Convert model and ensure model is not None.
-    converter = lite.TocoConverter.from_flatbuffer_file(
-        graph_def_file, ['Placeholder'], ['add'])
+    converter = lite.TocoConverter.from_frozen_graph(graph_def_file,
+                                                     ['Placeholder'], ['add'])
     tflite_model = converter.convert()
     self.assertTrue(tflite_model)
 
@@ -404,8 +404,8 @@ class FromFlatbufferFile(test_util.TensorFlowTestCase):
 
     # Attempts to convert the invalid model.
     with self.assertRaises(ValueError) as error:
-      lite.TocoConverter.from_flatbuffer_file(graph_def_file, ['Placeholder'],
-                                              ['add'])
+      lite.TocoConverter.from_frozen_graph(graph_def_file, ['Placeholder'],
+                                           ['add'])
     self.assertEqual(
         'Unable to parse input file \'{}\'.'.format(graph_def_file),
         str(error.exception))
diff --git a/tensorflow/contrib/lite/python/tflite_convert.py b/tensorflow/contrib/lite/python/tflite_convert.py
index 38068bee08..337f05785e 100644
--- a/tensorflow/contrib/lite/python/tflite_convert.py
+++ b/tensorflow/contrib/lite/python/tflite_convert.py
@@ -70,7 +70,7 @@ def _get_toco_converter(flags):
 
   # Create TocoConverter.
   if flags.graph_def_file:
-    converter_fn = lite.TocoConverter.from_flatbuffer_file
+    converter_fn = lite.TocoConverter.from_frozen_graph
     converter_kwargs["graph_def_file"] = flags.graph_def_file
   elif flags.saved_model_dir:
     converter_fn = lite.TocoConverter.from_saved_model
diff --git a/tensorflow/contrib/lite/toco/g3doc/python_api.md b/tensorflow/contrib/lite/toco/g3doc/python_api.md
index e5f6a0b500..5071361bfd 100644
--- a/tensorflow/contrib/lite/toco/g3doc/python_api.md
+++ b/tensorflow/contrib/lite/toco/g3doc/python_api.md
@@ -87,7 +87,7 @@ graph_def_file = "/path/to/Downloads/mobilenet_v1_1.0_224/frozen_graph.pb"
 input_arrays = ["input"]
 output_arrays = ["MobilenetV1/Predictions/Softmax"]
 
-converter = tf.contrib.lite.TocoConverter.from_flatbuffer_file(
+converter = tf.contrib.lite.TocoConverter.from_frozen_graph(
   graph_def_file, input_arrays, output_arrays)
 tflite_model = converter.convert()
 open("converted_model.tflite", "wb").write(tflite_model)
-- 
GitLab


From 3e3dd647d17b5136d1afb8e4b5c1f39986684768 Mon Sep 17 00:00:00 2001
From: Brennan Saeta <saeta@google.com>
Date: Thu, 31 May 2018 16:15:45 -0700
Subject: [PATCH 130/610] [tf.data] Mark DebugString() as const.

By marking DebugString() as const we can make some error messages more descriptive. Because DatasetIterator marks the return value of the dataset() function const, a non-const DebugString() could not be called on it.

PiperOrigin-RevId: 198796894
---
 tensorflow/contrib/data/kernels/csv_dataset_op.cc         | 2 +-
 .../data/kernels/directed_interleave_dataset_op.cc        | 2 +-
 .../contrib/data/kernels/ignore_errors_dataset_op.cc      | 4 +++-
 tensorflow/contrib/data/kernels/threadpool_dataset_op.cc  | 4 +++-
 tensorflow/contrib/data/kernels/unique_dataset_op.cc      | 2 +-
 tensorflow/contrib/kafka/kernels/kafka_dataset_ops.cc     | 2 +-
 tensorflow/core/framework/dataset.h                       | 2 +-
 tensorflow/core/kernels/data/batch_dataset_op.cc          | 2 +-
 tensorflow/core/kernels/data/cache_dataset_ops.cc         | 8 ++++++--
 tensorflow/core/kernels/data/concatenate_dataset_op.cc    | 4 +++-
 .../core/kernels/data/dense_to_sparse_batch_dataset_op.cc | 2 +-
 tensorflow/core/kernels/data/filter_dataset_op.cc         | 2 +-
 tensorflow/core/kernels/data/flat_map_dataset_op.cc       | 2 +-
 tensorflow/core/kernels/data/generator_dataset_op.cc      | 4 +++-
 .../core/kernels/data/group_by_reducer_dataset_op.cc      | 4 +++-
 .../core/kernels/data/group_by_window_dataset_op.cc       | 4 +++-
 tensorflow/core/kernels/data/interleave_dataset_op.cc     | 4 +++-
 tensorflow/core/kernels/data/map_and_batch_dataset_op.cc  | 4 +++-
 tensorflow/core/kernels/data/map_dataset_op.cc            | 2 +-
 tensorflow/core/kernels/data/padded_batch_dataset_op.cc   | 2 +-
 .../core/kernels/data/parallel_interleave_dataset_op.cc   | 2 +-
 tensorflow/core/kernels/data/parallel_map_dataset_op.cc   | 4 +++-
 tensorflow/core/kernels/data/prefetch_dataset_op.cc       | 2 +-
 tensorflow/core/kernels/data/random_dataset_op.cc         | 2 +-
 tensorflow/core/kernels/data/range_dataset_op.cc          | 2 +-
 tensorflow/core/kernels/data/reader_dataset_ops.cc        | 6 +++---
 tensorflow/core/kernels/data/repeat_dataset_op.cc         | 2 +-
 tensorflow/core/kernels/data/scan_dataset_op.cc           | 2 +-
 tensorflow/core/kernels/data/shuffle_dataset_op.cc        | 6 +++---
 tensorflow/core/kernels/data/skip_dataset_op.cc           | 2 +-
 tensorflow/core/kernels/data/slide_dataset_op.cc          | 2 +-
 .../core/kernels/data/sparse_tensor_slice_dataset_op.cc   | 2 +-
 tensorflow/core/kernels/data/sql_dataset_ops.cc           | 2 +-
 .../core/kernels/data/stats_aggregator_dataset_op.cc      | 2 +-
 tensorflow/core/kernels/data/stats_dataset_ops.cc         | 6 ++++--
 tensorflow/core/kernels/data/take_dataset_op.cc           | 2 +-
 tensorflow/core/kernels/data/tensor_dataset_op.cc         | 2 +-
 tensorflow/core/kernels/data/tensor_queue_dataset_op.cc   | 2 +-
 tensorflow/core/kernels/data/tensor_slice_dataset_op.cc   | 4 +++-
 tensorflow/core/kernels/data/unbatch_dataset_op.cc        | 2 +-
 tensorflow/core/kernels/data/window_dataset.cc            | 2 +-
 tensorflow/core/kernels/data/zip_dataset_op.cc            | 2 +-
 42 files changed, 74 insertions(+), 48 deletions(-)

diff --git a/tensorflow/contrib/data/kernels/csv_dataset_op.cc b/tensorflow/contrib/data/kernels/csv_dataset_op.cc
index b16e66258b..97cc0bc6c9 100644
--- a/tensorflow/contrib/data/kernels/csv_dataset_op.cc
+++ b/tensorflow/contrib/data/kernels/csv_dataset_op.cc
@@ -145,7 +145,7 @@ class CSVDatasetOp : public DatasetOpKernel {
       return output_shapes_;
     }
 
-    string DebugString() override { return "CSVDatasetOp::Dataset"; }
+    string DebugString() const override { return "CSVDatasetOp::Dataset"; }
 
    protected:
     Status AsGraphDefInternal(DatasetGraphDefBuilder* b,
diff --git a/tensorflow/contrib/data/kernels/directed_interleave_dataset_op.cc b/tensorflow/contrib/data/kernels/directed_interleave_dataset_op.cc
index bdff379bfa..6a12ca06f4 100644
--- a/tensorflow/contrib/data/kernels/directed_interleave_dataset_op.cc
+++ b/tensorflow/contrib/data/kernels/directed_interleave_dataset_op.cc
@@ -105,7 +105,7 @@ class DirectedInterleaveDatasetOp : public DatasetOpKernel {
       return output_shapes_;
     }
 
-    string DebugString() override {
+    string DebugString() const override {
       return strings::StrCat("DirectedInterleaveDatasetOp::Dataset");
     }
 
diff --git a/tensorflow/contrib/data/kernels/ignore_errors_dataset_op.cc b/tensorflow/contrib/data/kernels/ignore_errors_dataset_op.cc
index c3759b68d9..bbec50681c 100644
--- a/tensorflow/contrib/data/kernels/ignore_errors_dataset_op.cc
+++ b/tensorflow/contrib/data/kernels/ignore_errors_dataset_op.cc
@@ -57,7 +57,9 @@ class IgnoreErrorsDatasetOp : public UnaryDatasetOpKernel {
       return input_->output_shapes();
     }
 
-    string DebugString() override { return "IgnoreErrorsDatasetOp::Dataset"; }
+    string DebugString() const override {
+      return "IgnoreErrorsDatasetOp::Dataset";
+    }
 
    protected:
     Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
diff --git a/tensorflow/contrib/data/kernels/threadpool_dataset_op.cc b/tensorflow/contrib/data/kernels/threadpool_dataset_op.cc
index 7cf01f6a07..3dfc3741c2 100644
--- a/tensorflow/contrib/data/kernels/threadpool_dataset_op.cc
+++ b/tensorflow/contrib/data/kernels/threadpool_dataset_op.cc
@@ -140,7 +140,9 @@ class ThreadPoolDatasetOp : public UnaryDatasetOpKernel {
       return input_->output_shapes();
     }
 
-    string DebugString() override { return "ThreadPoolDatasetOp::Dataset"; }
+    string DebugString() const override {
+      return "ThreadPoolDatasetOp::Dataset";
+    }
 
    protected:
     Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
diff --git a/tensorflow/contrib/data/kernels/unique_dataset_op.cc b/tensorflow/contrib/data/kernels/unique_dataset_op.cc
index 652913d6b2..67c237799c 100644
--- a/tensorflow/contrib/data/kernels/unique_dataset_op.cc
+++ b/tensorflow/contrib/data/kernels/unique_dataset_op.cc
@@ -70,7 +70,7 @@ class UniqueDatasetOp : public UnaryDatasetOpKernel {
       return input_->output_shapes();
     }
 
-    string DebugString() override {
+    string DebugString() const override {
       return strings::StrCat("UniqueDatasetOp::Dataset");
     }
 
diff --git a/tensorflow/contrib/kafka/kernels/kafka_dataset_ops.cc b/tensorflow/contrib/kafka/kernels/kafka_dataset_ops.cc
index 7b08cfa095..2638b25ec4 100644
--- a/tensorflow/contrib/kafka/kernels/kafka_dataset_ops.cc
+++ b/tensorflow/contrib/kafka/kernels/kafka_dataset_ops.cc
@@ -81,7 +81,7 @@ class KafkaDatasetOp : public DatasetOpKernel {
       return *shapes;
     }
 
-    string DebugString() override { return "KafkaDatasetOp::Dataset"; }
+    string DebugString() const override { return "KafkaDatasetOp::Dataset"; }
 
    protected:
     Status AsGraphDefInternal(DatasetGraphDefBuilder* b,
diff --git a/tensorflow/core/framework/dataset.h b/tensorflow/core/framework/dataset.h
index 0f352ea559..23dc903caf 100644
--- a/tensorflow/core/framework/dataset.h
+++ b/tensorflow/core/framework/dataset.h
@@ -425,7 +425,7 @@ class DatasetBase : public core::RefCounted {
   virtual const std::vector<PartialTensorShape>& output_shapes() const = 0;
 
   // A human-readable debug string for this dataset.
-  virtual string DebugString() = 0;
+  virtual string DebugString() const = 0;
 
   // Serializes the dataset and writes it to the `writer`.
   virtual Status Save(OpKernelContext* ctx, IteratorStateWriter* writer) const {
diff --git a/tensorflow/core/kernels/data/batch_dataset_op.cc b/tensorflow/core/kernels/data/batch_dataset_op.cc
index 9c0a6b02e8..9a83c16f33 100644
--- a/tensorflow/core/kernels/data/batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/batch_dataset_op.cc
@@ -75,7 +75,7 @@ class BatchDatasetOp : public UnaryDatasetOpKernel {
       return output_shapes_;
     }
 
-    string DebugString() override {
+    string DebugString() const override {
       return strings::StrCat("BatchDatasetOp(", batch_size_, ")::Dataset");
     }
 
diff --git a/tensorflow/core/kernels/data/cache_dataset_ops.cc b/tensorflow/core/kernels/data/cache_dataset_ops.cc
index 5f7db9ed12..3673df6fa3 100644
--- a/tensorflow/core/kernels/data/cache_dataset_ops.cc
+++ b/tensorflow/core/kernels/data/cache_dataset_ops.cc
@@ -83,7 +83,9 @@ class CacheDatasetOp : public UnaryDatasetOpKernel {
       return input_->output_shapes();
     }
 
-    string DebugString() override { return "CacheDatasetOp::FileDataset"; }
+    string DebugString() const override {
+      return "CacheDatasetOp::FileDataset";
+    }
 
    private:
     static size_t StringPaddingSize(size_t num_tensors) {
@@ -295,7 +297,9 @@ class CacheDatasetOp : public UnaryDatasetOpKernel {
       return input_->output_shapes();
     }
 
-    string DebugString() override { return "CacheDatasetOp::MemoryDataset"; }
+    string DebugString() const override {
+      return "CacheDatasetOp::MemoryDataset";
+    }
 
    private:
     // MemoryWriterIterator passes through and appends items from the input
diff --git a/tensorflow/core/kernels/data/concatenate_dataset_op.cc b/tensorflow/core/kernels/data/concatenate_dataset_op.cc
index 7c9dd1230a..0012a4769d 100644
--- a/tensorflow/core/kernels/data/concatenate_dataset_op.cc
+++ b/tensorflow/core/kernels/data/concatenate_dataset_op.cc
@@ -75,7 +75,9 @@ class ConcatenateDatasetOp : public BinaryDatasetOpKernel {
       return output_shapes_;
     }
 
-    string DebugString() override { return "ConcatenateDatasetOp::Dataset"; }
+    string DebugString() const override {
+      return "ConcatenateDatasetOp::Dataset";
+    }
 
    protected:
     Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
diff --git a/tensorflow/core/kernels/data/dense_to_sparse_batch_dataset_op.cc b/tensorflow/core/kernels/data/dense_to_sparse_batch_dataset_op.cc
index 28fa77ce06..91b9279427 100644
--- a/tensorflow/core/kernels/data/dense_to_sparse_batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/dense_to_sparse_batch_dataset_op.cc
@@ -109,7 +109,7 @@ class DenseToSparseBatchDatasetOp : public UnaryDatasetOpKernel {
       return output_shapes_;
     }
 
-    string DebugString() override {
+    string DebugString() const override {
       return strings::StrCat("DenseToSparseBatchDatasetOp(", batch_size_,
                              ")::Dataset");
     }
diff --git a/tensorflow/core/kernels/data/filter_dataset_op.cc b/tensorflow/core/kernels/data/filter_dataset_op.cc
index 5760e55e06..6d6c44552d 100644
--- a/tensorflow/core/kernels/data/filter_dataset_op.cc
+++ b/tensorflow/core/kernels/data/filter_dataset_op.cc
@@ -106,7 +106,7 @@ class FilterDatasetOp : public UnaryDatasetOpKernel {
       return input_->output_shapes();
     }
 
-    string DebugString() override { return "FilterDatasetOp::Dataset"; }
+    string DebugString() const override { return "FilterDatasetOp::Dataset"; }
 
    protected:
     Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
diff --git a/tensorflow/core/kernels/data/flat_map_dataset_op.cc b/tensorflow/core/kernels/data/flat_map_dataset_op.cc
index e2edda012a..baca022f1e 100644
--- a/tensorflow/core/kernels/data/flat_map_dataset_op.cc
+++ b/tensorflow/core/kernels/data/flat_map_dataset_op.cc
@@ -88,7 +88,7 @@ class FlatMapDatasetOp : public UnaryDatasetOpKernel {
       return output_shapes_;
     }
 
-    string DebugString() override { return "FlatMapDatasetOp::Dataset"; }
+    string DebugString() const override { return "FlatMapDatasetOp::Dataset"; }
 
    protected:
     Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
diff --git a/tensorflow/core/kernels/data/generator_dataset_op.cc b/tensorflow/core/kernels/data/generator_dataset_op.cc
index d298389f21..aae62ad2fe 100644
--- a/tensorflow/core/kernels/data/generator_dataset_op.cc
+++ b/tensorflow/core/kernels/data/generator_dataset_op.cc
@@ -112,7 +112,9 @@ class GeneratorDatasetOp : public DatasetOpKernel {
       return output_shapes_;
     }
 
-    string DebugString() override { return "GeneratorDatasetOp::Dataset"; }
+    string DebugString() const override {
+      return "GeneratorDatasetOp::Dataset";
+    }
 
    private:
     class Iterator : public DatasetIterator<Dataset> {
diff --git a/tensorflow/core/kernels/data/group_by_reducer_dataset_op.cc b/tensorflow/core/kernels/data/group_by_reducer_dataset_op.cc
index 7bbadffc48..03abae79d2 100644
--- a/tensorflow/core/kernels/data/group_by_reducer_dataset_op.cc
+++ b/tensorflow/core/kernels/data/group_by_reducer_dataset_op.cc
@@ -101,7 +101,9 @@ class GroupByReducerDatasetOp : public UnaryDatasetOpKernel {
       return output_shapes_;
     }
 
-    string DebugString() override { return "GroupByReducerDatasetOp::Dataset"; }
+    string DebugString() const override {
+      return "GroupByReducerDatasetOp::Dataset";
+    }
 
    protected:
     Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
diff --git a/tensorflow/core/kernels/data/group_by_window_dataset_op.cc b/tensorflow/core/kernels/data/group_by_window_dataset_op.cc
index f9cc5d26b0..23d769e1ab 100644
--- a/tensorflow/core/kernels/data/group_by_window_dataset_op.cc
+++ b/tensorflow/core/kernels/data/group_by_window_dataset_op.cc
@@ -131,7 +131,9 @@ class GroupByWindowDatasetOp : public UnaryDatasetOpKernel {
       return output_shapes_;
     }
 
-    string DebugString() override { return "GroupByWindowDatasetOp::Dataset"; }
+    string DebugString() const override {
+      return "GroupByWindowDatasetOp::Dataset";
+    }
 
    protected:
     Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
diff --git a/tensorflow/core/kernels/data/interleave_dataset_op.cc b/tensorflow/core/kernels/data/interleave_dataset_op.cc
index 723648b886..0765e63993 100644
--- a/tensorflow/core/kernels/data/interleave_dataset_op.cc
+++ b/tensorflow/core/kernels/data/interleave_dataset_op.cc
@@ -109,7 +109,9 @@ class InterleaveDatasetOp : public UnaryDatasetOpKernel {
       return output_shapes_;
     }
 
-    string DebugString() override { return "InterleaveDatasetOp::Dataset"; }
+    string DebugString() const override {
+      return "InterleaveDatasetOp::Dataset";
+    }
 
    protected:
     Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
diff --git a/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc b/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc
index f55a66524a..703ef194a1 100644
--- a/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc
@@ -139,7 +139,9 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
       return output_shapes_;
     }
 
-    string DebugString() override { return "MapAndBatchDatasetOp::Dataset"; }
+    string DebugString() const override {
+      return "MapAndBatchDatasetOp::Dataset";
+    }
 
    protected:
     Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
diff --git a/tensorflow/core/kernels/data/map_dataset_op.cc b/tensorflow/core/kernels/data/map_dataset_op.cc
index 40063c8ba9..aa530aea19 100644
--- a/tensorflow/core/kernels/data/map_dataset_op.cc
+++ b/tensorflow/core/kernels/data/map_dataset_op.cc
@@ -86,7 +86,7 @@ class MapDatasetOp : public UnaryDatasetOpKernel {
       return output_shapes_;
     }
 
-    string DebugString() override { return "MapDatasetOp::Dataset"; }
+    string DebugString() const override { return "MapDatasetOp::Dataset"; }
 
    protected:
     Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
diff --git a/tensorflow/core/kernels/data/padded_batch_dataset_op.cc b/tensorflow/core/kernels/data/padded_batch_dataset_op.cc
index f60b5472d6..d9e43ace39 100644
--- a/tensorflow/core/kernels/data/padded_batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/padded_batch_dataset_op.cc
@@ -133,7 +133,7 @@ class PaddedBatchDatasetOp : public UnaryDatasetOpKernel {
       return output_shapes_;
     }
 
-    string DebugString() override {
+    string DebugString() const override {
       return strings::StrCat("PaddedBatchDatasetOp(", batch_size_,
                              ")::Dataset");
     }
diff --git a/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc b/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc
index 8da6b331a3..6292b4536e 100644
--- a/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc
+++ b/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc
@@ -129,7 +129,7 @@ class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
       return output_shapes_;
     }
 
-    string DebugString() override {
+    string DebugString() const override {
       return "ParallelInterleaveDatasetOp::Dataset";
     }
 
diff --git a/tensorflow/core/kernels/data/parallel_map_dataset_op.cc b/tensorflow/core/kernels/data/parallel_map_dataset_op.cc
index cf55067e2c..3fa6b0d3a9 100644
--- a/tensorflow/core/kernels/data/parallel_map_dataset_op.cc
+++ b/tensorflow/core/kernels/data/parallel_map_dataset_op.cc
@@ -99,7 +99,9 @@ class ParallelMapDatasetOp : public UnaryDatasetOpKernel {
       return output_shapes_;
     }
 
-    string DebugString() override { return "ParallelMapDatasetOp::Dataset"; }
+    string DebugString() const override {
+      return "ParallelMapDatasetOp::Dataset";
+    }
 
    protected:
     Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
diff --git a/tensorflow/core/kernels/data/prefetch_dataset_op.cc b/tensorflow/core/kernels/data/prefetch_dataset_op.cc
index 140983805a..e2b6aa590e 100644
--- a/tensorflow/core/kernels/data/prefetch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/prefetch_dataset_op.cc
@@ -68,7 +68,7 @@ class PrefetchDatasetOp : public UnaryDatasetOpKernel {
       return input_->output_shapes();
     }
 
-    string DebugString() override { return "PrefetchDatasetOp::Dataset"; }
+    string DebugString() const override { return "PrefetchDatasetOp::Dataset"; }
 
    protected:
     Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
diff --git a/tensorflow/core/kernels/data/random_dataset_op.cc b/tensorflow/core/kernels/data/random_dataset_op.cc
index 40bd95e4e7..ff166c3be7 100644
--- a/tensorflow/core/kernels/data/random_dataset_op.cc
+++ b/tensorflow/core/kernels/data/random_dataset_op.cc
@@ -71,7 +71,7 @@ class RandomDatasetOp : public DatasetOpKernel {
       return *shapes;
     }
 
-    string DebugString() override {
+    string DebugString() const override {
       return strings::StrCat("RandomDatasetOp(", seed_, ", ", seed2_,
                              ")::Dataset");
     }
diff --git a/tensorflow/core/kernels/data/range_dataset_op.cc b/tensorflow/core/kernels/data/range_dataset_op.cc
index b18263b613..0b5c814767 100644
--- a/tensorflow/core/kernels/data/range_dataset_op.cc
+++ b/tensorflow/core/kernels/data/range_dataset_op.cc
@@ -65,7 +65,7 @@ class RangeDatasetOp : public DatasetOpKernel {
       return *shapes;
     }
 
-    string DebugString() override {
+    string DebugString() const override {
       return strings::StrCat("RangeDatasetOp(", start_, ", ", stop_, ", ",
                              step_, ")::Dataset");
     }
diff --git a/tensorflow/core/kernels/data/reader_dataset_ops.cc b/tensorflow/core/kernels/data/reader_dataset_ops.cc
index 28d38d49eb..29654b9bca 100644
--- a/tensorflow/core/kernels/data/reader_dataset_ops.cc
+++ b/tensorflow/core/kernels/data/reader_dataset_ops.cc
@@ -106,7 +106,7 @@ class TextLineDatasetOp : public DatasetOpKernel {
       return *shapes;
     }
 
-    string DebugString() override { return "TextLineDatasetOp::Dataset"; }
+    string DebugString() const override { return "TextLineDatasetOp::Dataset"; }
 
    protected:
     Status AsGraphDefInternal(DatasetGraphDefBuilder* b,
@@ -340,7 +340,7 @@ class FixedLengthRecordDatasetOp : public DatasetOpKernel {
       return *shapes;
     }
 
-    string DebugString() override {
+    string DebugString() const override {
       return "FixedLengthRecordDatasetOp::Dataset";
     }
 
@@ -560,7 +560,7 @@ class TFRecordDatasetOp : public DatasetOpKernel {
       return *shapes;
     }
 
-    string DebugString() override { return "TFRecordDatasetOp::Dataset"; }
+    string DebugString() const override { return "TFRecordDatasetOp::Dataset"; }
 
    protected:
     Status AsGraphDefInternal(DatasetGraphDefBuilder* b,
diff --git a/tensorflow/core/kernels/data/repeat_dataset_op.cc b/tensorflow/core/kernels/data/repeat_dataset_op.cc
index fcd9820785..6b3f4ed27b 100644
--- a/tensorflow/core/kernels/data/repeat_dataset_op.cc
+++ b/tensorflow/core/kernels/data/repeat_dataset_op.cc
@@ -69,7 +69,7 @@ class RepeatDatasetOp : public UnaryDatasetOpKernel {
       return input_->output_shapes();
     }
 
-    string DebugString() override { return "RepeatDatasetOp::Dataset"; }
+    string DebugString() const override { return "RepeatDatasetOp::Dataset"; }
 
    protected:
     Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
diff --git a/tensorflow/core/kernels/data/scan_dataset_op.cc b/tensorflow/core/kernels/data/scan_dataset_op.cc
index 972ed8fb00..a3b20016a8 100644
--- a/tensorflow/core/kernels/data/scan_dataset_op.cc
+++ b/tensorflow/core/kernels/data/scan_dataset_op.cc
@@ -103,7 +103,7 @@ class ScanDatasetOp : public UnaryDatasetOpKernel {
       return output_shapes_;
     }
 
-    string DebugString() override { return "ScanDatasetOp::Dataset"; }
+    string DebugString() const override { return "ScanDatasetOp::Dataset"; }
 
    protected:
     Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
diff --git a/tensorflow/core/kernels/data/shuffle_dataset_op.cc b/tensorflow/core/kernels/data/shuffle_dataset_op.cc
index dad58efe73..6a51010fed 100644
--- a/tensorflow/core/kernels/data/shuffle_dataset_op.cc
+++ b/tensorflow/core/kernels/data/shuffle_dataset_op.cc
@@ -359,7 +359,7 @@ class ShuffleDatasetOp : public ShuffleDatasetOpBase {
           parent_generator_(seed, seed2),
           generator_(&parent_generator_) {}
 
-    string DebugString() override {
+    string DebugString() const override {
       return strings::StrCat("ShuffleDatasetOp(", buffer_size_, ", ", seed_,
                              ", ", seed2_, ")::ReshufflingDataset");
     }
@@ -397,7 +397,7 @@ class ShuffleDatasetOp : public ShuffleDatasetOpBase {
           seed_(seed),
           seed2_(seed) {}
 
-    string DebugString() override {
+    string DebugString() const override {
       return strings::StrCat("ShuffleDatasetOp(", buffer_size_, ", ", seed_,
                              ", ", seed2_, ")::FixedSeedDataset");
     }
@@ -480,7 +480,7 @@ class ShuffleAndRepeatDatasetOp : public ShuffleDatasetOpBase {
           seed_(seed),
           seed2_(seed2) {}
 
-    string DebugString() override {
+    string DebugString() const override {
       return strings::StrCat("ShuffleAndRepeatDatasetOp(", buffer_size_, ", ",
                              seed_, ", ", seed2_, ", ", count_, ")::Dataset");
     }
diff --git a/tensorflow/core/kernels/data/skip_dataset_op.cc b/tensorflow/core/kernels/data/skip_dataset_op.cc
index 0177839707..b84afa3e33 100644
--- a/tensorflow/core/kernels/data/skip_dataset_op.cc
+++ b/tensorflow/core/kernels/data/skip_dataset_op.cc
@@ -65,7 +65,7 @@ class SkipDatasetOp : public UnaryDatasetOpKernel {
       return input_->output_shapes();
     }
 
-    string DebugString() override { return "SkipDatasetOp::Dataset"; }
+    string DebugString() const override { return "SkipDatasetOp::Dataset"; }
 
    protected:
     Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
diff --git a/tensorflow/core/kernels/data/slide_dataset_op.cc b/tensorflow/core/kernels/data/slide_dataset_op.cc
index e4b2820445..48776cbf61 100644
--- a/tensorflow/core/kernels/data/slide_dataset_op.cc
+++ b/tensorflow/core/kernels/data/slide_dataset_op.cc
@@ -81,7 +81,7 @@ class SlideDatasetOp : public UnaryDatasetOpKernel {
       return output_shapes_;
     }
 
-    string DebugString() override {
+    string DebugString() const override {
       return strings::StrCat("SlideDatasetOp(", window_size_, ", ", stride_,
                              ")::Dataset");
     }
diff --git a/tensorflow/core/kernels/data/sparse_tensor_slice_dataset_op.cc b/tensorflow/core/kernels/data/sparse_tensor_slice_dataset_op.cc
index 4cc638b4cf..2604822cc9 100644
--- a/tensorflow/core/kernels/data/sparse_tensor_slice_dataset_op.cc
+++ b/tensorflow/core/kernels/data/sparse_tensor_slice_dataset_op.cc
@@ -50,7 +50,7 @@ class Dataset : public GraphDatasetBase {
     return shapes_;
   }
 
-  string DebugString() override {
+  string DebugString() const override {
     return "SparseTensorSliceDatasetOp::Dataset";
   }
 
diff --git a/tensorflow/core/kernels/data/sql_dataset_ops.cc b/tensorflow/core/kernels/data/sql_dataset_ops.cc
index 4742ed30cf..16652e792c 100644
--- a/tensorflow/core/kernels/data/sql_dataset_ops.cc
+++ b/tensorflow/core/kernels/data/sql_dataset_ops.cc
@@ -102,7 +102,7 @@ class SqlDatasetOp : public DatasetOpKernel {
       return output_shapes_;
     }
 
-    string DebugString() override { return "SqlDatasetOp::Dataset"; }
+    string DebugString() const override { return "SqlDatasetOp::Dataset"; }
 
    protected:
     Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
diff --git a/tensorflow/core/kernels/data/stats_aggregator_dataset_op.cc b/tensorflow/core/kernels/data/stats_aggregator_dataset_op.cc
index fd490c7c17..2ff90d7b10 100644
--- a/tensorflow/core/kernels/data/stats_aggregator_dataset_op.cc
+++ b/tensorflow/core/kernels/data/stats_aggregator_dataset_op.cc
@@ -66,7 +66,7 @@ class SetStatsAggregatorDatasetOp : public UnaryDatasetOpKernel {
       return input_->output_shapes();
     }
 
-    string DebugString() override {
+    string DebugString() const override {
       return "SetStatsAggregatorDatasetOp::Dataset";
     }
 
diff --git a/tensorflow/core/kernels/data/stats_dataset_ops.cc b/tensorflow/core/kernels/data/stats_dataset_ops.cc
index 8dc76185bc..7370a24b38 100644
--- a/tensorflow/core/kernels/data/stats_dataset_ops.cc
+++ b/tensorflow/core/kernels/data/stats_dataset_ops.cc
@@ -69,7 +69,9 @@ class LatencyStatsDatasetOp : public UnaryDatasetOpKernel {
       return input_->output_shapes();
     }
 
-    string DebugString() override { return "LatencyStatsDatasetOp::Dataset"; }
+    string DebugString() const override {
+      return "LatencyStatsDatasetOp::Dataset";
+    }
 
    protected:
     Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
@@ -166,7 +168,7 @@ class BytesProducedStatsDatasetOp : public UnaryDatasetOpKernel {
       return input_->output_shapes();
     }
 
-    string DebugString() override {
+    string DebugString() const override {
       return "BytesProducedStatsDatasetOp::Dataset";
     }
 
diff --git a/tensorflow/core/kernels/data/take_dataset_op.cc b/tensorflow/core/kernels/data/take_dataset_op.cc
index 209207d742..3d29221f3e 100644
--- a/tensorflow/core/kernels/data/take_dataset_op.cc
+++ b/tensorflow/core/kernels/data/take_dataset_op.cc
@@ -66,7 +66,7 @@ class TakeDatasetOp : public UnaryDatasetOpKernel {
       return input_->output_shapes();
     }
 
-    string DebugString() override { return "TakeDatasetOp::Dataset"; }
+    string DebugString() const override { return "TakeDatasetOp::Dataset"; }
 
    protected:
     Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
diff --git a/tensorflow/core/kernels/data/tensor_dataset_op.cc b/tensorflow/core/kernels/data/tensor_dataset_op.cc
index 8f4586b5b6..36fc434d8f 100644
--- a/tensorflow/core/kernels/data/tensor_dataset_op.cc
+++ b/tensorflow/core/kernels/data/tensor_dataset_op.cc
@@ -64,7 +64,7 @@ class TensorDatasetOp : public DatasetOpKernel {
       return shapes_;
     }
 
-    string DebugString() override { return "TensorDatasetOp::Dataset"; }
+    string DebugString() const override { return "TensorDatasetOp::Dataset"; }
 
    protected:
     Status AsGraphDefInternal(DatasetGraphDefBuilder* b,
diff --git a/tensorflow/core/kernels/data/tensor_queue_dataset_op.cc b/tensorflow/core/kernels/data/tensor_queue_dataset_op.cc
index e9f486d867..29b4c9053e 100644
--- a/tensorflow/core/kernels/data/tensor_queue_dataset_op.cc
+++ b/tensorflow/core/kernels/data/tensor_queue_dataset_op.cc
@@ -94,7 +94,7 @@ class PrependFromQueueAndPaddedBatchDataset : public GraphDatasetBase {
     return batched_shapes_with_queue_;
   }
 
-  string DebugString() override {
+  string DebugString() const override {
     return "PrependFromQueueAndPaddedBatchDatasetOp::Dataset";
   }
 
diff --git a/tensorflow/core/kernels/data/tensor_slice_dataset_op.cc b/tensorflow/core/kernels/data/tensor_slice_dataset_op.cc
index fd8780391c..68ce324081 100644
--- a/tensorflow/core/kernels/data/tensor_slice_dataset_op.cc
+++ b/tensorflow/core/kernels/data/tensor_slice_dataset_op.cc
@@ -81,7 +81,9 @@ class TensorSliceDatasetOp : public DatasetOpKernel {
       return shapes_;
     }
 
-    string DebugString() override { return "TensorSliceDatasetOp::Dataset"; }
+    string DebugString() const override {
+      return "TensorSliceDatasetOp::Dataset";
+    }
 
    protected:
     Status AsGraphDefInternal(DatasetGraphDefBuilder* b,
diff --git a/tensorflow/core/kernels/data/unbatch_dataset_op.cc b/tensorflow/core/kernels/data/unbatch_dataset_op.cc
index 28f2350d6b..2aec9fb090 100644
--- a/tensorflow/core/kernels/data/unbatch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/unbatch_dataset_op.cc
@@ -62,7 +62,7 @@ class UnbatchDatasetOp : public UnaryDatasetOpKernel {
       return shapes_;
     }
 
-    string DebugString() override { return "UnbatchDatasetOp::Dataset"; }
+    string DebugString() const override { return "UnbatchDatasetOp::Dataset"; }
 
    protected:
     Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
diff --git a/tensorflow/core/kernels/data/window_dataset.cc b/tensorflow/core/kernels/data/window_dataset.cc
index e7470f880f..668b461374 100644
--- a/tensorflow/core/kernels/data/window_dataset.cc
+++ b/tensorflow/core/kernels/data/window_dataset.cc
@@ -38,7 +38,7 @@ class WindowDataset : public DatasetBase {
     return output_shapes_;
   }
 
-  string DebugString() override { return "WindowDataset"; }
+  string DebugString() const override { return "WindowDataset"; }
 
  private:
   class Iterator : public DatasetIterator<WindowDataset> {
diff --git a/tensorflow/core/kernels/data/zip_dataset_op.cc b/tensorflow/core/kernels/data/zip_dataset_op.cc
index d5343cdf22..00705236f9 100644
--- a/tensorflow/core/kernels/data/zip_dataset_op.cc
+++ b/tensorflow/core/kernels/data/zip_dataset_op.cc
@@ -74,7 +74,7 @@ class ZipDatasetOp : public DatasetOpKernel {
       return output_shapes_;
     }
 
-    string DebugString() override { return "ZipDatasetOp::Dataset"; }
+    string DebugString() const override { return "ZipDatasetOp::Dataset"; }
 
    protected:
     Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
-- 
GitLab


From c4fff895cbb31eea0a9e2df0161aed5805c62dc6 Mon Sep 17 00:00:00 2001
From: Jiri Simsa <jsimsa@google.com>
Date: Thu, 31 May 2018 16:18:15 -0700
Subject: [PATCH 131/610] [tf.data] Reflect `MakeIterator` signature change in
 documentation.

PiperOrigin-RevId: 198797254
---
 tensorflow/docs_src/extend/new_data_formats.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/docs_src/extend/new_data_formats.md b/tensorflow/docs_src/extend/new_data_formats.md
index 2c33a6b6f7..1a4309f373 100644
--- a/tensorflow/docs_src/extend/new_data_formats.md
+++ b/tensorflow/docs_src/extend/new_data_formats.md
@@ -45,7 +45,7 @@ Each of these implementations comprises three related classes:
 * A `tensorflow::GraphDatasetBase` subclass (e.g. `TextLineDatasetOp::Dataset`),
   which represents the *immutable* definition of the dataset itself, and tells
   TensorFlow how to construct an iterator object over that dataset, in its
-  `MakeIterator()` method.
+  `MakeIteratorInternal()` method.
 
 * A `tensorflow::DatasetIterator<Dataset>` subclass (e.g.
   `TextLineDatasetOp::Dataset::Iterator`), which represents the *mutable* state
@@ -103,7 +103,7 @@ class MyReaderDatasetOp : public DatasetOpKernel {
    public:
     Dataset(OpKernelContext* ctx) : GraphDatasetBase(ctx) {}
 
-    std::unique_ptr<IteratorBase> MakeIterator(
+    std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
       return std::unique_ptr<IteratorBase>(
           new Iterator({this, strings::StrCat(prefix, "::MyReader")}));
-- 
GitLab


From 52dbe0647fbcd2c4abd5492e04414cc4169f688a Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 31 May 2018 16:21:24 -0700
Subject: [PATCH 132/610] Edited the landing page for the Performance section.
 Reorganized content and removed references to content that is being deleted.

PiperOrigin-RevId: 198797662
---
 tensorflow/docs_src/performance/benchmarks.md |  2 -
 tensorflow/docs_src/performance/index.md      | 39 +++++++++++--------
 tensorflow/docs_src/performance/leftnav_files |  1 -
 3 files changed, 23 insertions(+), 19 deletions(-)

diff --git a/tensorflow/docs_src/performance/benchmarks.md b/tensorflow/docs_src/performance/benchmarks.md
index 20165a090e..a5fa551dd4 100644
--- a/tensorflow/docs_src/performance/benchmarks.md
+++ b/tensorflow/docs_src/performance/benchmarks.md
@@ -403,8 +403,6 @@ GPUs | InceptionV3 (batch size 32) | ResNet-50 (batch size 32)
 This
 [script](https://github.com/tensorflow/benchmarks/tree/master/scripts/tf_cnn_benchmarks)
 was run on the various platforms to generate the above results.
-@{$performance_models$High-Performance Models} details techniques in the script
-along with examples of how to execute the script.
 
 In order to create results that are as repeatable as possible, each test was run
 5 times and then the times were averaged together. GPUs are run in their default
diff --git a/tensorflow/docs_src/performance/index.md b/tensorflow/docs_src/performance/index.md
index 49343eaac7..131d28fa3e 100644
--- a/tensorflow/docs_src/performance/index.md
+++ b/tensorflow/docs_src/performance/index.md
@@ -1,19 +1,31 @@
 # Performance
 
-Performance is often a significant issue when training a machine learning
-model.  This section explains various ways to optimize performance.  Start
-your investigation with the @{$performance_guide$Performance Guide} and then go
-deeper with techniques detailed in @{$performance_models$High-Performance Models}:
-
-  * @{$performance_guide$Performance Guide}, which contains a collection of best
+Performance is an important consideration when training machine learning
+models. Good performance speeds up and scales research while
+also providing end users with near-instant predictions. This section provides
+details on the high-level APIs to use along with best practices to build
+and train high-performance models, and quantize models for the lowest latency
+and highest throughput for inference.
+
+  * @{$performance_guide$Performance Guide} contains a collection of best
     practices for optimizing your TensorFlow code.
 
-  * @{$performance_models$High-Performance Models}, which contains a collection
-    of advanced techniques to build highly scalable models targeting different
-    system types and network topologies.
+  * @{$datasets_performance$Data input pipeline guide} describes the tf.data
+    API for building efficient data input pipelines for TensorFlow.
+
+  * @{$performance/benchmarks$Benchmarks} contains a collection of
+    benchmark results for a variety of hardware configurations.
+
+  * For improving inference efficiency on mobile and
+    embedded hardware, see
+    @{$quantization$How to Quantize Neural Networks with TensorFlow}, which
+    explains how to use quantization to reduce model size, both in storage
+    and at runtime.
+
+  * For optimizing inference on GPUs, refer to [NVIDIA TensorRT™
+    integration with TensorFlow](
+    https://medium.com/tensorflow/speed-up-tensorflow-inference-on-gpus-with-tensorrt-13b49f3db3fa).
 
-  * @{$performance/benchmarks$Benchmarks}, which contains a collection of
-    benchmark results.
 
 XLA (Accelerated Linear Algebra) is an experimental compiler for linear
 algebra that optimizes TensorFlow computations. The following guides explore
@@ -36,10 +48,5 @@ XLA:
     standalone tool that compiles TensorFlow graphs into executable code in
     order to optimize performance.
 
-And finally, we offer the following guide:
 
-  * @{$quantization$How to Quantize Neural Networks with TensorFlow}, which
-    can explains how to use quantization to reduce model size, both in storage
-    and at runtime. Quantization can improve performance, especially on
-    mobile hardware.
 
diff --git a/tensorflow/docs_src/performance/leftnav_files b/tensorflow/docs_src/performance/leftnav_files
index 1f894c39fe..12e0dbd48a 100644
--- a/tensorflow/docs_src/performance/leftnav_files
+++ b/tensorflow/docs_src/performance/leftnav_files
@@ -1,7 +1,6 @@
 index.md
 performance_guide.md
 datasets_performance.md
-performance_models.md
 benchmarks.md
 quantization.md
 
-- 
GitLab


From 18c67a44ace913d30dc573486dc792300a2cdad3 Mon Sep 17 00:00:00 2001
From: Bixia Zheng <bixia@google.com>
Date: Thu, 31 May 2018 16:25:24 -0700
Subject: [PATCH 133/610] Handle FilterLayout::kOutputYXInput in
 FilterDescriptor::ToShortString.

This fixes an error when running resnet50_batch128_fp16 with --v=2.

PiperOrigin-RevId: 198798196
---
 tensorflow/stream_executor/dnn.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tensorflow/stream_executor/dnn.cc b/tensorflow/stream_executor/dnn.cc
index eed93efc8d..5315d1f3da 100644
--- a/tensorflow/stream_executor/dnn.cc
+++ b/tensorflow/stream_executor/dnn.cc
@@ -407,6 +407,8 @@ string FilterDescriptor::ToShortString() const {
   switch (layout_) {
     case FilterLayout::kOutputInputYX:
       return port::StrCat(od, id, spatial);
+    case FilterLayout::kOutputYXInput:
+      return port::StrCat(od, spatial, id);
     case FilterLayout::kOutputInputYX4:
       return port::StrCat(od, id, spatial, "(VECT_C)");
     case FilterLayout::kInputYXOutput:
-- 
GitLab


From 6a6cfbfe4bd79fb0eb21b3d0753d3ddf6ee86ce8 Mon Sep 17 00:00:00 2001
From: Blake Hechtman <blakehechtman@google.com>
Date: Thu, 31 May 2018 16:58:05 -0700
Subject: [PATCH 134/610] [XLA] Fix batchnorm rewriter to not use implicit
 broadcasts. Algebraic simplifier reshape change is now covered by
 ReshapeMover.

PiperOrigin-RevId: 198802494
---
 .../xla/service/algebraic_simplifier.cc       | 126 ++++----
 .../xla/service/algebraic_simplifier_test.cc  |  26 --
 .../xla/service/batchnorm_expander.cc         | 286 ++++++++++--------
 3 files changed, 222 insertions(+), 216 deletions(-)

diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc
index c65c91e8e0..e1a45e453e 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc
@@ -233,10 +233,10 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault {
                                    HloInstruction* operand, HloInstruction* max,
                                    HloInstruction* max_operand);
 
-  // A Reshape or Broadcast that feeds an element-wise operation with a unique
-  // non-scalar operand can sink to after the operation.
-  StatusOr<bool> TryToSinkReshapeOrBroadcastAfterOpWithUniqueNonScalarOperand(
-      HloInstruction* reshape_or_broadcast);
+  // A Broadcast that feeds an element-wise operation with a unique non-scalar
+  // operand can sink to after the operation.
+  StatusOr<bool> TryToSinkBroadcastAfterOpWithUniqueNonScalarOperand(
+      HloInstruction* broadcast);
 
   // Replaces the existing HLO instruction old_instruction, with
   // new_instruction, and marks the optimizer status as changed.
@@ -1305,7 +1305,7 @@ Status AlgebraicSimplifierVisitor::HandleBroadcast(HloInstruction* broadcast) {
   // broadcast after the unary element-wise operation.
   TF_ASSIGN_OR_RETURN(
       bool sink_succeeded,
-      TryToSinkReshapeOrBroadcastAfterOpWithUniqueNonScalarOperand(broadcast));
+      TryToSinkBroadcastAfterOpWithUniqueNonScalarOperand(broadcast));
   changed_ |= sink_succeeded;
   if (sink_succeeded) {
     return Status::OK();
@@ -1557,15 +1557,16 @@ Status AlgebraicSimplifierVisitor::HandlePower(HloInstruction* power) {
   return Status::OK();
 }
 
-StatusOr<bool> AlgebraicSimplifierVisitor::
-    TryToSinkReshapeOrBroadcastAfterOpWithUniqueNonScalarOperand(
-        HloInstruction* reshape_or_broadcast) {
+StatusOr<bool>
+AlgebraicSimplifierVisitor::TryToSinkBroadcastAfterOpWithUniqueNonScalarOperand(
+    HloInstruction* broadcast) {
+  TF_RET_CHECK(broadcast->opcode() == HloOpcode::kBroadcast);
   bool changed = false;
-  if (ShapeUtil::IsScalar(reshape_or_broadcast->shape())) {
+  if (ShapeUtil::IsScalar(broadcast->shape())) {
     return false;
   }
-  HloInstruction* operand = reshape_or_broadcast->mutable_operand(0);
-  for (HloInstruction* user : reshape_or_broadcast->users()) {
+  HloInstruction* operand = broadcast->mutable_operand(0);
+  for (HloInstruction* user : broadcast->users()) {
     if (user->user_count() == 0 && user != computation_->root_instruction()) {
       continue;
     }
@@ -1583,55 +1584,50 @@ StatusOr<bool> AlgebraicSimplifierVisitor::
       continue;
     }
 
-    int64 reshape_or_broadcast_operand_index = -1;
     // Find the unique non-scalar operand or continue if there isn't one.
-    int64 scalar_count = 0;
-    for (int64 i = 0; i < user->operand_count(); ++i) {
-      if (ShapeUtil::IsScalar(user->operand(i)->shape())) {
-        ++scalar_count;
-      } else {
-        reshape_or_broadcast_operand_index = i;
+    int64 scalar_broadcast_count = 0;
+    int64 broadcast_use_count = 0;
+    for (HloInstruction* user_operand : user->operands()) {
+      if (user_operand->opcode() == HloOpcode::kBroadcast &&
+          ShapeUtil::IsScalar(user_operand->operand(0)->shape())) {
+        ++scalar_broadcast_count;
+      } else if (broadcast == user_operand) {
+        ++broadcast_use_count;
       }
     }
-    if (scalar_count != user->operand_count() - 1) {
+    if (scalar_broadcast_count + broadcast_use_count != user->operand_count()) {
       continue;
     }
-    VLOG(4) << "Sinking reshape or broadcast after user:";
-    VLOG(4) << "  old reshape/broadcast: " << reshape_or_broadcast->ToString();
+    std::vector<HloInstruction*> new_operands;
+    new_operands.reserve(user->operand_count());
+
+    for (HloInstruction* user_operand : user->operands()) {
+      if (user_operand->opcode() == HloOpcode::kBroadcast &&
+          ShapeUtil::IsScalar(user_operand->operand(0)->shape())) {
+        new_operands.push_back(
+            computation_->AddInstruction(HloInstruction::CreateBroadcast(
+                ShapeUtil::ChangeElementType(
+                    operand->shape(), user_operand->shape().element_type()),
+                user_operand->mutable_operand(0), {})));
+      } else {
+        CHECK_EQ(broadcast, user_operand);
+        new_operands.push_back(operand);
+      }
+    }
+    VLOG(4) << "Sinking broadcast after user:";
+    VLOG(4) << "  old broadcast: " << broadcast->ToString();
     VLOG(4) << "  old user: " << user->ToString();
-    CHECK_EQ(user->operand(reshape_or_broadcast_operand_index),
-             reshape_or_broadcast);
-    auto new_user_operands = user->operands();
-    new_user_operands[reshape_or_broadcast_operand_index] = operand;
-    auto new_user = computation_->AddInstruction(user->CloneWithNewOperands(
-        ShapeUtil::MakeShapeWithLayout(
-            user->shape().element_type(),
-            AsInt64Slice(operand->shape().dimensions()),
-            LayoutUtil::MinorToMajor(operand->shape())),
-        new_user_operands));
+    HloInstruction* new_user =
+        computation_->AddInstruction(user->CloneWithNewOperands(
+            ShapeUtil::ChangeElementType(operand->shape(),
+                                         user->shape().element_type()),
+            new_operands));
     VLOG(4) << "  new user: " << new_user->ToString();
-    HloInstruction* new_reshape_or_broadcast = nullptr;
-    if (reshape_or_broadcast->opcode() == HloOpcode::kReshape) {
-      new_reshape_or_broadcast =
-          computation_->AddInstruction(HloInstruction::CreateReshape(
-              ShapeUtil::MakeShapeWithLayout(
-                  user->shape().element_type(),
-                  AsInt64Slice(reshape_or_broadcast->shape().dimensions()),
-                  LayoutUtil::MinorToMajor(reshape_or_broadcast->shape())),
-              new_user));
-    } else {
-      TF_RET_CHECK(reshape_or_broadcast->opcode() == HloOpcode::kBroadcast);
-      new_reshape_or_broadcast =
-          computation_->AddInstruction(HloInstruction::CreateBroadcast(
-              ShapeUtil::MakeShapeWithLayout(
-                  user->shape().element_type(),
-                  AsInt64Slice(reshape_or_broadcast->shape().dimensions()),
-                  LayoutUtil::MinorToMajor(reshape_or_broadcast->shape())),
-              new_user, reshape_or_broadcast->dimensions()));
-    }
-    VLOG(4) << "  new reshape/broadcast: "
-            << new_reshape_or_broadcast->ToString();
-    TF_RETURN_IF_ERROR(user->ReplaceAllUsesWith(new_reshape_or_broadcast));
+    HloInstruction* new_broadcast =
+        computation_->AddInstruction(HloInstruction::CreateBroadcast(
+            user->shape(), new_user, broadcast->dimensions()));
+    VLOG(4) << "  new broadcast: " << new_broadcast->ToString();
+    TF_RETURN_IF_ERROR(user->ReplaceAllUsesWith(new_broadcast));
     changed = true;
   }
   return changed;
@@ -1674,16 +1670,6 @@ Status AlgebraicSimplifierVisitor::HandleReshape(HloInstruction* reshape) {
     }
   }
 
-  // A Reshape that feeds a unary element-wise operation can sink the
-  // reshape after the unary element-wise operation.
-  TF_ASSIGN_OR_RETURN(
-      bool sink_succeeded,
-      TryToSinkReshapeOrBroadcastAfterOpWithUniqueNonScalarOperand(reshape));
-  changed_ |= sink_succeeded;
-  if (sink_succeeded) {
-    return Status::OK();
-  }
-
   // Make this a bitcast if possible.
   if (is_layout_sensitive_ &&
       ReshapeIsBitcast(reshape, valid_bitcast_callback_)) {
@@ -1788,6 +1774,11 @@ Status AlgebraicSimplifierVisitor::HandleReduce(HloInstruction* reduce) {
                     new_reduce_dimensions, function));
   }
 
+  if (ShapeUtil::ElementsIn(reduce->shape()) ==
+      ShapeUtil::ElementsIn(arg->shape())) {
+    return ReplaceWithNewInstruction(
+        reduce, HloInstruction::CreateReshape(reduce->shape(), arg));
+  }
   // A reshape that collapses multiple dimensions into a dimension being
   // reduced can just reduce all of those dimensions instead of doing a
   // collapsing reshape before a reduction.
@@ -1832,15 +1823,6 @@ Status AlgebraicSimplifierVisitor::HandleReduce(HloInstruction* reduce) {
                       new_reduce_dimensions, function));
     }
   }
-  if (ShapeUtil::ElementsIn(reduce->shape()) ==
-          ShapeUtil::ElementsIn(arg->shape()) ||
-      ShapeUtil::HasZeroElements(arg->shape())) {
-    auto reshape = computation_->AddInstruction(
-        HloInstruction::CreateReshape(reduce->shape(), arg));
-    return ReplaceWithNewInstruction(
-        reduce, HloInstruction::CreateMap(reduce->shape(),
-                                          {init_value, reshape}, function));
-  }
   return Status::OK();
 }
 
diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
index d5f0afe960..cda157f9fa 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
@@ -1351,32 +1351,6 @@ TEST_F(AlgebraicSimplifierTest, ReshapeReplacedWithBitcast) {
       op::Tuple(op::Bitcast(), dimensions_wrong_reshape, layout_wrong_reshape));
 }
 
-TEST_F(AlgebraicSimplifierTest, ReshapeAfterEffectiveUnary) {
-  HloComputation::Builder builder(TestName());
-  HloInstruction* param =
-      builder.AddInstruction(HloInstruction::CreateParameter(
-          0, ShapeUtil::MakeShape(F32, {2, 3, 4, 5}), "param"));
-  HloInstruction* movable_reshape =
-      builder.AddInstruction(HloInstruction::CreateReshape(
-          ShapeUtil::MakeShape(F32, {1, 2, 3, 4, 5}), param));
-  HloInstruction* zero = builder.AddInstruction(
-      HloInstruction::CreateConstant(Literal::CreateR0<float>(0.0f)));
-  builder.AddInstruction(
-      HloInstruction::CreateBinary(ShapeUtil::MakeShape(F32, {1, 2, 3, 4, 5}),
-                                   HloOpcode::kMaximum, movable_reshape, zero));
-  auto computation = module().AddEntryComputation(builder.Build());
-
-  EXPECT_THAT(computation->root_instruction(),
-              op::Maximum(op::Reshape(param), zero));
-
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 bitcasting_callback());
-
-  simplifier.Run(&module()).ValueOrDie();
-  EXPECT_THAT(computation->root_instruction(),
-              op::Reshape(op::Maximum(param, zero)));
-}
-
 // Regression test for a bug in the reshape sinking transformation, where
 // moving a reshape to a scalar led to a crash.
 TEST_F(AlgebraicSimplifierTest, ReshapeToScalarNotHoistedAfterEffectiveUnary) {
diff --git a/tensorflow/compiler/xla/service/batchnorm_expander.cc b/tensorflow/compiler/xla/service/batchnorm_expander.cc
index 96e02b82b9..598718c72c 100644
--- a/tensorflow/compiler/xla/service/batchnorm_expander.cc
+++ b/tensorflow/compiler/xla/service/batchnorm_expander.cc
@@ -98,21 +98,67 @@ class BatchNormExpanderVisitor : public DfsHloVisitorWithDefault {
     return *scalar_add_computation;
   }
 
-  // Current HloComputation instance the BatchNormExpander is
-  // traversing.
-  HloComputation* computation_;
+  // TODO(b/80534766): Remove maps after performance issues with scalar
+  // broadcasts are resolved on all backends.
+  HloComputation* GetOrCreateScalarRsqrtComputation(
+      PrimitiveType primitive_type) {
+    HloComputation** scalar_rsqrt_computation =
+        &scalar_rsqrt_computations_[primitive_type];
+    if (*scalar_rsqrt_computation) {
+      return *scalar_rsqrt_computation;
+    }
 
-  bool rewrite_training_op_;
-  bool rewrite_inference_op_;
-  bool rewrite_grad_op_;
-  bool use_fusion_;
+    HloComputation::Builder b("scalar_add_computation");
+    Shape shape = ShapeUtil::MakeShape(primitive_type, {});
+    auto scalar_lhs = b.AddInstruction(
+        HloInstruction::CreateParameter(0, shape, "scalar_lhs"));
+    auto scalar_rhs = b.AddInstruction(HloInstruction::CreateConvert(
+        shape, b.AddInstruction(HloInstruction::CreateConstant(
+                   Literal::CreateR0<float>(-0.5f)))));
+    auto scalar_op = b.AddInstruction(HloInstruction::CreateBinary(
+        shape, HloOpcode::kPower, scalar_lhs, scalar_rhs));
+    *scalar_rsqrt_computation =
+        computation_->parent()->AddEmbeddedComputation(b.Build(scalar_op));
+    return *scalar_rsqrt_computation;
+  }
 
-  // Whether rewrite has occurred.
-  bool changed_ = false;
+  std::unique_ptr<HloInstruction> Rsqrt(HloInstruction* operand) {
+    return HloInstruction::CreateMap(
+        operand->shape(), {operand},
+        GetOrCreateScalarRsqrtComputation(operand->shape().element_type()));
+  }
 
-  // Cached computations for adding two scalars.
-  tensorflow::gtl::FlatMap<PrimitiveType, HloComputation*>
-      scalar_add_computations_;
+  HloComputation* GetOrCreateScalarMeanComputation(PrimitiveType primitive_type,
+                                                   int64 element_count) {
+    HloComputation** scalar_mean_computation =
+        &scalar_mean_computations_[std::pair<PrimitiveType, int64>(
+            primitive_type, element_count)];
+    if (*scalar_mean_computation) {
+      return *scalar_mean_computation;
+    }
+
+    HloComputation::Builder b("scalar_add_computation");
+    Shape shape = ShapeUtil::MakeShape(primitive_type, {});
+    auto scalar_lhs = b.AddInstruction(
+        HloInstruction::CreateParameter(0, shape, "scalar_lhs"));
+    auto scalar_rhs = b.AddInstruction(HloInstruction::CreateConvert(
+        shape, b.AddInstruction(
+                   HloInstruction::CreateConstant(Literal::CreateR0<float>(
+                       1.0f / static_cast<float>(element_count))))));
+    auto scalar_op = b.AddInstruction(HloInstruction::CreateBinary(
+        shape, HloOpcode::kMultiply, scalar_lhs, scalar_rhs));
+    *scalar_mean_computation =
+        computation_->parent()->AddEmbeddedComputation(b.Build(scalar_op));
+    return *scalar_mean_computation;
+  }
+
+  std::unique_ptr<HloInstruction> Mean(int64 element_count,
+                                       HloInstruction* operand) {
+    return HloInstruction::CreateMap(
+        operand->shape(), {operand},
+        GetOrCreateScalarMeanComputation(operand->shape().element_type(),
+                                         element_count));
+  }
 
   // Replaces the existing HLO instruction old_instruction, with
   // new_instruction, and marks the optimizer status as changed.
@@ -136,6 +182,25 @@ class BatchNormExpanderVisitor : public DfsHloVisitorWithDefault {
     changed_ = true;
     return Status::OK();
   }
+  // Current HloComputation instance the BatchNormExpander is
+  // traversing.
+  HloComputation* computation_;
+
+  bool rewrite_training_op_;
+  bool rewrite_inference_op_;
+  bool rewrite_grad_op_;
+  bool use_fusion_;
+
+  // Whether rewrite has occurred.
+  bool changed_ = false;
+
+  // Cached computations for adding two scalars.
+  tensorflow::gtl::FlatMap<PrimitiveType, HloComputation*>
+      scalar_add_computations_;
+  tensorflow::gtl::FlatMap<PrimitiveType, HloComputation*>
+      scalar_rsqrt_computations_;
+  tensorflow::gtl::FlatMap<std::pair<PrimitiveType, int64>, HloComputation*>
+      scalar_mean_computations_;
 };
 
 }  // namespace
@@ -167,6 +232,10 @@ Status BatchNormExpanderVisitor::HandleBatchNormTraining(
     added_instructions.push_back(added_inst);
     return added_inst;
   };
+  auto add_binary = [&](const Shape& shape, const HloOpcode opcode,
+                        HloInstruction* a, HloInstruction* b) {
+    return add(HloInstruction::CreateBinary(shape, opcode, a, b));
+  };
   int64 instruction_count_before = computation_->instruction_count();
 
   // Expand batch norm training into smaller HLO ops.
@@ -176,12 +245,7 @@ Status BatchNormExpanderVisitor::HandleBatchNormTraining(
   int64 feature_index = batch_norm->feature_index();
   const int64 feature_count = operand_shape.dimensions(feature_index);
   const int64 size_in_elements = ShapeUtil::ElementsIn(operand_shape);
-  auto elements_per_feature_literal =
-      Literal::CreateR0<float>(size_in_elements / feature_count);
-  TF_ASSIGN_OR_RETURN(elements_per_feature_literal,
-                      elements_per_feature_literal->Convert(ptype));
-  auto elements_per_feature = add(
-      HloInstruction::CreateConstant(std::move(elements_per_feature_literal)));
+  int64 elements_per_feature_int64 = size_in_elements / feature_count;
 
   HloInstruction* scale = batch_norm->mutable_operand(1);
   HloInstruction* offset = batch_norm->mutable_operand(2);
@@ -193,8 +257,9 @@ Status BatchNormExpanderVisitor::HandleBatchNormTraining(
 
   auto epsilon_literal = Literal::CreateR0(batch_norm->epsilon());
   TF_ASSIGN_OR_RETURN(epsilon_literal, epsilon_literal->Convert(ptype));
-  auto epsilon =
-      add(HloInstruction::CreateConstant(std::move(epsilon_literal)));
+  auto epsilon = add(HloInstruction::CreateBroadcast(
+      operand_shape,
+      add(HloInstruction::CreateConstant(std::move(epsilon_literal))), {}));
   std::vector<int64> dimensions_without_feature;
 
   for (int64 i = 0; i < ShapeUtil::Rank(operand_shape); ++i) {
@@ -213,8 +278,8 @@ Status BatchNormExpanderVisitor::HandleBatchNormTraining(
       GetOrCreateScalarAddComputation(ptype);
 
   // X^2.
-  auto operand_squared = add(HloInstruction::CreateBinary(
-      operand_shape, HloOpcode::kMultiply, operand, operand));
+  auto operand_squared =
+      add_binary(operand_shape, HloOpcode::kMultiply, operand, operand);
   // Sum[X].
   auto sum = add(HloInstruction::CreateReduce(feature_shape, operand, zero,
                                               dimensions_without_feature,
@@ -240,56 +305,47 @@ Status BatchNormExpanderVisitor::HandleBatchNormTraining(
   }
 
   // E[X].
-  auto mean = add(HloInstruction::CreateBinary(
-      feature_shape, HloOpcode::kDivide, sum, elements_per_feature));
+  auto mean = add(Mean(elements_per_feature_int64, sum));
 
   auto mean_broadcasted = add(
       HloInstruction::CreateBroadcast(operand_shape, mean, {feature_index}));
 
   // E[X^2].
-  auto square_mean = add(HloInstruction::CreateBinary(
-      feature_shape, HloOpcode::kDivide, squared_sum, elements_per_feature));
+  auto square_mean = add(Mean(elements_per_feature_int64, squared_sum));
 
   // E^2[X].
-  auto mean_square = add(HloInstruction::CreateBinary(
-      feature_shape, HloOpcode::kMultiply, mean, mean));
+  auto mean_square =
+      add_binary(feature_shape, HloOpcode::kMultiply, mean, mean);
 
   // Var[X].
-  auto var = add(HloInstruction::CreateBinary(
-      feature_shape, HloOpcode::kSubtract, square_mean, mean_square));
+  auto var =
+      add_binary(feature_shape, HloOpcode::kSubtract, square_mean, mean_square);
 
   auto var_broadcasted =
       add(HloInstruction::CreateBroadcast(operand_shape, var, {feature_index}));
 
   // Var[X] + epsilon.
-  auto var_add_epsilon = add(HloInstruction::CreateBinary(
-      operand_shape, HloOpcode::kAdd, var_broadcasted, epsilon));
-
-  auto neg_half_literal = Literal::CreateR0(-0.5f);
-  TF_ASSIGN_OR_RETURN(neg_half_literal, neg_half_literal->Convert(ptype));
-  auto neg_half =
-      add(HloInstruction::CreateConstant(std::move(neg_half_literal)));
+  auto var_add_epsilon =
+      add_binary(operand_shape, HloOpcode::kAdd, var_broadcasted, epsilon);
 
   // 1 / Sqrt[Var[X] + epsilon].
-  auto rsqrt_var_add_epsilon = add(HloInstruction::CreateBinary(
-      operand_shape, HloOpcode::kPower, var_add_epsilon, neg_half));
+  auto rsqrt_var_add_epsilon = add(Rsqrt(var_add_epsilon));
 
   // X - E[X].
-  auto operand_minus_mean = add(HloInstruction::CreateBinary(
-      operand_shape, HloOpcode::kSubtract, operand, mean_broadcasted));
+  auto operand_minus_mean = add_binary(operand_shape, HloOpcode::kSubtract,
+                                       operand, mean_broadcasted);
 
   // (X - E[X]) / Sqrt[Var[X] + epsilon].
-  auto normalized = add(
-      HloInstruction::CreateBinary(operand_shape, HloOpcode::kMultiply,
-                                   operand_minus_mean, rsqrt_var_add_epsilon));
+  auto normalized = add_binary(operand_shape, HloOpcode::kMultiply,
+                               operand_minus_mean, rsqrt_var_add_epsilon);
 
   // (X - E[X]) / Sqrt[Var[X] + epsilon] * scale.
-  auto scaled_normalized = add(HloInstruction::CreateBinary(
-      operand_shape, HloOpcode::kMultiply, normalized, scale_broadcasted));
+  auto scaled_normalized = add_binary(operand_shape, HloOpcode::kMultiply,
+                                      normalized, scale_broadcasted);
 
   // (X - E[X]) / Sqrt[Var[X] + epsilon] * scale + offset.
-  auto shifted_normalized = add(HloInstruction::CreateBinary(
-      operand_shape, HloOpcode::kAdd, scaled_normalized, offset_broadcasted));
+  auto shifted_normalized = add_binary(operand_shape, HloOpcode::kAdd,
+                                       scaled_normalized, offset_broadcasted);
 
   auto tuple = HloInstruction::CreateTuple({shifted_normalized, mean, var});
 
@@ -331,8 +387,11 @@ Status BatchNormExpanderVisitor::HandleBatchNormInference(
 
   auto epsilon_literal = Literal::CreateR0(batch_norm->epsilon());
   TF_ASSIGN_OR_RETURN(epsilon_literal, epsilon_literal->Convert(ptype));
-  auto epsilon = computation_->AddInstruction(
-      HloInstruction::CreateConstant(std::move(epsilon_literal)));
+  auto epsilon = computation_->AddInstruction(HloInstruction::CreateBroadcast(
+      operand_shape,
+      computation_->AddInstruction(
+          HloInstruction::CreateConstant(std::move(epsilon_literal))),
+      {}));
 
   std::vector<int64> dimensions_without_feature;
 
@@ -349,6 +408,10 @@ Status BatchNormExpanderVisitor::HandleBatchNormInference(
     added_instructions.push_back(added_inst);
     return added_inst;
   };
+  auto add_binary = [&](const Shape& shape, const HloOpcode opcode,
+                        HloInstruction* a, HloInstruction* b) {
+    return add(HloInstruction::CreateBinary(shape, opcode, a, b));
+  };
   int64 instruction_count_before = computation_->instruction_count();
 
   auto scale_broadcasted = add(
@@ -364,30 +427,23 @@ Status BatchNormExpanderVisitor::HandleBatchNormInference(
       add(HloInstruction::CreateBroadcast(operand_shape, var, {feature_index}));
 
   // Var[X] + epsilon.
-  auto var_add_epsilon = add(HloInstruction::CreateBinary(
-      operand_shape, HloOpcode::kAdd, var_broadcasted, epsilon));
-
-  auto neg_half_literal = Literal::CreateR0(-0.5f);
-  TF_ASSIGN_OR_RETURN(neg_half_literal, neg_half_literal->Convert(ptype));
-  auto neg_half =
-      add(HloInstruction::CreateConstant(std::move(neg_half_literal)));
+  auto var_add_epsilon =
+      add_binary(operand_shape, HloOpcode::kAdd, var_broadcasted, epsilon);
 
   // 1 / Sqrt[Var[X] + epsilon].
-  auto rsqrt_var_add_epsilon = add(HloInstruction::CreateBinary(
-      operand_shape, HloOpcode::kPower, var_add_epsilon, neg_half));
+  auto rsqrt_var_add_epsilon = add(Rsqrt(var_add_epsilon));
 
   // X - E[X].
-  auto operand_minus_mean = add(HloInstruction::CreateBinary(
-      operand_shape, HloOpcode::kSubtract, operand, mean_broadcasted));
+  auto operand_minus_mean = add_binary(operand_shape, HloOpcode::kSubtract,
+                                       operand, mean_broadcasted);
 
   // (X - E[X]) / Sqrt[Var[X] + epsilon].
-  auto normalized = add(
-      HloInstruction::CreateBinary(operand_shape, HloOpcode::kMultiply,
-                                   operand_minus_mean, rsqrt_var_add_epsilon));
+  auto normalized = add_binary(operand_shape, HloOpcode::kMultiply,
+                               operand_minus_mean, rsqrt_var_add_epsilon);
 
   // (X - E[X]) / Sqrt[Var[X] + epsilon] * scale.
-  auto scaled_normalized = add(HloInstruction::CreateBinary(
-      operand_shape, HloOpcode::kMultiply, normalized, scale_broadcasted));
+  auto scaled_normalized = add_binary(operand_shape, HloOpcode::kMultiply,
+                                      normalized, scale_broadcasted);
 
   // (X - E[X]) / Sqrt[Var[X] + epsilon] * scale + offset.
   auto shifted_normalized = HloInstruction::CreateBinary(
@@ -435,6 +491,10 @@ Status BatchNormExpanderVisitor::HandleBatchNormGrad(
     added_instructions.push_back(added_inst);
     return added_inst;
   };
+  auto add_binary = [&](const Shape& shape, const HloOpcode opcode,
+                        HloInstruction* a, HloInstruction* b) {
+    return add(HloInstruction::CreateBinary(shape, opcode, a, b));
+  };
   int64 instruction_count_before = computation_->instruction_count();
 
   HloInstruction* activation = batch_norm->mutable_operand(0);
@@ -450,26 +510,20 @@ Status BatchNormExpanderVisitor::HandleBatchNormGrad(
 
   const int64 size_in_elements = ShapeUtil::ElementsIn(activation_shape);
   const int64 feature_count = activation_shape.dimensions(feature_index);
-  auto elements_per_feature_literal =
-      Literal::CreateR0<float>(size_in_elements / feature_count);
-  TF_ASSIGN_OR_RETURN(elements_per_feature_literal,
-                      elements_per_feature_literal->Convert(ptype));
-  auto elements_per_feature = add(
-      HloInstruction::CreateConstant(std::move(elements_per_feature_literal)));
+  const int64 elements_per_feature_int64 = size_in_elements / feature_count;
 
   auto zero_literal = Literal::CreateR0(0.0f);
   TF_ASSIGN_OR_RETURN(zero_literal, zero_literal->Convert(ptype));
   auto zero = add(HloInstruction::CreateConstant(std::move(zero_literal)));
 
-  auto neg_half_literal = Literal::CreateR0(-0.5f);
-  TF_ASSIGN_OR_RETURN(neg_half_literal, neg_half_literal->Convert(ptype));
-  auto neg_half =
-      add(HloInstruction::CreateConstant(std::move(neg_half_literal)));
-
   auto epsilon_literal = Literal::CreateR0(batch_norm->epsilon());
   TF_ASSIGN_OR_RETURN(epsilon_literal, epsilon_literal->Convert(ptype));
-  auto epsilon =
+  auto epsilon_scalar =
       add(HloInstruction::CreateConstant(std::move(epsilon_literal)));
+  auto epsilon_activation = add(
+      HloInstruction::CreateBroadcast(activation_shape, epsilon_scalar, {}));
+  auto epsilon_feature =
+      add(HloInstruction::CreateBroadcast(feature_shape, epsilon_scalar, {}));
 
   std::vector<int64> dimensions_without_feature;
 
@@ -489,26 +543,21 @@ Status BatchNormExpanderVisitor::HandleBatchNormGrad(
       HloInstruction::CreateBroadcast(activation_shape, mean, {feature_index}));
 
   // rsqrt[Var[X] + epsilon].
-  auto rsqrt_var_add_epsilon_broadcasted = add(HloInstruction::CreateBinary(
-      activation_shape, HloOpcode::kPower,
-      add(HloInstruction::CreateBinary(activation_shape, HloOpcode::kAdd,
-                                       variance_broadcasted, epsilon)),
-      neg_half));
-
-  auto rsqrt_var_add_epsilon = add(HloInstruction::CreateBinary(
-      feature_shape, HloOpcode::kPower,
-      add(HloInstruction::CreateBinary(feature_shape, HloOpcode::kAdd, variance,
-                                       epsilon)),
-      neg_half));
+  auto rsqrt_var_add_epsilon_broadcasted =
+      add(Rsqrt(add_binary(activation_shape, HloOpcode::kAdd,
+                           variance_broadcasted, epsilon_activation)));
+
+  auto rsqrt_var_add_epsilon = add(Rsqrt(
+      add_binary(feature_shape, HloOpcode::kAdd, variance, epsilon_feature)));
 
   // X - E[X].
-  auto activation_minus_mean = add(HloInstruction::CreateBinary(
-      activation_shape, HloOpcode::kSubtract, activation, mean_broadcasted));
+  auto activation_minus_mean = add_binary(
+      activation_shape, HloOpcode::kSubtract, activation, mean_broadcasted);
 
   // Grad[Y] * (X - E[X]).
   auto grad_output_times_activiation_minus_mean =
-      add(HloInstruction::CreateBinary(activation_shape, HloOpcode::kMultiply,
-                                       grad_output, activation_minus_mean));
+      add_binary(activation_shape, HloOpcode::kMultiply, grad_output,
+                 activation_minus_mean);
 
   HloComputation* add_reduce_computation =
       GetOrCreateScalarAddComputation(ptype);
@@ -540,9 +589,9 @@ Status BatchNormExpanderVisitor::HandleBatchNormGrad(
   }
 
   // Grad[scale] = Sum(Grad[Y] * (X - E[X]) * rsqrt[Var[X] + epsilon]).
-  auto grad_scale = add(HloInstruction::CreateBinary(
-      feature_shape, HloOpcode::kMultiply,
-      sum_grad_output_times_activiation_minus_mean, rsqrt_var_add_epsilon));
+  auto grad_scale = add_binary(feature_shape, HloOpcode::kMultiply,
+                               sum_grad_output_times_activiation_minus_mean,
+                               rsqrt_var_add_epsilon);
 
   // I2 = Sum(Grad[Y])
   auto i2 = add(HloInstruction::CreateBroadcast(activation_shape, grad_beta,
@@ -554,39 +603,40 @@ Status BatchNormExpanderVisitor::HandleBatchNormGrad(
       {feature_index}));
 
   // I4 = (X - E[X]) * I3
-  auto i4 = add(HloInstruction::CreateBinary(
-      activation_shape, HloOpcode::kMultiply, i3, activation_minus_mean));
+  auto i4 = add_binary(activation_shape, HloOpcode::kMultiply, i3,
+                       activation_minus_mean);
 
   // I5 = I4 / (Var[X] + epsilon)
-  auto i5 = add(HloInstruction::CreateBinary(
-      activation_shape, HloOpcode::kDivide, i4,
-      add(HloInstruction::CreateBinary(activation_shape, HloOpcode::kAdd,
-                                       variance_broadcasted, epsilon))));
+  auto i5 = add_binary(activation_shape, HloOpcode::kDivide, i4,
+                       add_binary(activation_shape, HloOpcode::kAdd,
+                                  variance_broadcasted, epsilon_activation));
 
   // scale * rsqrt[Var[X] + epsilon] * 1/N
-  auto scale_times_rsqrt_var_add_epsilon = add(HloInstruction::CreateBinary(
-      activation_shape, HloOpcode::kMultiply, scale_broadcasted,
-      rsqrt_var_add_epsilon_broadcasted));
+  auto scale_times_rsqrt_var_add_epsilon =
+      add_binary(activation_shape, HloOpcode::kMultiply, scale_broadcasted,
+                 rsqrt_var_add_epsilon_broadcasted);
 
-  scale_times_rsqrt_var_add_epsilon = add(HloInstruction::CreateBinary(
-      activation_shape, HloOpcode::kDivide, scale_times_rsqrt_var_add_epsilon,
-      elements_per_feature));
+  scale_times_rsqrt_var_add_epsilon =
+      add(Mean(elements_per_feature_int64, scale_times_rsqrt_var_add_epsilon));
 
-  auto i1 =
-      add(HloInstruction::CreateBinary(activation_shape, HloOpcode::kMultiply,
-                                       grad_output, elements_per_feature));
+  auto elements_per_feature_literal =
+      Literal::CreateR0<float>(elements_per_feature_int64);
+  TF_ASSIGN_OR_RETURN(elements_per_feature_literal,
+                      elements_per_feature_literal->Convert(ptype));
+  auto elements_per_feature = add(
+      HloInstruction::CreateConstant(std::move(elements_per_feature_literal)));
+  auto i1 = add_binary(activation_shape, HloOpcode::kMultiply, grad_output,
+                       add(HloInstruction::CreateBroadcast(
+                           activation_shape, elements_per_feature, {})));
 
   // I6 = I1 - I2 - I5
-  auto i6 = add(HloInstruction::CreateBinary(
+  auto i6 = add_binary(
       activation_shape, HloOpcode::kSubtract,
-      add(HloInstruction::CreateBinary(activation_shape, HloOpcode::kSubtract,
-                                       i1, i2)),
-      i5));
+      add_binary(activation_shape, HloOpcode::kSubtract, i1, i2), i5);
 
   // Grad[X] = scale * rsqrt[Var[X] + epsilon] * 1/N * I6.
-  auto grad_activation =
-      add(HloInstruction::CreateBinary(activation_shape, HloOpcode::kMultiply,
-                                       scale_times_rsqrt_var_add_epsilon, i6));
+  auto grad_activation = add_binary(activation_shape, HloOpcode::kMultiply,
+                                    scale_times_rsqrt_var_add_epsilon, i6);
   auto tuple =
       HloInstruction::CreateTuple({grad_activation, grad_scale, grad_beta});
   if (batch_norm->has_sharding()) {
-- 
GitLab


From ba6d01807feaeaeb10272c9e55a7002306b63db5 Mon Sep 17 00:00:00 2001
From: Peter Hawkins <phawkins@google.com>
Date: Thu, 31 May 2018 17:03:07 -0700
Subject: [PATCH 135/610] [TF:XLA] Preliminary support for tpu.replicate()
 inside of TF control flow (such as tf.while_loop()).

Register the remaining control-flow operators on XLA devices.

PiperOrigin-RevId: 198803131
---
 tensorflow/compiler/jit/xla_device_ops.h      | 11 ++-
 tensorflow/contrib/tpu/python/tpu/tpu.py      | 92 ++++++++++++++++++-
 tensorflow/contrib/tpu/python/tpu/tpu_test.py |  4 +-
 tensorflow/core/kernels/control_flow_ops.cc   | 22 ++---
 tensorflow/core/kernels/control_flow_ops.h    | 16 ++++
 5 files changed, 122 insertions(+), 23 deletions(-)

diff --git a/tensorflow/compiler/jit/xla_device_ops.h b/tensorflow/compiler/jit/xla_device_ops.h
index b27c32e9bc..0c49286acd 100644
--- a/tensorflow/compiler/jit/xla_device_ops.h
+++ b/tensorflow/compiler/jit/xla_device_ops.h
@@ -95,7 +95,16 @@ class XlaAssignVariableOp : public AsyncOpKernel {
   REGISTER_KERNEL_BUILDER(Name("Switch").Device(DEVICE).HostMemory("pred"),    \
                           SwitchOp);                                           \
   REGISTER_KERNEL_BUILDER(                                                     \
-      Name("Merge").Device(DEVICE).HostMemory("value_index"), MergeOp);
+      Name("Merge").Device(DEVICE).HostMemory("value_index"), MergeOp);        \
+  REGISTER_KERNEL_BUILDER(Name("Enter").Device(DEVICE), EnterOp);              \
+  REGISTER_KERNEL_BUILDER(Name("Exit").Device(DEVICE), ExitOp);                \
+  REGISTER_KERNEL_BUILDER(Name("NextIteration").Device(DEVICE),                \
+                          NextIterationOp);                                    \
+  REGISTER_KERNEL_BUILDER(Name("LoopCond")                                     \
+                              .Device(DEVICE)                                  \
+                              .HostMemory("input")                             \
+                              .HostMemory("output"),                           \
+                          LoopCondOp);
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu.py b/tensorflow/contrib/tpu/python/tpu/tpu.py
index 612cd0114b..4b777df6b9 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu.py
@@ -126,7 +126,19 @@ class TPUReplicateContext(control_flow_ops.XLAControlFlowContext):
   outside the replicated computation.
   """
 
-  def __init__(self, name, num_replicas):
+  def __init__(self, name, num_replicas, pivot):
+    """Builds a new TPUReplicateContext.
+
+    Args:
+      name: a unique name for the context, used to populate the `_tpu_replicate`
+        attribute.
+      num_replicas: an integer that gives the number of replicas for the
+        computation.
+      pivot: a pivot node. Nodes in the TPUReplicateContext that do not have any
+        inputs will have a control dependency on the pivot node. This ensures
+        that nodes are correctly included in any enclosing control flow
+        contexts.
+    """
     super(TPUReplicateContext, self).__init__()
     self._num_replicas = num_replicas
     self._outer_device_function_stack = None
@@ -138,6 +150,7 @@ class TPUReplicateContext(control_flow_ops.XLAControlFlowContext):
     self._host_compute_core = []
     self._name = name
     self._unsupported_ops = []
+    self._pivot = pivot
 
   def report_unsupported_operations(self):
     if self._unsupported_ops:
@@ -262,9 +275,6 @@ class TPUReplicateContext(control_flow_ops.XLAControlFlowContext):
       self._outer_device_function_stack = list(graph._device_function_stack)  # pylint: disable=protected-access
     super(TPUReplicateContext, self).Enter()
 
-  def Exit(self):
-    super(TPUReplicateContext, self).Exit()
-
   def HostComputeCore(self):
     return self._host_compute_core
 
@@ -300,10 +310,69 @@ class TPUReplicateContext(control_flow_ops.XLAControlFlowContext):
       op.graph.prevent_feeding(op)
       op.graph.prevent_fetching(op)
 
+    # Remove any control edges from outer control flow contexts. These may cause
+    # mismatched frame errors.
+    control_inputs, external_inputs = self._RemoveExternalControlEdges(op)
+
+    if not op.inputs:
+      # Add a control edge from the control pivot to this op.
+      if not control_inputs:
+        # pylint: disable=protected-access
+        op._add_control_input(self.GetControlPivot())
+        # pylint: enable=protected-access
+    else:
+      for index in xrange(len(op.inputs)):
+        x = op.inputs[index]
+        real_x = self.AddValue(x)
+        if real_x != x:
+          op._update_input(index, real_x)  # pylint: disable=protected-access
+
+    if external_inputs:
+      # Use an identity to pull control inputs as data inputs. Note that we
+      # ignore ops which don't have outputs. TODO(phawkins): fix that.
+      with ops.control_dependencies(None):
+        self.Enter()
+        external_inputs = [
+            array_ops.identity(x.outputs[0]).op
+            for x in external_inputs
+            if x.outputs
+        ]
+        self.Exit()
+      # pylint: disable=protected-access
+      op._add_control_inputs(external_inputs)
+      # pylint: enable=protected-access
+
+    # Mark op's outputs as seen by this context and any outer contexts.
+    output_names = [x.name for x in op.outputs]
+    context = self
+    while context is not None:
+      # pylint: disable=protected-access
+      context._values.update(output_names)
+      context = context._outer_context
+      # pylint: enable=protected-access
+
+    if self._outer_context:
+      self._outer_context.AddInnerOp(op)
+
   def AddValue(self, val):
+    if val.name in self._values:
+      # Use the real value if it comes from outer context.
+      result = self._external_values.get(val.name)
+      return val if result is None else result
+
     result = val
+    self._values.add(val.name)
     if self._outer_context:
       result = self._outer_context.AddValue(val)
+      self._values.add(result.name)
+
+    result.op.graph.prevent_fetching(result.op)
+    # pylint: disable=protected-access
+    result.op._set_control_flow_context(self)
+    # pylint: enable=protected-access
+
+    self._external_values[val.name] = result
+
     return result
 
   def AddInnerOp(self, op):
@@ -319,6 +388,16 @@ class TPUReplicateContext(control_flow_ops.XLAControlFlowContext):
     # grad_state should be as if this is the top-level gradient state.
     return None
 
+  @property
+  def back_prop(self):
+    """Forwards to the enclosing while context, if any."""
+    if self.GetWhileContext():
+      return self.GetWhileContext().back_prop
+    return False
+
+  def GetControlPivot(self):
+    return self._pivot
+
 
 def outside_compilation(computation, *args, **kwargs):
   """Builds part of a computation outside any current TPU replicate scope.
@@ -505,7 +584,9 @@ def split_compile_and_replicate(computation,
         tpu_ops.tpu_replicated_input(replicas, name="input{}".format(i)))
 
   cluster_name = graph.unique_name("cluster")
-  context = TPUReplicateContext(name=cluster_name, num_replicas=num_replicas)
+  pivot = control_flow_ops.no_op(name=cluster_name + "/pivot")
+  context = TPUReplicateContext(
+      name=cluster_name, num_replicas=num_replicas, pivot=pivot)
   try:
     context.Enter()
 
@@ -582,6 +663,7 @@ def split_compile_and_replicate(computation,
       with ops.device(t.device if t.device else core(0)):
         new_output_tensors.append(array_ops.identity(t))
     output_tensors = new_output_tensors
+    context.ExitResult(output_tensors)
   finally:
     context.report_unsupported_operations()
     context.Exit()
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_test.py b/tensorflow/contrib/tpu/python/tpu/tpu_test.py
index c3882b8a27..6bdaa528f9 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_test.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_test.py
@@ -26,6 +26,7 @@ from tensorflow.contrib.tpu.python.tpu import training_loop
 from tensorflow.python.framework import dtypes
 from tensorflow.python.layers import convolutional
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import control_flow_util
 from tensorflow.python.ops import math_ops
 
@@ -37,7 +38,8 @@ class TPUContextTest(test.TestCase):
   def testIsInContext(self):
     """Test that control_flow_util can check that we're in a TPU context."""
     z1 = array_ops.identity(1)
-    context = tpu.TPUReplicateContext(b"context", 1)
+    pivot = control_flow_ops.no_op()
+    context = tpu.TPUReplicateContext(b"context", 1, pivot=pivot)
     context.Enter()
     z2 = array_ops.identity(1)
     context.Exit()
diff --git a/tensorflow/core/kernels/control_flow_ops.cc b/tensorflow/core/kernels/control_flow_ops.cc
index 7d5d54e5be..ebf844d75f 100644
--- a/tensorflow/core/kernels/control_flow_ops.cc
+++ b/tensorflow/core/kernels/control_flow_ops.cc
@@ -587,24 +587,14 @@ REGISTER_SYCL_HOST_KERNEL(string);
 #undef REGISTER_SYCL_HOST_KERNEL
 #endif  // TENSORFLOW_USE_SYCL
 
-// A LoopCond op has one input and one output. The input is a boolean
-// scalar representing the taken branches of the "pivot" Switch that
-// determines loop termination. As a contract, any high-level front-end
-// should always use port '0' of the "pivot" switches for loop exit.
-class LoopCondOp : public OpKernel {
- public:
-  explicit LoopCondOp(OpKernelConstruction* context) : OpKernel(context) {}
-
-  void Compute(OpKernelContext* context) override {
-    context->set_output(0, context->input(0));
-  }
+LoopCondOp::LoopCondOp(OpKernelConstruction* context) : OpKernel(context) {}
+LoopCondOp::~LoopCondOp() = default;
 
-  bool IsExpensive() override { return false; }
-
-  ~LoopCondOp() override {}
+void LoopCondOp::Compute(OpKernelContext* context) {
+  context->set_output(0, context->input(0));
+}
 
-  TF_DISALLOW_COPY_AND_ASSIGN(LoopCondOp);
-};
+bool LoopCondOp::IsExpensive() { return false; }
 
 REGISTER_KERNEL_BUILDER(Name("LoopCond").Device(DEVICE_CPU), LoopCondOp);
 REGISTER_KERNEL_BUILDER(Name("LoopCond")
diff --git a/tensorflow/core/kernels/control_flow_ops.h b/tensorflow/core/kernels/control_flow_ops.h
index 4838f2e2bf..8edbcc9077 100644
--- a/tensorflow/core/kernels/control_flow_ops.h
+++ b/tensorflow/core/kernels/control_flow_ops.h
@@ -97,6 +97,22 @@ class NextIterationOp : public OpKernel {
   TF_DISALLOW_COPY_AND_ASSIGN(NextIterationOp);
 };
 
+// A LoopCond op has one input and one output. The input is a boolean
+// scalar representing the taken branches of the "pivot" Switch that
+// determines loop termination. As a contract, any high-level front-end
+// should always use port '0' of the "pivot" switches for loop exit.
+class LoopCondOp : public OpKernel {
+ public:
+  explicit LoopCondOp(OpKernelConstruction* context);
+  ~LoopCondOp() override;
+
+  void Compute(OpKernelContext* context) override;
+
+  bool IsExpensive() override;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(LoopCondOp);
+};
+
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_KERNELS_CONTROL_FLOW_OPS_H_
-- 
GitLab


From 217d73ceba3248c3570be72300a7234d2cef142b Mon Sep 17 00:00:00 2001
From: Peter Hawkins <phawkins@google.com>
Date: Thu, 31 May 2018 17:17:13 -0700
Subject: [PATCH 136/610] Mark tensorflow/contrib/learn:estimator_test as
 optonly because it is flaky due to timeouts without optimization.

PiperOrigin-RevId: 198804880
---
 tensorflow/contrib/learn/BUILD | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/contrib/learn/BUILD b/tensorflow/contrib/learn/BUILD
index 0fdbe8f630..b56a88659b 100644
--- a/tensorflow/contrib/learn/BUILD
+++ b/tensorflow/contrib/learn/BUILD
@@ -284,6 +284,7 @@ py_test(
     tags = [
         "manual",
         "noasan",  # times out
+        "optonly",  # test is flaky without optimization.
     ],
     deps = [
         ":learn",
-- 
GitLab


From 30faaee8154575f834050590ebe0bf6ff3f9c176 Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Thu, 31 May 2018 17:18:54 -0700
Subject: [PATCH 137/610] [tf.data] Update `DatasetBase::DebugString()` to be
 const in the docs.

PiperOrigin-RevId: 198805143
---
 tensorflow/docs_src/extend/new_data_formats.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/docs_src/extend/new_data_formats.md b/tensorflow/docs_src/extend/new_data_formats.md
index 1a4309f373..d1d1f69766 100644
--- a/tensorflow/docs_src/extend/new_data_formats.md
+++ b/tensorflow/docs_src/extend/new_data_formats.md
@@ -124,7 +124,7 @@ class MyReaderDatasetOp : public DatasetOpKernel {
       return *shapes;
     }
 
-    string DebugString() override { return "MyReaderDatasetOp::Dataset"; }
+    string DebugString() const override { return "MyReaderDatasetOp::Dataset"; }
 
    protected:
     // Optional: Implementation of `GraphDef` serialization for this dataset.
-- 
GitLab


From c3b62c38ebd73c98ffa5613865f4c01fa5ff6ae7 Mon Sep 17 00:00:00 2001
From: Justin Lebar <jlebar@google.com>
Date: Thu, 31 May 2018 17:19:25 -0700
Subject: [PATCH 138/610] [XLA] Fix handling of CustomCall's window and dnums.

CustomCall can have a window and convolution-dimension-numbers, so
HloInstruction needs to handle this in Clone() and Identical().

PiperOrigin-RevId: 198805211
---
 tensorflow/compiler/xla/service/BUILD         |  1 +
 .../compiler/xla/service/hlo_instruction.cc   | 21 ++++++++
 .../xla/service/hlo_instruction_test.cc       | 50 +++++++++++++++++++
 3 files changed, 72 insertions(+)

diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index aa416312ad..2b14b63ea8 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -426,6 +426,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla:window_util",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/compiler/xla/tools/parser:hlo_parser",
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc
index a68075ef20..4095b3d337 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction.cc
@@ -1330,6 +1330,14 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
       break;
     case HloOpcode::kCustomCall:
       clone = CreateCustomCall(shape, new_operands, custom_call_target_);
+      if (window_ != nullptr) {
+        clone->window_ = MakeUnique<Window>(*window_);
+      }
+      if (convolution_dimension_numbers_ != nullptr) {
+        clone->convolution_dimension_numbers_ =
+            MakeUnique<ConvolutionDimensionNumbers>(
+                *convolution_dimension_numbers_);
+      }
       break;
     case HloOpcode::kHostCompute:
       clone = CreateHostCompute(shape, new_operands, channel_name_,
@@ -1882,6 +1890,19 @@ bool HloInstruction::IdenticalSlowPath(
     case HloOpcode::kMap:
       return eq_computations(to_apply(), other.to_apply());
     case HloOpcode::kCustomCall:
+      if ((window_ == nullptr) != (other.window_ == nullptr) ||
+          (window_ != nullptr &&
+           !protobuf_util::ProtobufEquals(window(), other.window()))) {
+        return false;
+      }
+      if ((convolution_dimension_numbers_ == nullptr) !=
+              (other.convolution_dimension_numbers_ == nullptr) ||
+          (convolution_dimension_numbers_ != nullptr &&
+           !protobuf_util::ProtobufEquals(
+               convolution_dimension_numbers(),
+               other.convolution_dimension_numbers()))) {
+        return false;
+      }
       return custom_call_target_ == other.custom_call_target_;
     case HloOpcode::kReverse:
       return dimensions() == other.dimensions();
diff --git a/tensorflow/compiler/xla/service/hlo_instruction_test.cc b/tensorflow/compiler/xla/service/hlo_instruction_test.cc
index d1b6bc726d..a1a8814384 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction_test.cc
@@ -30,6 +30,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
 #include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/compiler/xla/window_util.h"
 
 namespace xla {
 namespace {
@@ -1558,5 +1559,54 @@ TEST_F(HloInstructionTest, IdenticalAccountsForBackendConfig) {
   EXPECT_FALSE(add1->Identical(*add2));
 }
 
+TEST_F(HloInstructionTest, IdenticalAccountsForCustomCallWindow) {
+  auto instr1 = HloInstruction::CreateCustomCall(ShapeUtil::MakeShape(F32, {}),
+                                                 /*operands=*/{},
+                                                 /*custom_call_target=*/"foo");
+  auto instr2 = instr1->Clone();
+  EXPECT_TRUE(instr1->Identical(*instr2));
+
+  Window w = window_util::MakeWindow({1, 2, 3});
+  instr1->set_window(w);
+  EXPECT_FALSE(instr1->Identical(*instr2));
+}
+
+TEST_F(HloInstructionTest, IdenticalAccountsForCustomCallDnums) {
+  auto instr1 = HloInstruction::CreateCustomCall(ShapeUtil::MakeShape(F32, {}),
+                                                 /*operands=*/{},
+                                                 /*custom_call_target=*/"foo");
+  auto instr2 = instr1->Clone();
+  EXPECT_TRUE(instr1->Identical(*instr2));
+
+  ConvolutionDimensionNumbers dnums;
+  dnums.set_output_batch_dimension(42);
+  instr1->set_convolution_dimension_numbers(dnums);
+  EXPECT_FALSE(instr1->Identical(*instr2));
+}
+
+TEST_F(HloInstructionTest, CloneWindowOnCustomCall) {
+  auto instr = HloInstruction::CreateCustomCall(ShapeUtil::MakeShape(F32, {}),
+                                                /*operands=*/{},
+                                                /*custom_call_target=*/"foo");
+  Window w = window_util::MakeWindow({1, 2, 3});
+  instr->set_window(w);
+  auto clone = instr->Clone();
+  EXPECT_TRUE(protobuf_util::ProtobufEquals(clone->window(), w))
+      << clone->window().DebugString();
+}
+
+TEST_F(HloInstructionTest, CloneDnumsOnCustomCall) {
+  auto instr = HloInstruction::CreateCustomCall(ShapeUtil::MakeShape(F32, {}),
+                                                /*operands=*/{},
+                                                /*custom_call_target=*/"foo");
+  ConvolutionDimensionNumbers dnums;
+  dnums.set_output_batch_dimension(42);
+  instr->set_convolution_dimension_numbers(dnums);
+  auto clone = instr->Clone();
+  EXPECT_TRUE(protobuf_util::ProtobufEquals(
+      clone->convolution_dimension_numbers(), dnums))
+      << clone->convolution_dimension_numbers().DebugString();
+}
+
 }  // namespace
 }  // namespace xla
-- 
GitLab


From 179cc37f4212b403517d44053814dcb4570508b8 Mon Sep 17 00:00:00 2001
From: Saurabh Saxena <srbs@google.com>
Date: Thu, 31 May 2018 17:20:31 -0700
Subject: [PATCH 139/610] Throw a more informative error message when
 checkpointing an input pipeline containing a ShuffleDataset with
 reshuffle_each_iteration=True. This is a temporary fix till we figure out how
 to handle this use-case.

PiperOrigin-RevId: 198805344
---
 .../core/kernels/data/shuffle_dataset_op.cc     | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/tensorflow/core/kernels/data/shuffle_dataset_op.cc b/tensorflow/core/kernels/data/shuffle_dataset_op.cc
index 6a51010fed..3438199ebd 100644
--- a/tensorflow/core/kernels/data/shuffle_dataset_op.cc
+++ b/tensorflow/core/kernels/data/shuffle_dataset_op.cc
@@ -378,6 +378,23 @@ class ShuffleDatasetOp : public ShuffleDatasetOpBase {
           iterator_seed2));
     }
 
+   protected:
+    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      return errors::Unimplemented(
+          "Checkpointing ShufflingDataset with reshuffle_each_iteration=true "
+          "is not supported.\n"
+          "If you have a ds.shuffle(buffer_size).repeat(count) in your input "
+          "pipeline, replace it with "
+          "ds.apply(tf.contrib.data.shuffle_and_repeat(buffer_size, count)).\n"
+          "If you iterate over your dataset once, change shuffle(buffer_size) "
+          "to shuffle(buffer_size, reshuffle_each_iteration=False).\n"
+          "If you are using Dataset.list_files(pattern), change it to "
+          "Dataset.list_files(pattern, shuffle=False) and manually shuffle "
+          "the list of files using shuffle_and_repeat as above or using "
+          "ds.shuffle with reshuffle_each_iteration=False.");
+    }
+
    private:
     const int64 seed_;
     const int64 seed2_;
-- 
GitLab


From c7c95eee2df578f222fd74cac36ec0ce5c16bec4 Mon Sep 17 00:00:00 2001
From: Peter Hawkins <phawkins@google.com>
Date: Thu, 31 May 2018 18:09:50 -0700
Subject: [PATCH 140/610] Automated g4 rollback of changelist 198803131

PiperOrigin-RevId: 198810875
---
 tensorflow/compiler/jit/xla_device_ops.h      | 11 +--
 tensorflow/contrib/tpu/python/tpu/tpu.py      | 92 +------------------
 tensorflow/contrib/tpu/python/tpu/tpu_test.py |  4 +-
 tensorflow/core/kernels/control_flow_ops.cc   | 22 +++--
 tensorflow/core/kernels/control_flow_ops.h    | 16 ----
 5 files changed, 23 insertions(+), 122 deletions(-)

diff --git a/tensorflow/compiler/jit/xla_device_ops.h b/tensorflow/compiler/jit/xla_device_ops.h
index 0c49286acd..b27c32e9bc 100644
--- a/tensorflow/compiler/jit/xla_device_ops.h
+++ b/tensorflow/compiler/jit/xla_device_ops.h
@@ -95,16 +95,7 @@ class XlaAssignVariableOp : public AsyncOpKernel {
   REGISTER_KERNEL_BUILDER(Name("Switch").Device(DEVICE).HostMemory("pred"),    \
                           SwitchOp);                                           \
   REGISTER_KERNEL_BUILDER(                                                     \
-      Name("Merge").Device(DEVICE).HostMemory("value_index"), MergeOp);        \
-  REGISTER_KERNEL_BUILDER(Name("Enter").Device(DEVICE), EnterOp);              \
-  REGISTER_KERNEL_BUILDER(Name("Exit").Device(DEVICE), ExitOp);                \
-  REGISTER_KERNEL_BUILDER(Name("NextIteration").Device(DEVICE),                \
-                          NextIterationOp);                                    \
-  REGISTER_KERNEL_BUILDER(Name("LoopCond")                                     \
-                              .Device(DEVICE)                                  \
-                              .HostMemory("input")                             \
-                              .HostMemory("output"),                           \
-                          LoopCondOp);
+      Name("Merge").Device(DEVICE).HostMemory("value_index"), MergeOp);
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu.py b/tensorflow/contrib/tpu/python/tpu/tpu.py
index 4b777df6b9..612cd0114b 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu.py
@@ -126,19 +126,7 @@ class TPUReplicateContext(control_flow_ops.XLAControlFlowContext):
   outside the replicated computation.
   """
 
-  def __init__(self, name, num_replicas, pivot):
-    """Builds a new TPUReplicateContext.
-
-    Args:
-      name: a unique name for the context, used to populate the `_tpu_replicate`
-        attribute.
-      num_replicas: an integer that gives the number of replicas for the
-        computation.
-      pivot: a pivot node. Nodes in the TPUReplicateContext that do not have any
-        inputs will have a control dependency on the pivot node. This ensures
-        that nodes are correctly included in any enclosing control flow
-        contexts.
-    """
+  def __init__(self, name, num_replicas):
     super(TPUReplicateContext, self).__init__()
     self._num_replicas = num_replicas
     self._outer_device_function_stack = None
@@ -150,7 +138,6 @@ class TPUReplicateContext(control_flow_ops.XLAControlFlowContext):
     self._host_compute_core = []
     self._name = name
     self._unsupported_ops = []
-    self._pivot = pivot
 
   def report_unsupported_operations(self):
     if self._unsupported_ops:
@@ -275,6 +262,9 @@ class TPUReplicateContext(control_flow_ops.XLAControlFlowContext):
       self._outer_device_function_stack = list(graph._device_function_stack)  # pylint: disable=protected-access
     super(TPUReplicateContext, self).Enter()
 
+  def Exit(self):
+    super(TPUReplicateContext, self).Exit()
+
   def HostComputeCore(self):
     return self._host_compute_core
 
@@ -310,69 +300,10 @@ class TPUReplicateContext(control_flow_ops.XLAControlFlowContext):
       op.graph.prevent_feeding(op)
       op.graph.prevent_fetching(op)
 
-    # Remove any control edges from outer control flow contexts. These may cause
-    # mismatched frame errors.
-    control_inputs, external_inputs = self._RemoveExternalControlEdges(op)
-
-    if not op.inputs:
-      # Add a control edge from the control pivot to this op.
-      if not control_inputs:
-        # pylint: disable=protected-access
-        op._add_control_input(self.GetControlPivot())
-        # pylint: enable=protected-access
-    else:
-      for index in xrange(len(op.inputs)):
-        x = op.inputs[index]
-        real_x = self.AddValue(x)
-        if real_x != x:
-          op._update_input(index, real_x)  # pylint: disable=protected-access
-
-    if external_inputs:
-      # Use an identity to pull control inputs as data inputs. Note that we
-      # ignore ops which don't have outputs. TODO(phawkins): fix that.
-      with ops.control_dependencies(None):
-        self.Enter()
-        external_inputs = [
-            array_ops.identity(x.outputs[0]).op
-            for x in external_inputs
-            if x.outputs
-        ]
-        self.Exit()
-      # pylint: disable=protected-access
-      op._add_control_inputs(external_inputs)
-      # pylint: enable=protected-access
-
-    # Mark op's outputs as seen by this context and any outer contexts.
-    output_names = [x.name for x in op.outputs]
-    context = self
-    while context is not None:
-      # pylint: disable=protected-access
-      context._values.update(output_names)
-      context = context._outer_context
-      # pylint: enable=protected-access
-
-    if self._outer_context:
-      self._outer_context.AddInnerOp(op)
-
   def AddValue(self, val):
-    if val.name in self._values:
-      # Use the real value if it comes from outer context.
-      result = self._external_values.get(val.name)
-      return val if result is None else result
-
     result = val
-    self._values.add(val.name)
     if self._outer_context:
       result = self._outer_context.AddValue(val)
-      self._values.add(result.name)
-
-    result.op.graph.prevent_fetching(result.op)
-    # pylint: disable=protected-access
-    result.op._set_control_flow_context(self)
-    # pylint: enable=protected-access
-
-    self._external_values[val.name] = result
-
     return result
 
   def AddInnerOp(self, op):
@@ -388,16 +319,6 @@ class TPUReplicateContext(control_flow_ops.XLAControlFlowContext):
     # grad_state should be as if this is the top-level gradient state.
     return None
 
-  @property
-  def back_prop(self):
-    """Forwards to the enclosing while context, if any."""
-    if self.GetWhileContext():
-      return self.GetWhileContext().back_prop
-    return False
-
-  def GetControlPivot(self):
-    return self._pivot
-
 
 def outside_compilation(computation, *args, **kwargs):
   """Builds part of a computation outside any current TPU replicate scope.
@@ -584,9 +505,7 @@ def split_compile_and_replicate(computation,
         tpu_ops.tpu_replicated_input(replicas, name="input{}".format(i)))
 
   cluster_name = graph.unique_name("cluster")
-  pivot = control_flow_ops.no_op(name=cluster_name + "/pivot")
-  context = TPUReplicateContext(
-      name=cluster_name, num_replicas=num_replicas, pivot=pivot)
+  context = TPUReplicateContext(name=cluster_name, num_replicas=num_replicas)
   try:
     context.Enter()
 
@@ -663,7 +582,6 @@ def split_compile_and_replicate(computation,
       with ops.device(t.device if t.device else core(0)):
         new_output_tensors.append(array_ops.identity(t))
     output_tensors = new_output_tensors
-    context.ExitResult(output_tensors)
   finally:
     context.report_unsupported_operations()
     context.Exit()
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_test.py b/tensorflow/contrib/tpu/python/tpu/tpu_test.py
index 6bdaa528f9..c3882b8a27 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_test.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_test.py
@@ -26,7 +26,6 @@ from tensorflow.contrib.tpu.python.tpu import training_loop
 from tensorflow.python.framework import dtypes
 from tensorflow.python.layers import convolutional
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import control_flow_util
 from tensorflow.python.ops import math_ops
 
@@ -38,8 +37,7 @@ class TPUContextTest(test.TestCase):
   def testIsInContext(self):
     """Test that control_flow_util can check that we're in a TPU context."""
     z1 = array_ops.identity(1)
-    pivot = control_flow_ops.no_op()
-    context = tpu.TPUReplicateContext(b"context", 1, pivot=pivot)
+    context = tpu.TPUReplicateContext(b"context", 1)
     context.Enter()
     z2 = array_ops.identity(1)
     context.Exit()
diff --git a/tensorflow/core/kernels/control_flow_ops.cc b/tensorflow/core/kernels/control_flow_ops.cc
index ebf844d75f..7d5d54e5be 100644
--- a/tensorflow/core/kernels/control_flow_ops.cc
+++ b/tensorflow/core/kernels/control_flow_ops.cc
@@ -587,14 +587,24 @@ REGISTER_SYCL_HOST_KERNEL(string);
 #undef REGISTER_SYCL_HOST_KERNEL
 #endif  // TENSORFLOW_USE_SYCL
 
-LoopCondOp::LoopCondOp(OpKernelConstruction* context) : OpKernel(context) {}
-LoopCondOp::~LoopCondOp() = default;
+// A LoopCond op has one input and one output. The input is a boolean
+// scalar representing the taken branches of the "pivot" Switch that
+// determines loop termination. As a contract, any high-level front-end
+// should always use port '0' of the "pivot" switches for loop exit.
+class LoopCondOp : public OpKernel {
+ public:
+  explicit LoopCondOp(OpKernelConstruction* context) : OpKernel(context) {}
 
-void LoopCondOp::Compute(OpKernelContext* context) {
-  context->set_output(0, context->input(0));
-}
+  void Compute(OpKernelContext* context) override {
+    context->set_output(0, context->input(0));
+  }
 
-bool LoopCondOp::IsExpensive() { return false; }
+  bool IsExpensive() override { return false; }
+
+  ~LoopCondOp() override {}
+
+  TF_DISALLOW_COPY_AND_ASSIGN(LoopCondOp);
+};
 
 REGISTER_KERNEL_BUILDER(Name("LoopCond").Device(DEVICE_CPU), LoopCondOp);
 REGISTER_KERNEL_BUILDER(Name("LoopCond")
diff --git a/tensorflow/core/kernels/control_flow_ops.h b/tensorflow/core/kernels/control_flow_ops.h
index 8edbcc9077..4838f2e2bf 100644
--- a/tensorflow/core/kernels/control_flow_ops.h
+++ b/tensorflow/core/kernels/control_flow_ops.h
@@ -97,22 +97,6 @@ class NextIterationOp : public OpKernel {
   TF_DISALLOW_COPY_AND_ASSIGN(NextIterationOp);
 };
 
-// A LoopCond op has one input and one output. The input is a boolean
-// scalar representing the taken branches of the "pivot" Switch that
-// determines loop termination. As a contract, any high-level front-end
-// should always use port '0' of the "pivot" switches for loop exit.
-class LoopCondOp : public OpKernel {
- public:
-  explicit LoopCondOp(OpKernelConstruction* context);
-  ~LoopCondOp() override;
-
-  void Compute(OpKernelContext* context) override;
-
-  bool IsExpensive() override;
-
-  TF_DISALLOW_COPY_AND_ASSIGN(LoopCondOp);
-};
-
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_KERNELS_CONTROL_FLOW_OPS_H_
-- 
GitLab


From 2e272dbca6600991599e55a7ff7cfa668b8403aa Mon Sep 17 00:00:00 2001
From: Allen Lavoie <allenl@google.com>
Date: Thu, 31 May 2018 18:17:48 -0700
Subject: [PATCH 141/610] Make the TFOptimizer wrapper checkpointable.

TensorFlow Optimizers compiled with a Model will now have their state saved and restored with save_weights/load_weights.

PiperOrigin-RevId: 198811639
---
 tensorflow/python/keras/models_test.py | 21 +++++++++++++++++++++
 tensorflow/python/keras/optimizers.py  |  3 ++-
 2 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/keras/models_test.py b/tensorflow/python/keras/models_test.py
index 01fb41b8ee..c616d8f24f 100644
--- a/tensorflow/python/keras/models_test.py
+++ b/tensorflow/python/keras/models_test.py
@@ -18,10 +18,14 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import os
+
 import numpy as np
 
 from tensorflow.python import keras
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
+from tensorflow.python.training import adam
 
 
 class TestModelCloning(test.TestCase):
@@ -123,5 +127,22 @@ class TestModelCloning(test.TestCase):
       keras.models._clone_sequential_model(seq_model, input_tensors=y)
 
 
+class CheckpointingTests(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_optimizer_dependency(self):
+    model = keras.models.Sequential()
+    model.add(keras.layers.Dense(1, input_shape=(4,)))
+    opt = adam.AdamOptimizer(0.01)
+    model.compile(optimizer=opt, loss='mse')
+    model.fit(x=np.array([[1., 2., 3., 4.]]), y=[1.], epochs=2)
+    save_prefix = os.path.join(self.get_temp_dir(), 'ckpt')
+    beta1_power, _ = opt._get_beta_accumulators()
+    self.evaluate(beta1_power.assign(12.))
+    model.save_weights(save_prefix)
+    self.evaluate(beta1_power.assign(13.))
+    model.load_weights(save_prefix)
+    self.assertEqual(12., self.evaluate(beta1_power))
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/optimizers.py b/tensorflow/python/keras/optimizers.py
index febbda4df6..f58aeaea1a 100644
--- a/tensorflow/python/keras/optimizers.py
+++ b/tensorflow/python/keras/optimizers.py
@@ -35,6 +35,7 @@ from tensorflow.python.ops import state_ops
 from tensorflow.python.training import distribute as distribute_lib
 from tensorflow.python.training import optimizer as tf_optimizer_module
 from tensorflow.python.training import training_util
+from tensorflow.python.training.checkpointable import base as checkpointable
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -718,7 +719,7 @@ class Nadam(Optimizer):
     return dict(list(base_config.items()) + list(config.items()))
 
 
-class TFOptimizer(Optimizer):
+class TFOptimizer(Optimizer, checkpointable.Checkpointable):
   """Wrapper class for native TensorFlow optimizers.
   """
 
-- 
GitLab


From 2f97b2f2796b2b1df781066b0efe443750ac5a6b Mon Sep 17 00:00:00 2001
From: Rachel Lim <rachelim@google.com>
Date: Thu, 31 May 2018 18:28:20 -0700
Subject: [PATCH 142/610] [tf.data] Changed parsing logic for CsvDataset for
 better performance and correctness

PiperOrigin-RevId: 198812512
---
 .../contrib/data/kernels/csv_dataset_op.cc    | 542 +++++++++++++-----
 .../contrib/data/python/kernel_tests/BUILD    |   1 +
 .../kernel_tests/csv_dataset_op_test.py       | 292 ++++++++--
 tensorflow/core/lib/strings/numbers.cc        |  26 +
 tensorflow/core/lib/strings/numbers.h         |   2 +
 5 files changed, 660 insertions(+), 203 deletions(-)

diff --git a/tensorflow/contrib/data/kernels/csv_dataset_op.cc b/tensorflow/contrib/data/kernels/csv_dataset_op.cc
index 97cc0bc6c9..e88ad3dc32 100644
--- a/tensorflow/contrib/data/kernels/csv_dataset_op.cc
+++ b/tensorflow/contrib/data/kernels/csv_dataset_op.cc
@@ -18,7 +18,6 @@ limitations under the License.
 #include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/shape_inference.h"
-#include "tensorflow/core/lib/io/buffered_inputstream.h"
 #include "tensorflow/core/lib/io/random_inputstream.h"
 
 namespace tensorflow {
@@ -103,12 +102,11 @@ class CSVDatasetOp : public DatasetOpKernel {
     OP_REQUIRES(
         ctx, select_cols.empty() || select_cols.front() >= 0,
         errors::InvalidArgument("select_cols should be non-negative indices"));
-    bool select_all_cols = select_cols.empty();
 
-    *output = new Dataset(
-        ctx, std::move(filenames), header, buffer_size, output_types_,
-        output_shapes_, std::move(record_defaults), std::move(select_cols),
-        select_all_cols, use_quote_delim, delim[0], std::move(na_value));
+    *output = new Dataset(ctx, std::move(filenames), header, buffer_size,
+                          output_types_, output_shapes_,
+                          std::move(record_defaults), std::move(select_cols),
+                          use_quote_delim, delim[0], std::move(na_value));
   }
 
  private:
@@ -118,8 +116,7 @@ class CSVDatasetOp : public DatasetOpKernel {
             int64 buffer_size, const DataTypeVector& output_types,
             const std::vector<PartialTensorShape>& output_shapes,
             std::vector<Tensor> record_defaults, std::vector<int64> select_cols,
-            bool select_all_cols, bool use_quote_delim, char delim,
-            string na_value)
+            bool use_quote_delim, char delim, string na_value)
         : GraphDatasetBase(ctx),
           filenames_(std::move(filenames)),
           header_(header),
@@ -128,7 +125,6 @@ class CSVDatasetOp : public DatasetOpKernel {
           output_shapes_(output_shapes),
           record_defaults_(std::move(record_defaults)),
           select_cols_(std::move(select_cols)),
-          select_all_cols_(select_all_cols),
           use_quote_delim_(use_quote_delim),
           delim_(delim),
           na_value_(std::move(na_value)) {}
@@ -166,11 +162,24 @@ class CSVDatasetOp : public DatasetOpKernel {
                              std::vector<Tensor>* out_tensors,
                              bool* end_of_sequence) override {
         mutex_lock l(mu_);
+        bool select_all = dataset()->select_cols_.empty();
         do {
           // We are currently processing a file, so try to read the next record
-          if (buffered_input_stream_) {
-            Status s = ReadRecord(ctx, out_tensors);
-            if (s.ok() || !errors::IsOutOfRange(s)) {
+          if (input_stream_) {
+            Status s = ReadRecord(ctx, out_tensors, select_all,
+                                  dataset()->select_cols_);
+            if (s.ok()) {
+              // Validate output
+              if (out_tensors->size() != dataset()->out_type_.size()) {
+                return errors::InvalidArgument(
+                    "Expect ", dataset()->out_type_.size(), " fields but have ",
+                    out_tensors->size(), " in record");
+              }
+
+              *end_of_sequence = false;
+              return s;
+            }
+            if (!errors::IsOutOfRange(s)) {
               // Not at the end of file, return OK or non-EOF errors to caller.
               *end_of_sequence = false;
               return s;
@@ -203,145 +212,341 @@ class CSVDatasetOp : public DatasetOpKernel {
       }
 
      private:
-      // Reads a record by parsing the input buffer, and converting extracted
+      // Reads an entire CSV row from the input stream, either from the
+      // existing buffer or by filling the buffer as needed. Converts extracted
       // fields to output tensors as we go.
-      Status ReadRecord(IteratorContext* ctx, std::vector<Tensor>* out_tensors)
+      //
+      // When this function is called, pos_ should be the index of the first
+      // character of the record in buffer_, or past the end of the buffer.
+      // Note: ctx and out_tensors are only used in this function
+      // when fields are included in the record.
+      Status ReadRecord(IteratorContext* ctx, std::vector<Tensor>* out_tensors,
+                        bool select_all, const std::vector<int64>& selected)
           EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-        // Extracts fields from line(s) from the buffered input stream.
-        out_tensors->reserve(dataset()->record_defaults_.size());
-
-        string input;
-        TF_RETURN_IF_ERROR(buffered_input_stream_->ReadLine(&input));
-
-        size_t current_idx = 0;
-        size_t num_fields_parsed = 0;
-        size_t selector_idx = 0;  // Keep track of index into select_cols
-
-        while (current_idx < input.size()) {
-          // In each iteration, parse one field
-          if (input[current_idx] == '\n' || input[current_idx] == '\r') {
-            // This should never happen, because buffered input reader splits
-            // input on newlines.
-            return errors::InvalidArgument("Parsing error.");
-          }
+        if (pos_ >= buffer_.size()) {
+          // At the end of the file, this will return errors::OutOfRange
+          TF_RETURN_IF_ERROR(FillBuffer(&buffer_));
+          pos_ = 0;
+        }
+
+        // Any \n continuing a \r\n linebreak after the previous record was
+        // already consumed by SkipNewLineIfNecessary, so nothing to skip here.
+
+        bool end_of_record = false;  // Keep track of when we find \n, \r or EOF
+        size_t num_parsed = 0;
+        size_t num_selected_parsed = 0;
 
-          bool quoted = false;
+        Status result = Status::OK();
+
+        while (!end_of_record) {  // Read till we reach \n, \r or EOF
           bool include =
-              (dataset()->select_all_cols_ ||
-               dataset()->select_cols_[selector_idx] == num_fields_parsed);
+              select_all || (num_selected_parsed < selected.size() &&
+                             selected[num_selected_parsed] == num_parsed);
+
+          // Don't fail fast, so that the next call to GetNext may still return
+          // a valid record
+          result.Update(
+              ParseOneField(ctx, out_tensors, &end_of_record, include));
 
-          if (dataset()->use_quote_delim_ && input[current_idx] == '"') {
-            quoted = true;
-            current_idx++;
+          num_parsed++;
+          if (include) num_selected_parsed++;
+        }
+
+        return result;
+      }
+
+      // Parses one field from position pos_ in the buffer. Fields are
+      // delimited by delim, CRLF, or EOF. Advances pos_ to the first char of
+      // the next field.
+      Status ParseOneField(IteratorContext* ctx,
+                           std::vector<Tensor>* out_tensors,
+                           bool* end_of_record, bool include)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        if (pos_ >= buffer_.size()) {
+          // If we get here, the previous field's end coincided with the end
+          // of the buffer, so we can refill it without saving any contents.
+          Status s = FillBuffer(&buffer_);
+
+          if (errors::IsOutOfRange(s)) {
+            // Reached EOF, and last field is empty
+            *end_of_record = true;
+            if (include) {
+              return FieldToOutput(ctx, StringPiece(), out_tensors);
+            } else {
+              return Status::OK();
+            }
+          } else if (!s.ok()) {
+            return s;  // Surface other errors back to caller
           }
 
-          // Parse the body of the field
-          string field;
-          if (!quoted) {
-            while (current_idx < input.size() &&
-                   input[current_idx] != dataset()->delim_) {
-              if ((dataset()->use_quote_delim_ && input[current_idx] == '"') ||
-                  input[current_idx] == '\n' || input[current_idx] == '\r') {
-                return errors::InvalidArgument(
-                    "Unquoted fields cannot have quotes/CRLFs inside");
+          pos_ = 0;
+        }
+
+        if (dataset()->use_quote_delim_ && buffer_[pos_] == '"') {
+          return ParseQuotedField(ctx, out_tensors, end_of_record, include);
+        }
+
+        return ParseUnquotedField(ctx, out_tensors, end_of_record, include);
+      }
+
+      // For keeping track of relevant parts of a field from a previous buffer
+      struct Piece {
+        size_t start;
+        size_t len;
+        string buffer;
+
+        Piece(string buffer, size_t start, size_t len)
+            : start(start), len(len), buffer(std::move(buffer)) {}
+      };
+
+      // Given that pos_ exceeds the buffer, saves the relevant part of the
+      // current buffer (if necessary), fills the buffer, and resets indices to
+      // 0.
+      Status SaveAndFillBuffer(std::vector<Piece>* earlier_pieces,
+                               size_t* start, bool include)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        string temp_buffer;
+
+        buffer_.swap(temp_buffer);
+        if (include && pos_ > *start) {
+          earlier_pieces->push_back(
+              Piece(std::move(temp_buffer), *start, pos_ - *start));
+        }
+        pos_ = 0;
+        *start = 0;
+        return FillBuffer(&buffer_);
+      }
+
+      // Parses quoted field from position pos_ in the buffer. Continually
+      // reads from buffer until the closing quote is followed by delim, CRLF,
+      // or EOF. Advances pos_ to keep track of our position in the buffer as
+      // we go, stopping at the first character of the next field.
+      Status ParseQuotedField(IteratorContext* ctx,
+                              std::vector<Tensor>* out_tensors,
+                              bool* end_of_record, bool include)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        std::vector<Piece> earlier_pieces;
+        size_t start = pos_;
+        pos_++;  // Starting quotation mark
+
+        while (true) {  // Each iter reads 1 char, filling buffer if necessary
+          if (pos_ >= buffer_.size()) {
+            Status s = SaveAndFillBuffer(&earlier_pieces, &start, include);
+            if (errors::IsOutOfRange(s)) {
+              return errors::InvalidArgument(
+                  "Reached end of file without closing quoted field in "
+                  "record");
+            } else if (!s.ok()) {
+              return s;  // Surface all other errors to caller
+            }
+          }
+
+          char ch = buffer_[pos_];
+          if (ch == '"') {
+            // When we encounter a quote, we look ahead to the next character to
+            // decide what to do
+            pos_++;
+            if (pos_ >= buffer_.size()) {
+              Status s = SaveAndFillBuffer(&earlier_pieces, &start, include);
+              if (errors::IsOutOfRange(s)) {
+                // This was the last field. We are done
+                *end_of_record = true;
+                return QuotedFieldToOutput(ctx, StringPiece(), out_tensors,
+                                           earlier_pieces, include);
+              } else if (!s.ok()) {
+                return s;
               }
-              if (include) field += input[current_idx];
-              current_idx++;
-            }  // Exit condition: end of input, or current index at delim
+            }
+
+            char next = buffer_[pos_];
+            pos_++;
+            if (next == dataset()->delim_) {
+              return QuotedFieldToOutput(
+                  ctx, StringPiece(&buffer_[start], pos_ - 1 - start),
+                  out_tensors, earlier_pieces, include);
+
+            } else if (next == '\n' || next == '\r') {
+              *end_of_record = true;
+              Status s = QuotedFieldToOutput(
+                  ctx, StringPiece(&buffer_[start], pos_ - 1 - start),
+                  out_tensors, earlier_pieces, include);
+              if (next == '\r') SkipNewLineIfNecessary();
+              return s;
+            } else if (next != '"') {
+              return errors::InvalidArgument(
+                  "Quote inside a string has to be escaped by another quote");
+            }
 
-            // Go to next field or the end
-            current_idx++;
           } else {
-            // Quoted field needs to be ended with '"' and delim or end
-            while (true) {
-              if (current_idx >= input.size() - 1 || input.empty()) {
-                if (current_idx == input.size() - 1 &&
-                    input[current_idx] == '"') {
-                  // We're at the end of the input, and the quote terminates the
-                  // record. Go to end.
-                  current_idx++;
-                  break;
-                }
-                // If there's no terminating quote, it means our buffered record
-                // line reader split a record up. This can happen if there is a
-                // newline encased in quotes. The next line is also part of the
-                // record, so we read it and reset the index.
-                if (include && current_idx == input.size() - 1) {
-                  // TODO(rachelim): Instead of building up a string, keep track
-                  //  of terminal indices (or starting char* and length)
-                  // Also look into using /lib/strings/Scanner
-                  field += input[current_idx];
-                }
-                if (include) {
-                  field += '\n';
-                }
-                current_idx = 0;
-                Status s = buffered_input_stream_->ReadLine(&input);
-                if (!s.ok()) {
-                  return errors::InvalidArgument(
-                      "Quoted field has to end with quote followed by delim, "
-                      "CRLF, or EOF");
-                }
-              } else if (input[current_idx] == '"' &&
-                         input[current_idx + 1] == dataset()->delim_) {
-                // End of field, go to next field or end
-                current_idx += 2;
-                break;
-              } else if (input[current_idx] == '"') {
-                // Current char is a quote. Since we're not at end of field,
-                // the next character must also be a quote.
-                if (input[current_idx + 1] != '"') {
-                  return errors::InvalidArgument(
-                      "Quote inside a string has to be escaped by another "
-                      "quote");
-                }
-                if (include) field += '"';
-                current_idx += 2;
-              } else {
-                if (include) field += input[current_idx];
-                current_idx++;
-              }
+            pos_++;
+          }
+        }
+      }
+
+      // Converts quoted field to an output tensor, removing the starting
+      // and ending quotes from it and unescaping double quotations if
+      // necessary.
+      Status QuotedFieldToOutput(IteratorContext* ctx, StringPiece field,
+                                 std::vector<Tensor>* out_tensors,
+                                 const std::vector<Piece>& earlier_pieces,
+                                 bool include) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        if (!include) return Status::OK();
+
+        if (earlier_pieces.empty()) {
+          if (field.find('\"', 1) == field.size() - 1) {
+            // `field` contains no escaped quotation marks.
+            // Exclude framing quotation marks
+            field.remove_prefix(1);
+            field.remove_suffix(1);
+            return FieldToOutput(ctx, field, out_tensors);
+          }
+        }
+        string field_complete;
+        size_t str_len = field.size();
+        for (const Piece& p : earlier_pieces) {
+          str_len += p.len;
+        }
+        field_complete.reserve(str_len);
+
+        // This bool flips every time we see a quote, so that we skip the second
+        // quote of every pair of adjacent quotes in the field. We need to track
+        // this across iterations of the for loop because adjacent double quotes
+        // may be in different buffers. Initialize to true because we also skip
+        // the opening quotation mark of the quoted field.
+        bool skip_next_quote = true;
+        for (const Piece& p : earlier_pieces) {
+          AppendUnescapedPiece(StringPiece(&p.buffer[p.start], p.len),
+                               &field_complete, &skip_next_quote);
+        }
+        AppendUnescapedPiece(field, &field_complete, &skip_next_quote);
+        StringPiece result = StringPiece(field_complete);
+        result.remove_suffix(1);  // Skip final quote
+
+        return FieldToOutput(ctx, result, out_tensors);
+      }
+
+      void AppendUnescapedPiece(StringPiece piece, string* field_complete,
+                                bool* skip_next_quote) {
+        size_t from = 0;
+        size_t found = piece.find('\"', from);
+        while (found != string::npos) {
+          if (!*skip_next_quote) {
+            // This is the first quote in a pair of adjacent double quotes
+            field_complete->append(piece.data() + from, found + 1 - from);
+          }
+          *skip_next_quote = !*skip_next_quote;
+          from = found + 1;
+          found = piece.find('\"', from);
+        }
+        // Include the chunk after the last quotation mark in the string
+        if (from < piece.size()) {
+          field_complete->append(piece.data() + from, piece.size() - from);
+        }
+      }
+
+      // Parses unquoted field from position pos_ in the buffer. Continually
+      // reads from buffer until end of field is reached (delim, CRLF, or EOF).
+      // Advances pos_ to keep track of our position in the buffer as we go,
+      // stopping at the first character of the next field.
+      Status ParseUnquotedField(IteratorContext* ctx,
+                                std::vector<Tensor>* out_tensors,
+                                bool* end_of_record, bool include)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        std::vector<Piece> earlier_pieces;
+        size_t start = pos_;
+        while (true) {  // Each iter reads 1 char, filling buffer if necessary
+          if (pos_ >= buffer_.size()) {
+            Status s = SaveAndFillBuffer(&earlier_pieces, &start, include);
+            // Handle errors
+            if (errors::IsOutOfRange(s)) {
+              // Whatever we have is the last field of the last record
+              *end_of_record = true;
+              return UnquotedFieldToOutput(
+                  ctx, StringPiece(&buffer_[start], pos_ - start), out_tensors,
+                  earlier_pieces, include);
+            } else if (!s.ok()) {
+              return s;  // Surface all other errors to caller
             }
           }
 
-          num_fields_parsed++;
+          char ch = buffer_[pos_];
 
-          if (include) {
-            // Add the tensor to the result
-            TF_RETURN_IF_ERROR(FieldToOutput(ctx, std::move(field),
-                                             selector_idx, out_tensors));
-            selector_idx++;
-            // Terminate early if we have all the fields we want
-            if (selector_idx == dataset()->select_cols_.size())
-              return Status::OK();
+          if (ch == dataset()->delim_) {
+            Status s = UnquotedFieldToOutput(
+                ctx, StringPiece(&buffer_[start], pos_ - start), out_tensors,
+                earlier_pieces, include);
+            pos_++;
+            return s;
+          }
+          if (ch == '\n' || ch == '\r') {
+            // need special case to skip over first \n of record if the line
+            // breaks are \r\n
+            Status s = UnquotedFieldToOutput(
+                ctx, StringPiece(&buffer_[start], pos_ - start), out_tensors,
+                earlier_pieces, include);
+            *end_of_record = true;
+            pos_++;
+            if (ch == '\r') SkipNewLineIfNecessary();
+            return s;
           }
-        }  // Exit condition: current_idx has reached the end of record
-
-        // Check if the last field is empty, and include it if necessary
-        bool include =
-            (dataset()->select_all_cols_ ||
-             dataset()->select_cols_[selector_idx] == num_fields_parsed);
-        if (include && !input.empty() &&
-            input[input.size() - 1] == dataset()->delim_) {
-          TF_RETURN_IF_ERROR(
-              FieldToOutput(ctx, string(), selector_idx, out_tensors));
+          if (dataset()->use_quote_delim_ && ch == '"') {
+            // Advance pos_ to the next field anyway so that we can ignore
+            // errors gracefully if required. The caller of this will be able to
+            // call ParseOneField and continue with the rest of the record.
+            AdvanceToNextField(end_of_record);
+            return errors::InvalidArgument(
+                "Unquoted fields cannot have quotes inside");
+          }
+          // Otherwise, go to next character
+          pos_++;
         }
+      }
 
-        // Check that number of fields matches
-        if (out_tensors->size() != dataset()->out_type_.size()) {
-          return errors::InvalidArgument("Expect ", dataset()->out_type_.size(),
-                                         " fields but have ",
-                                         out_tensors->size(), " in record");
+      // Advances pos_ to the start of the next field, as delimited by delim,
+      // CRLF, or EOF, ignoring errors, and not keeping track of characters in
+      // the current field.
+      void AdvanceToNextField(bool* end_of_record)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        while (true) {
+          if (pos_ >= buffer_.size()) {
+            Status s = FillBuffer(&buffer_);
+            pos_ = 0;
+            if (!s.ok()) {
+              *end_of_record = true;
+              return;
+            }
+          }
+
+          char ch = buffer_[pos_];
+          pos_++;
+
+          if (ch == dataset()->delim_) {
+            return;
+          }
+
+          if (ch == '\n' || ch == '\r') {
+            *end_of_record = true;
+            if (ch == '\r') SkipNewLineIfNecessary();
+            return;
+          }
         }
-        return Status::OK();
       }
 
-      // Given a string field, and its index in the output,
-      // converts it to a Tensor of the right type and adds it to the
-      // out_tensors vector.
-      Status FieldToOutput(IteratorContext* ctx, string field,
-                           size_t output_idx,
+      Status FillBuffer(string* result) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        result->clear();
+        Status s = input_stream_->ReadNBytes(dataset()->buffer_size_, result);
+
+        if (errors::IsOutOfRange(s) && !result->empty()) {
+          // Ignore OutOfRange error when ReadNBytes read < N bytes.
+          return Status::OK();
+        }
+        return s;
+      }
+
+      // Given a field, converts it to the right output tensor type
+      Status FieldToOutput(IteratorContext* ctx, StringPiece field,
                            std::vector<Tensor>* out_tensors) {
+        size_t output_idx = out_tensors->size();
         if (output_idx >= dataset()->out_type_.size()) {
           // We can get here if we're selecting all columns, but the number of
           // fields exceeds the number of defaults provided
@@ -397,7 +602,7 @@ class CSVDatasetOp : public DatasetOpKernel {
                   dataset()->record_defaults_[output_idx].flat<float>()(0);
             } else {
               float value;
-              if (!strings::safe_strtof(field.c_str(), &value)) {
+              if (!strings::safe_strtof(field, &value)) {
                 return errors::InvalidArgument(
                     "Field ", output_idx,
                     " in record is not a valid float: ", field);
@@ -412,7 +617,7 @@ class CSVDatasetOp : public DatasetOpKernel {
                   dataset()->record_defaults_[output_idx].flat<double>()(0);
             } else {
               double value;
-              if (!strings::safe_strtod(field.c_str(), &value)) {
+              if (!strings::safe_strtod(field, &value)) {
                 return errors::InvalidArgument(
                     "Field ", output_idx,
                     " in record is not a valid double: ", field);
@@ -426,7 +631,7 @@ class CSVDatasetOp : public DatasetOpKernel {
               component.scalar<string>()() =
                   dataset()->record_defaults_[output_idx].flat<string>()(0);
             } else {
-              component.scalar<string>()() = std::move(field);
+              component.scalar<string>()() = field.ToString();
             }
             break;
           }
@@ -439,6 +644,50 @@ class CSVDatasetOp : public DatasetOpKernel {
         return Status::OK();
       }
 
+      // Records can be delimited by "\r\n" line breaks. When we encounter a
+      // '\r', we have to check the next character to see if it is part of the
+      // linebreak, and ignore it if so.
+      void SkipNewLineIfNecessary() EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        if (pos_ >= buffer_.size()) {
+          Status s = FillBuffer(&buffer_);
+          pos_ = 0;
+          // If we failed to fill buffer, it doesn't matter because we're done
+          // with the record
+          if (!s.ok()) return;
+        }
+        if (buffer_[pos_] == '\n') {
+          pos_++;
+        }
+      }
+
+      // Given the last piece of an unquoted field and any earlier pieces
+      // saved from previous buffers, joins them and passes the complete field
+      // to FieldToOutput. Does nothing if `include` is false.
+      Status UnquotedFieldToOutput(IteratorContext* ctx, StringPiece field,
+                                   std::vector<Tensor>* out_tensors,
+                                   const std::vector<Piece>& earlier_pieces,
+                                   bool include) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        if (!include) return Status::OK();
+
+        if (earlier_pieces.empty()) {
+          return FieldToOutput(ctx, field, out_tensors);
+        }
+
+        size_t str_len = field.size();
+        for (const Piece& p : earlier_pieces) {
+          str_len += p.len;
+        }
+        string field_complete;
+        field_complete.reserve(str_len);
+
+        for (const Piece& p : earlier_pieces) {
+          field_complete.append(p.buffer, p.start, p.len);
+        }
+
+        field_complete.append(field.data(), field.size());
+        return FieldToOutput(ctx, field_complete, out_tensors);
+      }
+
       // Sets up reader streams to read from the file at `current_file_index_`.
       Status SetupStreamsLocked(Env* env) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
         if (current_file_index_ >= dataset()->filenames_.size()) {
@@ -452,16 +701,18 @@ class CSVDatasetOp : public DatasetOpKernel {
             dataset()->filenames_[current_file_index_], &file_));
         input_stream_.reset(
             new io::RandomAccessInputStream(file_.get(), false));
-        // TODO(rachelim): Maintain our own buffer so we don't read every record
-        //   twice
-        buffered_input_stream_.reset(new io::BufferedInputStream(
-            input_stream_.get(), dataset()->buffer_size_, false));
+        buffer_.clear();
+        pos_ = 0;
         if (dataset()->header_) {
-          // Ignore header line
-          string str;
-          Status s = buffered_input_stream_->ReadLine(&str);
-          if (errors::IsOutOfRange(s)) {
-            return errors::InvalidArgument("Can't read header of empty file");
+          // Read one line, but don't include it. Pass nullptrs as dummy
+          // pointers to objects that shouldn't be invoked anyway
+          // We need to process this as a record here instead of just finding
+          // the first newline because it might contain quoted fields with
+          // newlines in the header as well
+          std::vector<int64> empty;
+          Status s = ReadRecord(nullptr, nullptr, false, empty);
+          if (!s.ok()) {
+            return errors::InvalidArgument("Can't read header of file");
           }
         }
         return Status::OK();
@@ -470,15 +721,15 @@ class CSVDatasetOp : public DatasetOpKernel {
       // Resets all reader streams.
       void ResetStreamsLocked() EXCLUSIVE_LOCKS_REQUIRED(mu_) {
         input_stream_.reset();
-        buffered_input_stream_.reset();
         file_.reset();
       }
 
       mutex mu_;
+      string buffer_ GUARDED_BY(mu_);  // Maintain our own buffer
+      size_t pos_ GUARDED_BY(
+          mu_);  // Index into the buffer must be maintained between iters
       std::unique_ptr<io::RandomAccessInputStream> input_stream_
           GUARDED_BY(mu_);
-      std::unique_ptr<io::BufferedInputStream> buffered_input_stream_
-          GUARDED_BY(mu_);
       size_t current_file_index_ GUARDED_BY(mu_) = 0;
       std::unique_ptr<RandomAccessFile> file_
           GUARDED_BY(mu_);  // must outlive input_stream_
@@ -491,7 +742,6 @@ class CSVDatasetOp : public DatasetOpKernel {
     const std::vector<PartialTensorShape> output_shapes_;
     const std::vector<Tensor> record_defaults_;
     const std::vector<int64> select_cols_;
-    const bool select_all_cols_;
     const bool use_quote_delim_;
     const char delim_;
     const string na_value_;
diff --git a/tensorflow/contrib/data/python/kernel_tests/BUILD b/tensorflow/contrib/data/python/kernel_tests/BUILD
index c483a43769..523d1f2f71 100644
--- a/tensorflow/contrib/data/python/kernel_tests/BUILD
+++ b/tensorflow/contrib/data/python/kernel_tests/BUILD
@@ -128,6 +128,7 @@ py_test(
     tags = ["no_pip"],
     deps = [
         ":dataset_serialization_test",
+        "//tensorflow/contrib/data/python/ops:error_ops",
         "//tensorflow/contrib/data/python/ops:readers",
         "//third_party/py/numpy",
     ],
diff --git a/tensorflow/contrib/data/python/kernel_tests/csv_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/csv_dataset_op_test.py
index 8c138c7081..74b90ec7d1 100644
--- a/tensorflow/contrib/data/python/kernel_tests/csv_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/csv_dataset_op_test.py
@@ -25,6 +25,7 @@ import time
 
 import numpy as np
 
+from tensorflow.contrib.data.python.ops import error_ops
 from tensorflow.contrib.data.python.ops import readers
 from tensorflow.python.client import session
 from tensorflow.python.data.ops import readers as core_readers
@@ -61,12 +62,12 @@ class CsvDatasetOpTest(test.TestCase):
         op2 = sess.run(next2)
         self.assertAllEqual(op1, op2)
 
-  def setup_files(self, inputs):
+  def setup_files(self, inputs, linebreak='\n'):
     filenames = []
     for i, ip in enumerate(inputs):
-      fn = os.path.join(self.get_temp_dir(), 'temp_%d.txt' % i)
-      with open(fn, 'w') as f:
-        f.write('\n'.join(ip))
+      fn = os.path.join(self.get_temp_dir(), 'temp_%d.csv' % i)
+      with open(fn, 'wb') as f:
+        f.write(linebreak.join(ip).encode('utf-8'))
       filenames.append(fn)
     return filenames
 
@@ -86,38 +87,47 @@ class CsvDatasetOpTest(test.TestCase):
           inputs, **kwargs)
       self._assert_datasets_equal(g, dataset_actual, dataset_expected)
 
+  def _verify_output_or_err(self,
+                            sess,
+                            dataset,
+                            expected_output=None,
+                            expected_err_re=None):
+    nxt = dataset.make_one_shot_iterator().get_next()
+    if expected_err_re is None:
+      # Verify that output is expected, without errors
+      expected_output = [[
+          v.encode('utf-8') if isinstance(v, str) else v for v in op
+      ] for op in expected_output]
+      for value in expected_output:
+        op = sess.run(nxt)
+        self.assertAllEqual(op, value)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(nxt)
+    else:
+      # Verify that OpError is produced as expected
+      with self.assertRaisesOpError(expected_err_re):
+        while True:
+          try:
+            sess.run(nxt)
+          except errors.OutOfRangeError:
+            break
+
   def _test_dataset(self,
                     inputs,
                     expected_output=None,
                     expected_err_re=None,
+                    linebreak='\n',
                     **kwargs):
     """Checks that elements produced by CsvDataset match expected output."""
     # Convert str type because py3 tf strings are bytestrings
-    filenames = self.setup_files(inputs)
+    filenames = self.setup_files(inputs, linebreak)
     with ops.Graph().as_default() as g:
       with self.test_session(graph=g) as sess:
         dataset = readers.CsvDataset(filenames, **kwargs)
-        nxt = dataset.make_one_shot_iterator().get_next()
-        if expected_err_re is None:
-          # Verify that output is expected, without errors
-          expected_output = [[
-              v.encode('utf-8') if isinstance(v, str) else v for v in op
-          ] for op in expected_output]
-          for value in expected_output:
-            op = sess.run(nxt)
-            self.assertAllEqual(op, value)
-          with self.assertRaises(errors.OutOfRangeError):
-            sess.run(nxt)
-        else:
-          # Verify that OpError is produced as expected
-          with self.assertRaisesOpError(expected_err_re):
-            while True:
-              try:
-                sess.run(nxt)
-              except errors.OutOfRangeError:
-                break
-
-  def testCsvDataset_floatRequired(self):
+        self._verify_output_or_err(sess, dataset, expected_output,
+                                   expected_err_re)
+
+  def testCsvDataset_requiredFields(self):
     record_defaults = [[]] * 4
     inputs = [['1,2,3,4']]
     self._test_by_comparison(inputs, record_defaults=record_defaults)
@@ -137,10 +147,36 @@ class CsvDatasetOpTest(test.TestCase):
     inputs = [['1.0,2.1,hello,4.3', '5.4,6.5,goodbye,8.7']]
     self._test_by_comparison(inputs, record_defaults=record_defaults)
 
-  def testCsvDataset_withQuoted(self):
-    record_defaults = [['']] * 4
-    inputs = [['1.0,2.1,"hello, it is me",4.3', '5.4,6.5,goodbye,8.7']]
-    self._test_by_comparison(inputs, record_defaults=record_defaults)
+  def testCsvDataset_withEmptyFields(self):
+    record_defaults = [[0]] * 4
+    inputs = [[',,,', '1,1,1,', ',2,2,2']]
+    self._test_dataset(
+        inputs, [[0, 0, 0, 0], [1, 1, 1, 0], [0, 2, 2, 2]],
+        record_defaults=record_defaults)
+
+  def testCsvDataset_errWithUnquotedQuotes(self):
+    record_defaults = [['']] * 3
+    inputs = [['1,2"3,4']]
+    self._test_dataset(
+        inputs,
+        expected_err_re='Unquoted fields cannot have quotes inside',
+        record_defaults=record_defaults)
+
+  def testCsvDataset_ignoreErrWithUnquotedQuotes(self):
+    record_defaults = [['']] * 3
+    inputs = [['1,2"3,4', 'a,b,c"d', 'e,f,g']]
+    filenames = self.setup_files(inputs)
+    with ops.Graph().as_default() as g:
+      with self.test_session(graph=g) as sess:
+        dataset = readers.CsvDataset(filenames, record_defaults=record_defaults)
+        dataset = dataset.apply(error_ops.ignore_errors())
+        self._verify_output_or_err(sess, dataset, [['e', 'f', 'g']])
+
+  def testCsvDataset_withNoQuoteDelimAndUnquotedQuotes(self):
+    record_defaults = [['']] * 3
+    inputs = [['1,2"3,4']]
+    self._test_by_comparison(
+        inputs, record_defaults=record_defaults, use_quote_delim=False)
 
   def testCsvDataset_mixedTypes(self):
     record_defaults = [
@@ -164,11 +200,6 @@ class CsvDatasetOpTest(test.TestCase):
     self._test_by_comparison(
         inputs, record_defaults=record_defaults, field_delim=':')
 
-  def testCsvDataset_withEmptyValues(self):
-    record_defaults = [[0]] * 4
-    inputs = [['1,,3,4', ',6,7,8']]
-    self._test_by_comparison(inputs, record_defaults=record_defaults)
-
   def testCsvDataset_withNaValue(self):
     record_defaults = [[0]] * 4
     inputs = [['1,NA,3,4', 'NA,6,7,8']]
@@ -176,8 +207,8 @@ class CsvDatasetOpTest(test.TestCase):
         inputs, record_defaults=record_defaults, na_value='NA')
 
   def testCsvDataset_withSelectCols(self):
-    record_defaults = [[0]] * 2
-    inputs = [['1,2,3,4', '5,6,7,8']]
+    record_defaults = [['']] * 2
+    inputs = [['1,2,3,4', '"5","6","7","8"']]
     self._test_by_comparison(
         inputs, record_defaults=record_defaults, select_cols=[1, 2])
 
@@ -190,27 +221,17 @@ class CsvDatasetOpTest(test.TestCase):
         record_defaults=record_defaults,
         select_cols=[3, 4])
 
+  def testCsvDataset_withOneCol(self):
+    record_defaults = [['NA']]
+    inputs = [['0', '', '2']]
+    self._test_dataset(
+        inputs, [['0'], ['NA'], ['2']], record_defaults=record_defaults)
+
   def testCsvDataset_withMultipleFiles(self):
     record_defaults = [[0]] * 4
     inputs = [['1,2,3,4', '5,6,7,8'], ['5,6,7,8']]
     self._test_by_comparison(inputs, record_defaults=record_defaults)
 
-  def testCsvDataset_withNewLine(self):
-    # In this case, we expect it to behave differently from
-    # TextLineDataset->map(decode_csv) since that flow has bugs
-    record_defaults = [['']] * 4
-    inputs = [['a,b,"""c""\n0","d\ne"', 'f,g,h,i']]
-    expected = [['a', 'b', '"c"\n0', 'd\ne'], ['f', 'g', 'h', 'i']]
-    self._test_dataset(inputs, expected, record_defaults=record_defaults)
-
-  def testCsvDataset_withMultipleNewLines(self):
-    # In this case, we expect it to behave differently from
-    # TextLineDataset->map(decode_csv) since that flow has bugs
-    record_defaults = [['']] * 4
-    inputs = [['a,"b\n\nx","""c""\n \n0","d\ne"', 'f,g,h,i']]
-    expected = [['a', 'b\n\nx', '"c"\n \n0', 'd\ne'], ['f', 'g', 'h', 'i']]
-    self._test_dataset(inputs, expected, record_defaults=record_defaults)
-
   def testCsvDataset_withLeadingAndTrailingSpaces(self):
     record_defaults = [[0.0]] * 4
     inputs = [['0, 1, 2, 3']]
@@ -266,9 +287,10 @@ class CsvDatasetOpTest(test.TestCase):
   def testCsvDataset_errorWithHeaderEmptyFile(self):
     record_defaults = [[0]] * 2
     inputs = [[]]
+    expected_err_re = "Can't read header of file"
     self._test_dataset(
         inputs,
-        expected_err_re="Can't read header of empty file",
+        expected_err_re=expected_err_re,
         record_defaults=record_defaults,
         header=True,
     )
@@ -284,7 +306,7 @@ class CsvDatasetOpTest(test.TestCase):
     inputs = [['', '1,2']]  # First record is empty
     self._test_dataset(
         inputs,
-        expected_err_re='Expect 2 fields but have 0 in record',
+        expected_err_re='Expect 2 fields but have 1 in record',
         record_defaults=record_defaults)
 
   def testCsvDataset_withChainedOps(self):
@@ -301,7 +323,7 @@ class CsvDatasetOpTest(test.TestCase):
 
   def testCsvDataset_withTypeDefaults(self):
     # Testing using dtypes as record_defaults for required fields
-    record_defaults = [dtypes.float32, dtypes.float32]
+    record_defaults = [dtypes.float32, [0.0]]
     inputs = [['1.0,2.0', '3.0,4.0']]
     self._test_dataset(
         inputs,
@@ -326,6 +348,162 @@ class CsvDatasetOpTest(test.TestCase):
 
     self.assertEqual(result, sorted(result))
 
+## The following tests exercise parsing logic for quoted fields
+
+  def testCsvDataset_withQuoted(self):
+    record_defaults = [['']] * 4
+    inputs = [['"a","b","c :)","d"', '"e","f","g :(","h"']]
+    self._test_by_comparison(inputs, record_defaults=record_defaults)
+
+  def testCsvDataset_withOneColAndQuotes(self):
+    record_defaults = [['']]
+    inputs = [['"0"', '"1"', '"2"']]
+    self._test_dataset(
+        inputs, [['0'], ['1'], ['2']], record_defaults=record_defaults)
+
+  def testCsvDataset_withNewLine(self):
+    # In this case, we expect it to behave differently from
+    # TextLineDataset->map(decode_csv) since that flow has bugs
+    record_defaults = [['']] * 4
+    inputs = [['a,b,"""c""\n0","d\ne"', 'f,g,h,i']]
+    expected = [['a', 'b', '"c"\n0', 'd\ne'], ['f', 'g', 'h', 'i']]
+    self._test_dataset(inputs, expected, record_defaults=record_defaults)
+
+  def testCsvDataset_withNewLineInUnselectedCol(self):
+    record_defaults = [['']]
+    inputs = [['1,"2\n3",4', '5,6,7']]
+    self._test_dataset(
+        inputs,
+        expected_output=[['1'], ['5']],
+        record_defaults=record_defaults,
+        select_cols=[0])
+
+  def testCsvDataset_withMultipleNewLines(self):
+    # In this case, we expect it to behave differently from
+    # TextLineDataset->map(decode_csv) since that flow has bugs
+    record_defaults = [['']] * 4
+    inputs = [['a,"b\n\nx","""c""\n \n0","d\ne"', 'f,g,h,i']]
+    expected = [['a', 'b\n\nx', '"c"\n \n0', 'd\ne'], ['f', 'g', 'h', 'i']]
+    self._test_dataset(inputs, expected, record_defaults=record_defaults)
+
+  def testCsvDataset_errorWithTerminateMidRecord(self):
+    record_defaults = [['']] * 4
+    inputs = [['a,b,c,"a']]
+    self._test_dataset(
+        inputs,
+        expected_err_re=
+        'Reached end of file without closing quoted field in record',
+        record_defaults=record_defaults)
+
+  def testCsvDataset_withEscapedQuotes(self):
+    record_defaults = [['']] * 4
+    inputs = [['1.0,2.1,"she said: ""hello""",4.3', '5.4,6.5,goodbye,8.7']]
+    self._test_by_comparison(inputs, record_defaults=record_defaults)
+
+
+## Testing that parsing works with all buffer sizes, quoted/unquoted fields,
+## and different types of line breaks
+
+  def testCsvDataset_withInvalidBufferSize(self):
+    record_defaults = [['']] * 4
+    inputs = [['a,b,c,d']]
+    self._test_dataset(
+        inputs,
+        expected_err_re='buffer_size should be positive',
+        record_defaults=record_defaults,
+        buffer_size=0)
+
+  def testCsvDataset_withBufferSize(self):
+    record_defaults = [['NA']] * 3
+    inputs = [['abc,def,ghi', '0,1,2', ',,']]
+    expected = [['abc', 'def', 'ghi'], ['0', '1', '2'], ['NA', 'NA', 'NA']]
+    for i in range(20):
+      # Test a range of buffer sizes that should all work
+      self._test_dataset(
+          inputs, expected, record_defaults=record_defaults, buffer_size=i + 1)
+
+  def testCsvDataset_withCR(self):
+    # Test that when the line separator is '\r', parsing works with all buffer
+    # sizes
+    record_defaults = [['NA']] * 3
+    inputs = [['abc,def,ghi', '0,1,2', ',,']]
+    expected = [['abc', 'def', 'ghi'], ['0', '1', '2'], ['NA', 'NA', 'NA']]
+    for i in range(20):
+      # Test a range of buffer sizes that should all work
+      self._test_dataset(
+          inputs,
+          expected,
+          linebreak='\r',
+          record_defaults=record_defaults,
+          buffer_size=i + 1)
+
+  def testCsvDataset_withCRLF(self):
+    # Test that when the line separator is '\r\n', parsing works with all buffer
+    # sizes
+    record_defaults = [['NA']] * 3
+    inputs = [['abc,def,ghi', '0,1,2', ',,']]
+    expected = [['abc', 'def', 'ghi'], ['0', '1', '2'], ['NA', 'NA', 'NA']]
+    for i in range(20):
+      # Test a range of buffer sizes that should all work
+      self._test_dataset(
+          inputs,
+          expected,
+          linebreak='\r\n',
+          record_defaults=record_defaults,
+          buffer_size=i + 1)
+
+  def testCsvDataset_withBufferSizeAndQuoted(self):
+    record_defaults = [['NA']] * 3
+    inputs = [['"\n\n\n","\r\r\r","abc"', '"0","1","2"', '"","",""']]
+    expected = [['\n\n\n', '\r\r\r', 'abc'], ['0', '1', '2'],
+                ['NA', 'NA', 'NA']]
+    for i in range(20):
+      # Test a range of buffer sizes that should all work
+      self._test_dataset(
+          inputs,
+          expected,
+          linebreak='\n',
+          record_defaults=record_defaults,
+          buffer_size=i + 1)
+    self._test_dataset(
+        inputs, expected, linebreak='\n', record_defaults=record_defaults)
+
+  def testCsvDataset_withCRAndQuoted(self):
+    # Test that when the line separator is '\r', parsing works with all buffer
+    # sizes
+    record_defaults = [['NA']] * 3
+    inputs = [['"\n\n\n","\r\r\r","abc"', '"0","1","2"', '"","",""']]
+    expected = [['\n\n\n', '\r\r\r', 'abc'], ['0', '1', '2'],
+                ['NA', 'NA', 'NA']]
+    for i in range(20):
+      # Test a range of buffer sizes that should all work
+      self._test_dataset(
+          inputs,
+          expected,
+          linebreak='\r',
+          record_defaults=record_defaults,
+          buffer_size=i + 1)
+    self._test_dataset(
+        inputs, expected, linebreak='\r', record_defaults=record_defaults)
+
+  def testCsvDataset_withCRLFAndQuoted(self):
+    # Test that when the line separator is '\r\n', parsing works with all buffer
+    # sizes
+    record_defaults = [['NA']] * 3
+    inputs = [['"\n\n\n","\r\r\r","abc"', '"0","1","2"', '"","",""']]
+    expected = [['\n\n\n', '\r\r\r', 'abc'], ['0', '1', '2'],
+                ['NA', 'NA', 'NA']]
+    for i in range(20):
+      # Test a range of buffer sizes that should all work
+      self._test_dataset(
+          inputs,
+          expected,
+          linebreak='\r\n',
+          record_defaults=record_defaults,
+          buffer_size=i + 1)
+    self._test_dataset(
+        inputs, expected, linebreak='\r\n', record_defaults=record_defaults)
+
 
 class CsvDatasetBenchmark(test.Benchmark):
   """Benchmarks for the various ways of creating a dataset from CSV files.
@@ -343,7 +521,7 @@ class CsvDatasetBenchmark(test.Benchmark):
     self._filenames = []
     for n in self._num_cols:
       fn = os.path.join(self._temp_dir, 'file%d.csv' % n)
-      with open(fn, 'w') as f:
+      with open(fn, 'wb') as f:
         # Just write 100 rows and use `repeat`... Assumes the cost
         # of creating an iterator is not significant
         row = ','.join([str_val for _ in range(n)])
diff --git a/tensorflow/core/lib/strings/numbers.cc b/tensorflow/core/lib/strings/numbers.cc
index 987e4fe733..f18c6dc709 100644
--- a/tensorflow/core/lib/strings/numbers.cc
+++ b/tensorflow/core/lib/strings/numbers.cc
@@ -345,6 +345,19 @@ bool safe_strtof(const char* str, float* value) {
   return processed_characters_count > 0;
 }
 
+bool safe_strtof(StringPiece str, float* value) {
+  int processed_characters_count = -1;
+  auto len = str.size();
+
+  // If string length exceeds buffer size or int max, fail.
+  if (len >= kFastToBufferSize) return false;
+  if (len > std::numeric_limits<int>::max()) return false;
+
+  *value = StringToFloatConverter().StringToFloat(
+      str.data(), static_cast<int>(len), &processed_characters_count);
+  return processed_characters_count > 0;
+}
+
 bool safe_strtod(const char* str, double* value) {
   int processed_characters_count = -1;
   auto len = str_util::Strnlen(str, kFastToBufferSize);
@@ -359,6 +372,19 @@ bool safe_strtod(const char* str, double* value) {
   return processed_characters_count > 0;
 }
 
+bool safe_strtod(StringPiece str, double* value) {
+  int processed_characters_count = -1;
+  auto len = str.size();
+
+  // If string length exceeds buffer size or int max, fail.
+  if (len >= kFastToBufferSize) return false;
+  if (len > std::numeric_limits<int>::max()) return false;
+
+  *value = StringToFloatConverter().StringToDouble(
+      str.data(), static_cast<int>(len), &processed_characters_count);
+  return processed_characters_count > 0;
+}
+
 size_t FloatToBuffer(float value, char* buffer) {
   // FLT_DIG is 6 for IEEE-754 floats, which are used on almost all
   // platforms these days.  Just in case some system exists where FLT_DIG
diff --git a/tensorflow/core/lib/strings/numbers.h b/tensorflow/core/lib/strings/numbers.h
index 9cb56415cb..f62584dedb 100644
--- a/tensorflow/core/lib/strings/numbers.h
+++ b/tensorflow/core/lib/strings/numbers.h
@@ -116,12 +116,14 @@ bool safe_strtou64(StringPiece str, uint64* value);
 // Values may be rounded on over- and underflow.
 // Returns false on invalid input or if `strlen(value) >= kFastToBufferSize`.
 bool safe_strtof(const char* str, float* value);
+bool safe_strtof(StringPiece str, float* value);
 
 // Convert strings to double precision floating point values.
 // Leading and trailing spaces are allowed.
 // Values may be rounded on over- and underflow.
 // Returns false on invalid input or if `strlen(value) >= kFastToBufferSize`.
 bool safe_strtod(const char* str, double* value);
+bool safe_strtod(StringPiece str, double* value);
 
 inline bool ProtoParseNumeric(StringPiece s, int32* value) {
   return safe_strto32(s, value);
-- 
GitLab


From ecce06cd1ca091d90cd3eaafd5edbc9e3bd9e5f6 Mon Sep 17 00:00:00 2001
From: Michael Case <mikecase@google.com>
Date: Thu, 31 May 2018 18:31:23 -0700
Subject: [PATCH 143/610] Fix lite.py Python TypeError.

---
 tensorflow/contrib/lite/python/lite.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/tensorflow/contrib/lite/python/lite.py b/tensorflow/contrib/lite/python/lite.py
index 253b5eadf3..0fc7958d41 100644
--- a/tensorflow/contrib/lite/python/lite.py
+++ b/tensorflow/contrib/lite/python/lite.py
@@ -33,6 +33,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import six
+
 from google.protobuf import text_format as _text_format
 from google.protobuf.message import DecodeError
 from tensorflow.contrib.lite.python import lite_constants as constants
@@ -188,6 +190,12 @@ class TocoConverter(object):
       except (_text_format.ParseError, DecodeError):
         try:
           print("Ignore 'tcmalloc: large alloc' warnings.")
+
+          if not isinstance(file_content, str):
+            if six.PY3:
+              file_content = file_content.decode('utf-8')
+            else:
+              file_content = file_content.encode('utf-8')
           _text_format.Merge(file_content, graph_def)
         except (_text_format.ParseError, DecodeError):
           raise ValueError(
-- 
GitLab


From 16c6cac5c57b632a82bde1117d441ab230414b5c Mon Sep 17 00:00:00 2001
From: Peter Hawkins <phawkins@google.com>
Date: Thu, 31 May 2018 18:37:27 -0700
Subject: [PATCH 144/610] Raise the test timeout for
 tensorflow/python:warm_starting_util_test due to flakiness.

PiperOrigin-RevId: 198813273
---
 tensorflow/python/BUILD | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index 569403fa9a..a8a514d166 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -4340,7 +4340,7 @@ py_test(
 
 py_test(
     name = "warm_starting_util_test",
-    size = "small",
+    size = "medium",
     srcs = ["training/warm_starting_util_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-- 
GitLab


From d3095c93fc042cf6200f5552e910804e1f9dc196 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 31 May 2018 19:01:44 -0700
Subject: [PATCH 145/610] Automated g4 rollback of changelist 198812512

PiperOrigin-RevId: 198815200
---
 .../contrib/data/kernels/csv_dataset_op.cc    | 542 +++++-------------
 .../contrib/data/python/kernel_tests/BUILD    |   1 -
 .../kernel_tests/csv_dataset_op_test.py       | 292 ++--------
 tensorflow/core/lib/strings/numbers.cc        |  26 -
 tensorflow/core/lib/strings/numbers.h         |   2 -
 5 files changed, 203 insertions(+), 660 deletions(-)

diff --git a/tensorflow/contrib/data/kernels/csv_dataset_op.cc b/tensorflow/contrib/data/kernels/csv_dataset_op.cc
index e88ad3dc32..97cc0bc6c9 100644
--- a/tensorflow/contrib/data/kernels/csv_dataset_op.cc
+++ b/tensorflow/contrib/data/kernels/csv_dataset_op.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/shape_inference.h"
+#include "tensorflow/core/lib/io/buffered_inputstream.h"
 #include "tensorflow/core/lib/io/random_inputstream.h"
 
 namespace tensorflow {
@@ -102,11 +103,12 @@ class CSVDatasetOp : public DatasetOpKernel {
     OP_REQUIRES(
         ctx, select_cols.empty() || select_cols.front() >= 0,
         errors::InvalidArgument("select_cols should be non-negative indices"));
+    bool select_all_cols = select_cols.empty();
 
-    *output = new Dataset(ctx, std::move(filenames), header, buffer_size,
-                          output_types_, output_shapes_,
-                          std::move(record_defaults), std::move(select_cols),
-                          use_quote_delim, delim[0], std::move(na_value));
+    *output = new Dataset(
+        ctx, std::move(filenames), header, buffer_size, output_types_,
+        output_shapes_, std::move(record_defaults), std::move(select_cols),
+        select_all_cols, use_quote_delim, delim[0], std::move(na_value));
   }
 
  private:
@@ -116,7 +118,8 @@ class CSVDatasetOp : public DatasetOpKernel {
             int64 buffer_size, const DataTypeVector& output_types,
             const std::vector<PartialTensorShape>& output_shapes,
             std::vector<Tensor> record_defaults, std::vector<int64> select_cols,
-            bool use_quote_delim, char delim, string na_value)
+            bool select_all_cols, bool use_quote_delim, char delim,
+            string na_value)
         : GraphDatasetBase(ctx),
           filenames_(std::move(filenames)),
           header_(header),
@@ -125,6 +128,7 @@ class CSVDatasetOp : public DatasetOpKernel {
           output_shapes_(output_shapes),
           record_defaults_(std::move(record_defaults)),
           select_cols_(std::move(select_cols)),
+          select_all_cols_(select_all_cols),
           use_quote_delim_(use_quote_delim),
           delim_(delim),
           na_value_(std::move(na_value)) {}
@@ -162,24 +166,11 @@ class CSVDatasetOp : public DatasetOpKernel {
                              std::vector<Tensor>* out_tensors,
                              bool* end_of_sequence) override {
         mutex_lock l(mu_);
-        bool select_all = dataset()->select_cols_.empty();
         do {
           // We are currently processing a file, so try to read the next record
-          if (input_stream_) {
-            Status s = ReadRecord(ctx, out_tensors, select_all,
-                                  dataset()->select_cols_);
-            if (s.ok()) {
-              // Validate output
-              if (out_tensors->size() != dataset()->out_type_.size()) {
-                return errors::InvalidArgument(
-                    "Expect ", dataset()->out_type_.size(), " fields but have ",
-                    out_tensors->size(), " in record");
-              }
-
-              *end_of_sequence = false;
-              return s;
-            }
-            if (!errors::IsOutOfRange(s)) {
+          if (buffered_input_stream_) {
+            Status s = ReadRecord(ctx, out_tensors);
+            if (s.ok() || !errors::IsOutOfRange(s)) {
               // Not at the end of file, return OK or non-EOF errors to caller.
               *end_of_sequence = false;
               return s;
@@ -212,341 +203,145 @@ class CSVDatasetOp : public DatasetOpKernel {
       }
 
      private:
-      // Reads an entire CSV row from the input stream, either from the
-      // existing buffer or by filling the buffer as needed. Converts extracted
+      // Reads a record by parsing the input buffer, and converting extracted
       // fields to output tensors as we go.
-      //
-      // When this function is called, pos_ should be the index of the first
-      // character of the record in buffer_, or past the end of the buffer.
-      // Note: ctx and out_tensors are only used in this function
-      // when fields are included in the record.
-      Status ReadRecord(IteratorContext* ctx, std::vector<Tensor>* out_tensors,
-                        bool select_all, const std::vector<int64>& selected)
+      Status ReadRecord(IteratorContext* ctx, std::vector<Tensor>* out_tensors)
           EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-        if (pos_ >= buffer_.size()) {
-          // At the end of the file, this will return errors::OutOfRange
-          TF_RETURN_IF_ERROR(FillBuffer(&buffer_));
-          pos_ = 0;
-        }
-
-        // The first character may be \n if this is the continuation of a
-        // \r\n linebreak between this and the previous record. If so, skip it.
-
-        bool end_of_record = false;  // Keep track of when we find \n, \r or EOF
-        size_t num_parsed = 0;
-        size_t num_selected_parsed = 0;
-
-        Status result = Status::OK();
-
-        while (!end_of_record) {  // Read till we reach \n, \r or EOF
-          bool include =
-              select_all || (num_selected_parsed < selected.size() &&
-                             selected[num_selected_parsed] == num_parsed);
-
-          // Don't fail fast, so that the next call to GetNext may still return
-          // a valid record
-          result.Update(
-              ParseOneField(ctx, out_tensors, &end_of_record, include));
-
-          num_parsed++;
-          if (include) num_selected_parsed++;
-        }
-
-        return result;
-      }
-
-      // Parses one field from position pos_ in the buffer. Fields are
-      // delimited by delim, CRLF, or EOF. Advances pos_ to the first char of
-      // the next field.
-      Status ParseOneField(IteratorContext* ctx,
-                           std::vector<Tensor>* out_tensors,
-                           bool* end_of_record, bool include)
-          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-        if (pos_ >= buffer_.size()) {
-          // If we get here, this means the previous field's end coincided
-          // with the end of the buffer. We can fill the buffer without abandon.
-          Status s = FillBuffer(&buffer_);
-
-          if (errors::IsOutOfRange(s)) {
-            // Reached EOF, and last field is empty
-            *end_of_record = true;
-            if (include) {
-              return FieldToOutput(ctx, StringPiece(), out_tensors);
-            } else {
-              return Status::OK();
-            }
-          } else if (!s.ok()) {
-            return s;  // Surface other errors back to caller
+        // Extracts fields from line(s) from the buffered input stream.
+        out_tensors->reserve(dataset()->record_defaults_.size());
+
+        string input;
+        TF_RETURN_IF_ERROR(buffered_input_stream_->ReadLine(&input));
+
+        size_t current_idx = 0;
+        size_t num_fields_parsed = 0;
+        size_t selector_idx = 0;  // Keep track of index into select_cols
+
+        while (current_idx < input.size()) {
+          // In each iteration, parse one field
+          if (input[current_idx] == '\n' || input[current_idx] == '\r') {
+            // This should never happen, because buffered input reader splits
+            // input on newlines.
+            return errors::InvalidArgument("Parsing error.");
           }
 
-          pos_ = 0;
-        }
-
-        if (dataset()->use_quote_delim_ && buffer_[pos_] == '"') {
-          return ParseQuotedField(ctx, out_tensors, end_of_record, include);
-        }
-
-        return ParseUnquotedField(ctx, out_tensors, end_of_record, include);
-      }
-
-      // For keeping track of relevant parts of a field from a previous buffer
-      struct Piece {
-        size_t start;
-        size_t len;
-        string buffer;
-
-        Piece(string buffer, size_t start, size_t len)
-            : start(start), len(len), buffer(std::move(buffer)) {}
-      };
-
-      // Given that pos_ exceeds the buffer, saves the relevant part of the
-      // current buffer (if necessary), fills the buffer, and resets indices to
-      // 0.
-      Status SaveAndFillBuffer(std::vector<Piece>* earlier_pieces,
-                               size_t* start, bool include)
-          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-        string temp_buffer;
-
-        buffer_.swap(temp_buffer);
-        if (include && pos_ > *start) {
-          earlier_pieces->push_back(
-              Piece(std::move(temp_buffer), *start, pos_ - *start));
-        }
-        pos_ = 0;
-        *start = 0;
-        return FillBuffer(&buffer_);
-      }
+          bool quoted = false;
+          bool include =
+              (dataset()->select_all_cols_ ||
+               dataset()->select_cols_[selector_idx] == num_fields_parsed);
 
-      // Parses unquoted field from position pos_ in the buffer. Continually
-      // reads from buffer until end of field is reached (delim, CRLF, or EOF).
-      // Advances pos_ to keep track of our position in the buffer as we go,
-      // stopping at the first character of the next field.
-      Status ParseQuotedField(IteratorContext* ctx,
-                              std::vector<Tensor>* out_tensors,
-                              bool* end_of_record, bool include)
-          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-        std::vector<Piece> earlier_pieces;
-        size_t start = pos_;
-        pos_++;  // Starting quotation mark
-
-        while (true) {  // Each iter reads 1 char, filling buffer if necessary
-          if (pos_ >= buffer_.size()) {
-            Status s = SaveAndFillBuffer(&earlier_pieces, &start, include);
-            if (errors::IsOutOfRange(s)) {
-              return errors::InvalidArgument(
-                  "Reached end of file without closing quoted field in "
-                  "record");
-            } else if (!s.ok()) {
-              return s;  // Surface all other errors to caller
-            }
+          if (dataset()->use_quote_delim_ && input[current_idx] == '"') {
+            quoted = true;
+            current_idx++;
           }
 
-          char ch = buffer_[pos_];
-          if (ch == '"') {
-            // When we encounter a quote, we look ahead to the next character to
-            // decide what to do
-            pos_++;
-            if (pos_ >= buffer_.size()) {
-              Status s = SaveAndFillBuffer(&earlier_pieces, &start, include);
-              if (errors::IsOutOfRange(s)) {
-                // This was the last field. We are done
-                *end_of_record = true;
-                return QuotedFieldToOutput(ctx, StringPiece(), out_tensors,
-                                           earlier_pieces, include);
-              } else if (!s.ok()) {
-                return s;
+          // Parse the body of the field
+          string field;
+          if (!quoted) {
+            while (current_idx < input.size() &&
+                   input[current_idx] != dataset()->delim_) {
+              if ((dataset()->use_quote_delim_ && input[current_idx] == '"') ||
+                  input[current_idx] == '\n' || input[current_idx] == '\r') {
+                return errors::InvalidArgument(
+                    "Unquoted fields cannot have quotes/CRLFs inside");
               }
-            }
-
-            char next = buffer_[pos_];
-            pos_++;
-            if (next == dataset()->delim_) {
-              return QuotedFieldToOutput(
-                  ctx, StringPiece(&buffer_[start], pos_ - 1 - start),
-                  out_tensors, earlier_pieces, include);
-
-            } else if (next == '\n' || next == '\r') {
-              *end_of_record = true;
-              Status s = QuotedFieldToOutput(
-                  ctx, StringPiece(&buffer_[start], pos_ - 1 - start),
-                  out_tensors, earlier_pieces, include);
-              if (next == '\r') SkipNewLineIfNecessary();
-              return s;
-            } else if (next != '"') {
-              return errors::InvalidArgument(
-                  "Quote inside a string has to be escaped by another quote");
-            }
+              if (include) field += input[current_idx];
+              current_idx++;
+            }  // Exit condition: end of input, or current index at delim
 
+            // Go to next field or the end
+            current_idx++;
           } else {
-            pos_++;
-          }
-        }
-      }
-
-      // Converts quoted field to an output tensor, removing the starting
-      // and ending quotes from it and unescaping double quotations if
-      // necessary.
-      Status QuotedFieldToOutput(IteratorContext* ctx, StringPiece field,
-                                 std::vector<Tensor>* out_tensors,
-                                 const std::vector<Piece>& earlier_pieces,
-                                 bool include) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-        if (!include) return Status::OK();
-
-        if (earlier_pieces.empty()) {
-          if (field.find('\"', 1) == field.size() - 1) {
-            // `field` contains no escaped quotation marks.
-            // Exclude framing quotation marks
-            field.remove_prefix(1);
-            field.remove_suffix(1);
-            return FieldToOutput(ctx, field, out_tensors);
-          }
-        }
-        string field_complete;
-        size_t str_len = field.size();
-        for (const Piece& p : earlier_pieces) {
-          str_len += p.len;
-        }
-        field_complete.reserve(str_len);
-
-        // This bool flips every time we see a quote, so that we skip the second
-        // quote of every pair of adjacent quotes in the field. We need to track
-        // this across iterations of the for loop because adjacent double quotes
-        // may be in different buffers. Initialize to true because we also skip
-        // the opening quotation mark of the quoted field.
-        bool skip_next_quote = true;
-        for (const Piece& p : earlier_pieces) {
-          AppendUnescapedPiece(StringPiece(&p.buffer[p.start], p.len),
-                               &field_complete, &skip_next_quote);
-        }
-        AppendUnescapedPiece(field, &field_complete, &skip_next_quote);
-        StringPiece result = StringPiece(field_complete);
-        result.remove_suffix(1);  // Skip final quote
-
-        return FieldToOutput(ctx, result, out_tensors);
-      }
-
-      void AppendUnescapedPiece(StringPiece piece, string* field_complete,
-                                bool* skip_next_quote) {
-        size_t from = 0;
-        size_t found = piece.find('\"', from);
-        while (found != string::npos) {
-          if (!*skip_next_quote) {
-            // This is the first quote in a pair of adjacent double quotes
-            field_complete->append(piece.data() + from, found + 1 - from);
-          }
-          *skip_next_quote = !*skip_next_quote;
-          from = found + 1;
-          found = piece.find('\"', from);
-        }
-        // Include the chunk after the last quotation mark in the string
-        if (from < piece.size()) {
-          field_complete->append(piece.data() + from, piece.size() - from);
-        }
-      }
-
-      // Parses unquoted field from position pos_ in the buffer. Continually
-      // reads from buffer until end of field is reached (delim, CRLF, or EOF).
-      // Advances pos_ to keep track of our position in the buffer as we go,
-      // stopping at the first character of the next field.
-      Status ParseUnquotedField(IteratorContext* ctx,
-                                std::vector<Tensor>* out_tensors,
-                                bool* end_of_record, bool include)
-          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-        std::vector<Piece> earlier_pieces;
-        size_t start = pos_;
-        while (true) {  // Each iter reads 1 char, filling buffer if necessary
-          if (pos_ >= buffer_.size()) {
-            Status s = SaveAndFillBuffer(&earlier_pieces, &start, include);
-            // Handle errors
-            if (errors::IsOutOfRange(s)) {
-              // Whatever we have is the last field of the last record
-              *end_of_record = true;
-              return UnquotedFieldToOutput(
-                  ctx, StringPiece(&buffer_[start], pos_ - start), out_tensors,
-                  earlier_pieces, include);
-            } else if (!s.ok()) {
-              return s;  // Surface all other errors to caller
+            // Quoted field needs to be ended with '"' and delim or end
+            while (true) {
+              if (current_idx >= input.size() - 1 || input.empty()) {
+                if (current_idx == input.size() - 1 &&
+                    input[current_idx] == '"') {
+                  // We're at the end of the input, and the quote terminates the
+                  // record. Go to end.
+                  current_idx++;
+                  break;
+                }
+                // If there's no terminating quote, it means our buffered record
+                // line reader split a record up. This can happen if there is a
+                // newline encased in quotes. The next line is also part of the
+                // record, so we read it and reset the index.
+                if (include && current_idx == input.size() - 1) {
+                  // TODO(rachelim): Instead of building up a string, keep track
+                  //  of terminal indices (or starting char* and length)
+                  // Also look into using /lib/strings/Scanner
+                  field += input[current_idx];
+                }
+                if (include) {
+                  field += '\n';
+                }
+                current_idx = 0;
+                Status s = buffered_input_stream_->ReadLine(&input);
+                if (!s.ok()) {
+                  return errors::InvalidArgument(
+                      "Quoted field has to end with quote followed by delim, "
+                      "CRLF, or EOF");
+                }
+              } else if (input[current_idx] == '"' &&
+                         input[current_idx + 1] == dataset()->delim_) {
+                // End of field, go to next field or end
+                current_idx += 2;
+                break;
+              } else if (input[current_idx] == '"') {
+                // Current char is a quote. Since we're not at end of field,
+                // the next character must also be a quote.
+                if (input[current_idx + 1] != '"') {
+                  return errors::InvalidArgument(
+                      "Quote inside a string has to be escaped by another "
+                      "quote");
+                }
+                if (include) field += '"';
+                current_idx += 2;
+              } else {
+                if (include) field += input[current_idx];
+                current_idx++;
+              }
             }
           }
 
-          char ch = buffer_[pos_];
+          num_fields_parsed++;
 
-          if (ch == dataset()->delim_) {
-            Status s = UnquotedFieldToOutput(
-                ctx, StringPiece(&buffer_[start], pos_ - start), out_tensors,
-                earlier_pieces, include);
-            pos_++;
-            return s;
-          }
-          if (ch == '\n' || ch == '\r') {
-            // need special case to skip over first \n of record if the line
-            // breaks are \r\n
-            Status s = UnquotedFieldToOutput(
-                ctx, StringPiece(&buffer_[start], pos_ - start), out_tensors,
-                earlier_pieces, include);
-            *end_of_record = true;
-            pos_++;
-            if (ch == '\r') SkipNewLineIfNecessary();
-            return s;
-          }
-          if (dataset()->use_quote_delim_ && ch == '"') {
-            // Advance pos_ to the next field anyway so that we can ignore
-            // errors gracefully if required. The caller of this will be able to
-            // call ParseOneField and continue with the rest of the record.
-            AdvanceToNextField(end_of_record);
-            return errors::InvalidArgument(
-                "Unquoted fields cannot have quotes inside");
+          if (include) {
+            // Add the tensor to the result
+            TF_RETURN_IF_ERROR(FieldToOutput(ctx, std::move(field),
+                                             selector_idx, out_tensors));
+            selector_idx++;
+            // Terminate early if we have all the fields we want
+            if (selector_idx == dataset()->select_cols_.size())
+              return Status::OK();
           }
-          // Otherwise, go to next character
-          pos_++;
+        }  // Exit condition: current_idx has reached the end of record
+
+        // Check if the last field is empty, and include it if necessary
+        bool include =
+            (dataset()->select_all_cols_ ||
+             dataset()->select_cols_[selector_idx] == num_fields_parsed);
+        if (include && !input.empty() &&
+            input[input.size() - 1] == dataset()->delim_) {
+          TF_RETURN_IF_ERROR(
+              FieldToOutput(ctx, string(), selector_idx, out_tensors));
         }
-      }
-
-      // Advances pos_ to the start of the next field, as delimited by delim,
-      // CRLF, or EOF, ignoring errors, and not keeping track of characters in
-      // the current field.
-      void AdvanceToNextField(bool* end_of_record)
-          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-        while (true) {
-          if (pos_ >= buffer_.size()) {
-            Status s = FillBuffer(&buffer_);
-            pos_ = 0;
-            if (!s.ok()) {
-              *end_of_record = true;
-              return;
-            }
-          }
 
-          char ch = buffer_[pos_];
-          pos_++;
-
-          if (ch == dataset()->delim_) {
-            return;
-          }
-
-          if (ch == '\n' || ch == '\r') {
-            *end_of_record = true;
-            if (ch == '\r') SkipNewLineIfNecessary();
-            return;
-          }
-        }
-      }
-
-      Status FillBuffer(string* result) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-        result->clear();
-        Status s = input_stream_->ReadNBytes(dataset()->buffer_size_, result);
-
-        if (errors::IsOutOfRange(s) && !result->empty()) {
-          // Ignore OutOfRange error when ReadNBytes read < N bytes.
-          return Status::OK();
+        // Check that number of fields matches
+        if (out_tensors->size() != dataset()->out_type_.size()) {
+          return errors::InvalidArgument("Expect ", dataset()->out_type_.size(),
+                                         " fields but have ",
+                                         out_tensors->size(), " in record");
         }
-        return s;
+        return Status::OK();
       }
 
-      // Given a field, converts it to the right output tensor type
-      Status FieldToOutput(IteratorContext* ctx, StringPiece field,
+      // Given a string field, and its index in the output,
+      // converts it to a Tensor of the right type and adds it to the
+      // out_tensors vector.
+      Status FieldToOutput(IteratorContext* ctx, string field,
+                           size_t output_idx,
                            std::vector<Tensor>* out_tensors) {
-        size_t output_idx = out_tensors->size();
         if (output_idx >= dataset()->out_type_.size()) {
           // We can get here if we're selecting all columns, but the number of
           // fields exceeds the number of defaults provided
@@ -602,7 +397,7 @@ class CSVDatasetOp : public DatasetOpKernel {
                   dataset()->record_defaults_[output_idx].flat<float>()(0);
             } else {
               float value;
-              if (!strings::safe_strtof(field, &value)) {
+              if (!strings::safe_strtof(field.c_str(), &value)) {
                 return errors::InvalidArgument(
                     "Field ", output_idx,
                     " in record is not a valid float: ", field);
@@ -617,7 +412,7 @@ class CSVDatasetOp : public DatasetOpKernel {
                   dataset()->record_defaults_[output_idx].flat<double>()(0);
             } else {
               double value;
-              if (!strings::safe_strtod(field, &value)) {
+              if (!strings::safe_strtod(field.c_str(), &value)) {
                 return errors::InvalidArgument(
                     "Field ", output_idx,
                     " in record is not a valid double: ", field);
@@ -631,7 +426,7 @@ class CSVDatasetOp : public DatasetOpKernel {
               component.scalar<string>()() =
                   dataset()->record_defaults_[output_idx].flat<string>()(0);
             } else {
-              component.scalar<string>()() = field.ToString();
+              component.scalar<string>()() = std::move(field);
             }
             break;
           }
@@ -644,50 +439,6 @@ class CSVDatasetOp : public DatasetOpKernel {
         return Status::OK();
       }
 
-      // Records can be delimited by "\r\n" line breaks. When we encounter a
-      // '\r', we have to check the next character to see if it is part of the
-      // linebreak, and ignore it if so.
-      void SkipNewLineIfNecessary() EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-        if (pos_ >= buffer_.size()) {
-          Status s = FillBuffer(&buffer_);
-          pos_ = 0;
-          // If we failed to fill buffer, it doesn't matter because we're done
-          // with the record
-          if (!s.ok()) return;
-        }
-        if (buffer_[pos_] == '\n') {
-          pos_++;
-        }
-      }
-
-      // Given a string field, and its index in the output,
-      // converts it to a Tensor of the right type and adds it to the
-      // out_tensors vector.
-      Status UnquotedFieldToOutput(IteratorContext* ctx, StringPiece field,
-                                   std::vector<Tensor>* out_tensors,
-                                   const std::vector<Piece>& earlier_pieces,
-                                   bool include) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-        if (!include) return Status::OK();
-
-        if (earlier_pieces.empty()) {
-          return FieldToOutput(ctx, field, out_tensors);
-        }
-
-        size_t str_len = field.size();
-        for (const Piece& p : earlier_pieces) {
-          str_len += p.len;
-        }
-        string field_complete;
-        field_complete.reserve(str_len);
-
-        for (const Piece& p : earlier_pieces) {
-          field_complete.append(p.buffer, p.start, p.len);
-        }
-
-        field_complete.append(field.data(), field.size());
-        return FieldToOutput(ctx, field_complete, out_tensors);
-      }
-
       // Sets up reader streams to read from the file at `current_file_index_`.
       Status SetupStreamsLocked(Env* env) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
         if (current_file_index_ >= dataset()->filenames_.size()) {
@@ -701,18 +452,16 @@ class CSVDatasetOp : public DatasetOpKernel {
             dataset()->filenames_[current_file_index_], &file_));
         input_stream_.reset(
             new io::RandomAccessInputStream(file_.get(), false));
-        buffer_.clear();
-        pos_ = 0;
+        // TODO(rachelim): Maintain our own buffer so we don't read every record
+        //   twice
+        buffered_input_stream_.reset(new io::BufferedInputStream(
+            input_stream_.get(), dataset()->buffer_size_, false));
         if (dataset()->header_) {
-          // Read one line, but don't include it. Pass nullptrs as dummy
-          // pointers to objects that shouldn't be invoked anyway
-          // We need to process this as a record here instead of just finding
-          // the first newline because it might contain quoted fields with
-          // newlines in the header as well
-          std::vector<int64> empty;
-          Status s = ReadRecord(nullptr, nullptr, false, empty);
-          if (!s.ok()) {
-            return errors::InvalidArgument("Can't read header of file");
+          // Ignore header line
+          string str;
+          Status s = buffered_input_stream_->ReadLine(&str);
+          if (errors::IsOutOfRange(s)) {
+            return errors::InvalidArgument("Can't read header of empty file");
           }
         }
         return Status::OK();
@@ -721,15 +470,15 @@ class CSVDatasetOp : public DatasetOpKernel {
       // Resets all reader streams.
       void ResetStreamsLocked() EXCLUSIVE_LOCKS_REQUIRED(mu_) {
         input_stream_.reset();
+        buffered_input_stream_.reset();
         file_.reset();
       }
 
       mutex mu_;
-      string buffer_ GUARDED_BY(mu_);  // Maintain our own buffer
-      size_t pos_ GUARDED_BY(
-          mu_);  // Index into the buffer must be maintained between iters
       std::unique_ptr<io::RandomAccessInputStream> input_stream_
           GUARDED_BY(mu_);
+      std::unique_ptr<io::BufferedInputStream> buffered_input_stream_
+          GUARDED_BY(mu_);
       size_t current_file_index_ GUARDED_BY(mu_) = 0;
       std::unique_ptr<RandomAccessFile> file_
           GUARDED_BY(mu_);  // must outlive input_stream_
@@ -742,6 +491,7 @@ class CSVDatasetOp : public DatasetOpKernel {
     const std::vector<PartialTensorShape> output_shapes_;
     const std::vector<Tensor> record_defaults_;
     const std::vector<int64> select_cols_;
+    const bool select_all_cols_;
     const bool use_quote_delim_;
     const char delim_;
     const string na_value_;
diff --git a/tensorflow/contrib/data/python/kernel_tests/BUILD b/tensorflow/contrib/data/python/kernel_tests/BUILD
index 523d1f2f71..c483a43769 100644
--- a/tensorflow/contrib/data/python/kernel_tests/BUILD
+++ b/tensorflow/contrib/data/python/kernel_tests/BUILD
@@ -128,7 +128,6 @@ py_test(
     tags = ["no_pip"],
     deps = [
         ":dataset_serialization_test",
-        "//tensorflow/contrib/data/python/ops:error_ops",
         "//tensorflow/contrib/data/python/ops:readers",
         "//third_party/py/numpy",
     ],
diff --git a/tensorflow/contrib/data/python/kernel_tests/csv_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/csv_dataset_op_test.py
index 74b90ec7d1..8c138c7081 100644
--- a/tensorflow/contrib/data/python/kernel_tests/csv_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/csv_dataset_op_test.py
@@ -25,7 +25,6 @@ import time
 
 import numpy as np
 
-from tensorflow.contrib.data.python.ops import error_ops
 from tensorflow.contrib.data.python.ops import readers
 from tensorflow.python.client import session
 from tensorflow.python.data.ops import readers as core_readers
@@ -62,12 +61,12 @@ class CsvDatasetOpTest(test.TestCase):
         op2 = sess.run(next2)
         self.assertAllEqual(op1, op2)
 
-  def setup_files(self, inputs, linebreak='\n'):
+  def setup_files(self, inputs):
     filenames = []
     for i, ip in enumerate(inputs):
-      fn = os.path.join(self.get_temp_dir(), 'temp_%d.csv' % i)
-      with open(fn, 'wb') as f:
-        f.write(linebreak.join(ip).encode('utf-8'))
+      fn = os.path.join(self.get_temp_dir(), 'temp_%d.txt' % i)
+      with open(fn, 'w') as f:
+        f.write('\n'.join(ip))
       filenames.append(fn)
     return filenames
 
@@ -87,47 +86,38 @@ class CsvDatasetOpTest(test.TestCase):
           inputs, **kwargs)
       self._assert_datasets_equal(g, dataset_actual, dataset_expected)
 
-  def _verify_output_or_err(self,
-                            sess,
-                            dataset,
-                            expected_output=None,
-                            expected_err_re=None):
-    nxt = dataset.make_one_shot_iterator().get_next()
-    if expected_err_re is None:
-      # Verify that output is expected, without errors
-      expected_output = [[
-          v.encode('utf-8') if isinstance(v, str) else v for v in op
-      ] for op in expected_output]
-      for value in expected_output:
-        op = sess.run(nxt)
-        self.assertAllEqual(op, value)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(nxt)
-    else:
-      # Verify that OpError is produced as expected
-      with self.assertRaisesOpError(expected_err_re):
-        while True:
-          try:
-            sess.run(nxt)
-          except errors.OutOfRangeError:
-            break
-
   def _test_dataset(self,
                     inputs,
                     expected_output=None,
                     expected_err_re=None,
-                    linebreak='\n',
                     **kwargs):
     """Checks that elements produced by CsvDataset match expected output."""
     # Convert str type because py3 tf strings are bytestrings
-    filenames = self.setup_files(inputs, linebreak)
+    filenames = self.setup_files(inputs)
     with ops.Graph().as_default() as g:
       with self.test_session(graph=g) as sess:
         dataset = readers.CsvDataset(filenames, **kwargs)
-        self._verify_output_or_err(sess, dataset, expected_output,
-                                   expected_err_re)
-
-  def testCsvDataset_requiredFields(self):
+        nxt = dataset.make_one_shot_iterator().get_next()
+        if expected_err_re is None:
+          # Verify that output is expected, without errors
+          expected_output = [[
+              v.encode('utf-8') if isinstance(v, str) else v for v in op
+          ] for op in expected_output]
+          for value in expected_output:
+            op = sess.run(nxt)
+            self.assertAllEqual(op, value)
+          with self.assertRaises(errors.OutOfRangeError):
+            sess.run(nxt)
+        else:
+          # Verify that OpError is produced as expected
+          with self.assertRaisesOpError(expected_err_re):
+            while True:
+              try:
+                sess.run(nxt)
+              except errors.OutOfRangeError:
+                break
+
+  def testCsvDataset_floatRequired(self):
     record_defaults = [[]] * 4
     inputs = [['1,2,3,4']]
     self._test_by_comparison(inputs, record_defaults=record_defaults)
@@ -147,36 +137,10 @@ class CsvDatasetOpTest(test.TestCase):
     inputs = [['1.0,2.1,hello,4.3', '5.4,6.5,goodbye,8.7']]
     self._test_by_comparison(inputs, record_defaults=record_defaults)
 
-  def testCsvDataset_withEmptyFields(self):
-    record_defaults = [[0]] * 4
-    inputs = [[',,,', '1,1,1,', ',2,2,2']]
-    self._test_dataset(
-        inputs, [[0, 0, 0, 0], [1, 1, 1, 0], [0, 2, 2, 2]],
-        record_defaults=record_defaults)
-
-  def testCsvDataset_errWithUnquotedQuotes(self):
-    record_defaults = [['']] * 3
-    inputs = [['1,2"3,4']]
-    self._test_dataset(
-        inputs,
-        expected_err_re='Unquoted fields cannot have quotes inside',
-        record_defaults=record_defaults)
-
-  def testCsvDataset_ignoreErrWithUnquotedQuotes(self):
-    record_defaults = [['']] * 3
-    inputs = [['1,2"3,4', 'a,b,c"d', 'e,f,g']]
-    filenames = self.setup_files(inputs)
-    with ops.Graph().as_default() as g:
-      with self.test_session(graph=g) as sess:
-        dataset = readers.CsvDataset(filenames, record_defaults=record_defaults)
-        dataset = dataset.apply(error_ops.ignore_errors())
-        self._verify_output_or_err(sess, dataset, [['e', 'f', 'g']])
-
-  def testCsvDataset_withNoQuoteDelimAndUnquotedQuotes(self):
-    record_defaults = [['']] * 3
-    inputs = [['1,2"3,4']]
-    self._test_by_comparison(
-        inputs, record_defaults=record_defaults, use_quote_delim=False)
+  def testCsvDataset_withQuoted(self):
+    record_defaults = [['']] * 4
+    inputs = [['1.0,2.1,"hello, it is me",4.3', '5.4,6.5,goodbye,8.7']]
+    self._test_by_comparison(inputs, record_defaults=record_defaults)
 
   def testCsvDataset_mixedTypes(self):
     record_defaults = [
@@ -200,6 +164,11 @@ class CsvDatasetOpTest(test.TestCase):
     self._test_by_comparison(
         inputs, record_defaults=record_defaults, field_delim=':')
 
+  def testCsvDataset_withEmptyValues(self):
+    record_defaults = [[0]] * 4
+    inputs = [['1,,3,4', ',6,7,8']]
+    self._test_by_comparison(inputs, record_defaults=record_defaults)
+
   def testCsvDataset_withNaValue(self):
     record_defaults = [[0]] * 4
     inputs = [['1,NA,3,4', 'NA,6,7,8']]
@@ -207,8 +176,8 @@ class CsvDatasetOpTest(test.TestCase):
         inputs, record_defaults=record_defaults, na_value='NA')
 
   def testCsvDataset_withSelectCols(self):
-    record_defaults = [['']] * 2
-    inputs = [['1,2,3,4', '"5","6","7","8"']]
+    record_defaults = [[0]] * 2
+    inputs = [['1,2,3,4', '5,6,7,8']]
     self._test_by_comparison(
         inputs, record_defaults=record_defaults, select_cols=[1, 2])
 
@@ -221,17 +190,27 @@ class CsvDatasetOpTest(test.TestCase):
         record_defaults=record_defaults,
         select_cols=[3, 4])
 
-  def testCsvDataset_withOneCol(self):
-    record_defaults = [['NA']]
-    inputs = [['0', '', '2']]
-    self._test_dataset(
-        inputs, [['0'], ['NA'], ['2']], record_defaults=record_defaults)
-
   def testCsvDataset_withMultipleFiles(self):
     record_defaults = [[0]] * 4
     inputs = [['1,2,3,4', '5,6,7,8'], ['5,6,7,8']]
     self._test_by_comparison(inputs, record_defaults=record_defaults)
 
+  def testCsvDataset_withNewLine(self):
+    # In this case, we expect it to behave differently from
+    # TextLineDataset->map(decode_csv) since that flow has bugs
+    record_defaults = [['']] * 4
+    inputs = [['a,b,"""c""\n0","d\ne"', 'f,g,h,i']]
+    expected = [['a', 'b', '"c"\n0', 'd\ne'], ['f', 'g', 'h', 'i']]
+    self._test_dataset(inputs, expected, record_defaults=record_defaults)
+
+  def testCsvDataset_withMultipleNewLines(self):
+    # In this case, we expect it to behave differently from
+    # TextLineDataset->map(decode_csv) since that flow has bugs
+    record_defaults = [['']] * 4
+    inputs = [['a,"b\n\nx","""c""\n \n0","d\ne"', 'f,g,h,i']]
+    expected = [['a', 'b\n\nx', '"c"\n \n0', 'd\ne'], ['f', 'g', 'h', 'i']]
+    self._test_dataset(inputs, expected, record_defaults=record_defaults)
+
   def testCsvDataset_withLeadingAndTrailingSpaces(self):
     record_defaults = [[0.0]] * 4
     inputs = [['0, 1, 2, 3']]
@@ -287,10 +266,9 @@ class CsvDatasetOpTest(test.TestCase):
   def testCsvDataset_errorWithHeaderEmptyFile(self):
     record_defaults = [[0]] * 2
     inputs = [[]]
-    expected_err_re = "Can't read header of file"
     self._test_dataset(
         inputs,
-        expected_err_re=expected_err_re,
+        expected_err_re="Can't read header of empty file",
         record_defaults=record_defaults,
         header=True,
     )
@@ -306,7 +284,7 @@ class CsvDatasetOpTest(test.TestCase):
     inputs = [['', '1,2']]  # First record is empty
     self._test_dataset(
         inputs,
-        expected_err_re='Expect 2 fields but have 1 in record',
+        expected_err_re='Expect 2 fields but have 0 in record',
         record_defaults=record_defaults)
 
   def testCsvDataset_withChainedOps(self):
@@ -323,7 +301,7 @@ class CsvDatasetOpTest(test.TestCase):
 
   def testCsvDataset_withTypeDefaults(self):
     # Testing using dtypes as record_defaults for required fields
-    record_defaults = [dtypes.float32, [0.0]]
+    record_defaults = [dtypes.float32, dtypes.float32]
     inputs = [['1.0,2.0', '3.0,4.0']]
     self._test_dataset(
         inputs,
@@ -348,162 +326,6 @@ class CsvDatasetOpTest(test.TestCase):
 
     self.assertEqual(result, sorted(result))
 
-## The following tests exercise parsing logic for quoted fields
-
-  def testCsvDataset_withQuoted(self):
-    record_defaults = [['']] * 4
-    inputs = [['"a","b","c :)","d"', '"e","f","g :(","h"']]
-    self._test_by_comparison(inputs, record_defaults=record_defaults)
-
-  def testCsvDataset_withOneColAndQuotes(self):
-    record_defaults = [['']]
-    inputs = [['"0"', '"1"', '"2"']]
-    self._test_dataset(
-        inputs, [['0'], ['1'], ['2']], record_defaults=record_defaults)
-
-  def testCsvDataset_withNewLine(self):
-    # In this case, we expect it to behave differently from
-    # TextLineDataset->map(decode_csv) since that flow has bugs
-    record_defaults = [['']] * 4
-    inputs = [['a,b,"""c""\n0","d\ne"', 'f,g,h,i']]
-    expected = [['a', 'b', '"c"\n0', 'd\ne'], ['f', 'g', 'h', 'i']]
-    self._test_dataset(inputs, expected, record_defaults=record_defaults)
-
-  def testCsvDataset_withNewLineInUnselectedCol(self):
-    record_defaults = [['']]
-    inputs = [['1,"2\n3",4', '5,6,7']]
-    self._test_dataset(
-        inputs,
-        expected_output=[['1'], ['5']],
-        record_defaults=record_defaults,
-        select_cols=[0])
-
-  def testCsvDataset_withMultipleNewLines(self):
-    # In this case, we expect it to behave differently from
-    # TextLineDataset->map(decode_csv) since that flow has bugs
-    record_defaults = [['']] * 4
-    inputs = [['a,"b\n\nx","""c""\n \n0","d\ne"', 'f,g,h,i']]
-    expected = [['a', 'b\n\nx', '"c"\n \n0', 'd\ne'], ['f', 'g', 'h', 'i']]
-    self._test_dataset(inputs, expected, record_defaults=record_defaults)
-
-  def testCsvDataset_errorWithTerminateMidRecord(self):
-    record_defaults = [['']] * 4
-    inputs = [['a,b,c,"a']]
-    self._test_dataset(
-        inputs,
-        expected_err_re=
-        'Reached end of file without closing quoted field in record',
-        record_defaults=record_defaults)
-
-  def testCsvDataset_withEscapedQuotes(self):
-    record_defaults = [['']] * 4
-    inputs = [['1.0,2.1,"she said: ""hello""",4.3', '5.4,6.5,goodbye,8.7']]
-    self._test_by_comparison(inputs, record_defaults=record_defaults)
-
-
-## Testing that parsing works with all buffer sizes, quoted/unquoted fields,
-## and different types of line breaks
-
-  def testCsvDataset_withInvalidBufferSize(self):
-    record_defaults = [['']] * 4
-    inputs = [['a,b,c,d']]
-    self._test_dataset(
-        inputs,
-        expected_err_re='buffer_size should be positive',
-        record_defaults=record_defaults,
-        buffer_size=0)
-
-  def testCsvDataset_withBufferSize(self):
-    record_defaults = [['NA']] * 3
-    inputs = [['abc,def,ghi', '0,1,2', ',,']]
-    expected = [['abc', 'def', 'ghi'], ['0', '1', '2'], ['NA', 'NA', 'NA']]
-    for i in range(20):
-      # Test a range of buffer sizes that should all work
-      self._test_dataset(
-          inputs, expected, record_defaults=record_defaults, buffer_size=i + 1)
-
-  def testCsvDataset_withCR(self):
-    # Test that when the line separator is '\r', parsing works with all buffer
-    # sizes
-    record_defaults = [['NA']] * 3
-    inputs = [['abc,def,ghi', '0,1,2', ',,']]
-    expected = [['abc', 'def', 'ghi'], ['0', '1', '2'], ['NA', 'NA', 'NA']]
-    for i in range(20):
-      # Test a range of buffer sizes that should all work
-      self._test_dataset(
-          inputs,
-          expected,
-          linebreak='\r',
-          record_defaults=record_defaults,
-          buffer_size=i + 1)
-
-  def testCsvDataset_withCRLF(self):
-    # Test that when the line separator is '\r\n', parsing works with all buffer
-    # sizes
-    record_defaults = [['NA']] * 3
-    inputs = [['abc,def,ghi', '0,1,2', ',,']]
-    expected = [['abc', 'def', 'ghi'], ['0', '1', '2'], ['NA', 'NA', 'NA']]
-    for i in range(20):
-      # Test a range of buffer sizes that should all work
-      self._test_dataset(
-          inputs,
-          expected,
-          linebreak='\r\n',
-          record_defaults=record_defaults,
-          buffer_size=i + 1)
-
-  def testCsvDataset_withBufferSizeAndQuoted(self):
-    record_defaults = [['NA']] * 3
-    inputs = [['"\n\n\n","\r\r\r","abc"', '"0","1","2"', '"","",""']]
-    expected = [['\n\n\n', '\r\r\r', 'abc'], ['0', '1', '2'],
-                ['NA', 'NA', 'NA']]
-    for i in range(20):
-      # Test a range of buffer sizes that should all work
-      self._test_dataset(
-          inputs,
-          expected,
-          linebreak='\n',
-          record_defaults=record_defaults,
-          buffer_size=i + 1)
-    self._test_dataset(
-        inputs, expected, linebreak='\n', record_defaults=record_defaults)
-
-  def testCsvDataset_withCRAndQuoted(self):
-    # Test that when the line separator is '\r', parsing works with all buffer
-    # sizes
-    record_defaults = [['NA']] * 3
-    inputs = [['"\n\n\n","\r\r\r","abc"', '"0","1","2"', '"","",""']]
-    expected = [['\n\n\n', '\r\r\r', 'abc'], ['0', '1', '2'],
-                ['NA', 'NA', 'NA']]
-    for i in range(20):
-      # Test a range of buffer sizes that should all work
-      self._test_dataset(
-          inputs,
-          expected,
-          linebreak='\r',
-          record_defaults=record_defaults,
-          buffer_size=i + 1)
-    self._test_dataset(
-        inputs, expected, linebreak='\r', record_defaults=record_defaults)
-
-  def testCsvDataset_withCRLFAndQuoted(self):
-    # Test that when the line separator is '\r\n', parsing works with all buffer
-    # sizes
-    record_defaults = [['NA']] * 3
-    inputs = [['"\n\n\n","\r\r\r","abc"', '"0","1","2"', '"","",""']]
-    expected = [['\n\n\n', '\r\r\r', 'abc'], ['0', '1', '2'],
-                ['NA', 'NA', 'NA']]
-    for i in range(20):
-      # Test a range of buffer sizes that should all work
-      self._test_dataset(
-          inputs,
-          expected,
-          linebreak='\r\n',
-          record_defaults=record_defaults,
-          buffer_size=i + 1)
-    self._test_dataset(
-        inputs, expected, linebreak='\r\n', record_defaults=record_defaults)
-
 
 class CsvDatasetBenchmark(test.Benchmark):
   """Benchmarks for the various ways of creating a dataset from CSV files.
@@ -521,7 +343,7 @@ class CsvDatasetBenchmark(test.Benchmark):
     self._filenames = []
     for n in self._num_cols:
       fn = os.path.join(self._temp_dir, 'file%d.csv' % n)
-      with open(fn, 'wb') as f:
+      with open(fn, 'w') as f:
         # Just write 100 rows and use `repeat`... Assumes the cost
         # of creating an iterator is not significant
         row = ','.join([str_val for _ in range(n)])
diff --git a/tensorflow/core/lib/strings/numbers.cc b/tensorflow/core/lib/strings/numbers.cc
index f18c6dc709..987e4fe733 100644
--- a/tensorflow/core/lib/strings/numbers.cc
+++ b/tensorflow/core/lib/strings/numbers.cc
@@ -345,19 +345,6 @@ bool safe_strtof(const char* str, float* value) {
   return processed_characters_count > 0;
 }
 
-bool safe_strtof(StringPiece str, float* value) {
-  int processed_characters_count = -1;
-  auto len = str.size();
-
-  // If string length exceeds buffer size or int max, fail.
-  if (len >= kFastToBufferSize) return false;
-  if (len > std::numeric_limits<int>::max()) return false;
-
-  *value = StringToFloatConverter().StringToFloat(
-      str.data(), static_cast<int>(len), &processed_characters_count);
-  return processed_characters_count > 0;
-}
-
 bool safe_strtod(const char* str, double* value) {
   int processed_characters_count = -1;
   auto len = str_util::Strnlen(str, kFastToBufferSize);
@@ -372,19 +359,6 @@ bool safe_strtod(const char* str, double* value) {
   return processed_characters_count > 0;
 }
 
-bool safe_strtod(StringPiece str, double* value) {
-  int processed_characters_count = -1;
-  auto len = str.size();
-
-  // If string length exceeds buffer size or int max, fail.
-  if (len >= kFastToBufferSize) return false;
-  if (len > std::numeric_limits<int>::max()) return false;
-
-  *value = StringToFloatConverter().StringToDouble(
-      str.data(), static_cast<int>(len), &processed_characters_count);
-  return processed_characters_count > 0;
-}
-
 size_t FloatToBuffer(float value, char* buffer) {
   // FLT_DIG is 6 for IEEE-754 floats, which are used on almost all
   // platforms these days.  Just in case some system exists where FLT_DIG
diff --git a/tensorflow/core/lib/strings/numbers.h b/tensorflow/core/lib/strings/numbers.h
index f62584dedb..9cb56415cb 100644
--- a/tensorflow/core/lib/strings/numbers.h
+++ b/tensorflow/core/lib/strings/numbers.h
@@ -116,14 +116,12 @@ bool safe_strtou64(StringPiece str, uint64* value);
 // Values may be rounded on over- and underflow.
 // Returns false on invalid input or if `strlen(value) >= kFastToBufferSize`.
 bool safe_strtof(const char* str, float* value);
-bool safe_strtof(StringPiece str, float* value);
 
 // Convert strings to double precision floating point values.
 // Leading and trailing spaces are allowed.
 // Values may be rounded on over- and underflow.
 // Returns false on invalid input or if `strlen(value) >= kFastToBufferSize`.
 bool safe_strtod(const char* str, double* value);
-bool safe_strtod(StringPiece str, double* value);
 
 inline bool ProtoParseNumeric(StringPiece s, int32* value) {
   return safe_strto32(s, value);
-- 
GitLab


From 3df9efb6fd65d7cf1249f9cad44c53d7f0a142b9 Mon Sep 17 00:00:00 2001
From: Allen Lavoie <allenl@google.com>
Date: Thu, 31 May 2018 19:03:21 -0700
Subject: [PATCH 146/610] Add a single positional argument mode for shape
 inference in subclassed Models.

Allows fit() when call's signature looks something like call(x, training=True).

The calling conventions are "inputs", single positional, and multiple positional. Right now the only distinction between the "inputs" and single positional calling conventions is the text of one error message. Both support shape inference (which simply hasn't been implemented for multiple positional input arguments yet).

PiperOrigin-RevId: 198815483
---
 tensorflow/python/keras/engine/base_layer.py  | 45 ++++++++++++++---
 tensorflow/python/keras/engine/network.py     | 50 ++++++++++++++++---
 tensorflow/python/keras/engine/training.py    | 27 ++++++----
 .../python/keras/model_subclassing_test.py    |  4 +-
 4 files changed, 98 insertions(+), 28 deletions(-)

diff --git a/tensorflow/python/keras/engine/base_layer.py b/tensorflow/python/keras/engine/base_layer.py
index 24716cfbe4..4814275fd5 100644
--- a/tensorflow/python/keras/engine/base_layer.py
+++ b/tensorflow/python/keras/engine/base_layer.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 import collections
+import enum  # pylint: disable=g-bad-import-order
 import inspect  # Necessary supplement to tf_inspect to deal with variadic args.
 
 import numpy as np
@@ -50,6 +51,20 @@ from tensorflow.python.util import tf_inspect
 from tensorflow.python.util.tf_export import tf_export
 
 
+class CallConvention(enum.Enum):
+  """Calling conventions for passing `Layer` inputs to `Layer.call`."""
+  # The Layer takes inputs as its first argument, named "inputs" for
+  # compatibility with the signature of Layer.__call__. This is the mode assumed
+  # for Layers which are not subclassed Models.
+  EXPLICIT_INPUTS_ARGUMENT = 1
+  # The Layer takes a single positional argument, not named "inputs". It's
+  # treated like an "inputs" argument.
+  SINGLE_POSITIONAL_ARGUMENT = 2
+  # The Layer has multiple positional arguments to which its inputs should be
+  # bound.
+  POSITIONAL_ARGUMENTS_ARE_INPUTS = 3
+
+
 @tf_export('keras.layers.Layer')
 class Layer(checkpointable.CheckpointableBase):
   """Base layer class.
@@ -149,7 +164,7 @@ class Layer(checkpointable.CheckpointableBase):
     self._call_fn_args = function_utils.fn_args(self.call)
     self._compute_previous_mask = ('mask' in self._call_fn_args or
                                    hasattr(self, 'compute_mask'))
-    self._uses_inputs_arg = True
+    self._call_convention = CallConvention.EXPLICIT_INPUTS_ARGUMENT
 
     # These lists will be filled via successive calls
     # to self._add_inbound_node().
@@ -793,12 +808,22 @@ class Layer(checkpointable.CheckpointableBase):
           pass  # C type such as dict. Masking not supported in this case.
 
   def _set_connectivity_metadata_(self, inputs, outputs, args, kwargs):
-    if args and getattr(self, '_uses_inputs_arg', True):
-      raise TypeError(
-          'This Layer takes an `inputs` argument to call(), and only the '
-          '`inputs` argument may be specified as a positional argument. '
-          'Pass everything else as a keyword argument (those arguments will'
-          ' not be tracked as inputs to the Layer).')
+    call_convention = getattr(self, '_call_convention',
+                              CallConvention.EXPLICIT_INPUTS_ARGUMENT)
+    if args:
+      if call_convention == CallConvention.EXPLICIT_INPUTS_ARGUMENT:
+        raise TypeError(
+            'This Layer takes an `inputs` argument to call(), and only the '
+            '`inputs` argument may be specified as a positional argument. '
+            'Pass everything else as a keyword argument (those arguments will'
+            ' not be tracked as inputs to the Layer).')
+      elif call_convention == CallConvention.SINGLE_POSITIONAL_ARGUMENT:
+        raise TypeError(
+            'This Layer takes a single positional argument to call(), which is '
+            'by convention the inputs argument, and only this argument may be '
+            'specified as a positional argument. Pass everything else as a '
+            'keyword argument (those arguments will not be tracked as inputs '
+            'to the Layer).')
 
     # If the layer returns tensors from its inputs, unmodified,
     # we copy them to avoid loss of tensor metadata.
@@ -834,7 +859,11 @@ class Layer(checkpointable.CheckpointableBase):
       A tuple of (inputs, non_input_kwargs). These may be the same objects as
       were passed in (call_args and call_kwargs).
     """
-    if getattr(self, '_uses_inputs_arg', True):
+    call_convention = getattr(self, '_call_convention',
+                              CallConvention.EXPLICIT_INPUTS_ARGUMENT)
+    if (call_convention in (
+        CallConvention.EXPLICIT_INPUTS_ARGUMENT,
+        CallConvention.SINGLE_POSITIONAL_ARGUMENT)):
       assert len(call_args) == 1  # TypeError raised earlier in __call__.
       return call_args[0], call_kwargs
     else:
diff --git a/tensorflow/python/keras/engine/network.py b/tensorflow/python/keras/engine/network.py
index f63ca1a207..d43aba6875 100644
--- a/tensorflow/python/keras/engine/network.py
+++ b/tensorflow/python/keras/engine/network.py
@@ -134,7 +134,7 @@ class Network(base_layer.Layer):
     self._in_progress_restore_finalizer = None
 
   def _init_graph_network(self, inputs, outputs, name=None):
-    self._uses_inputs_arg = True
+    self._call_convention = base_layer.CallConvention.EXPLICIT_INPUTS_ARGUMENT
     # Normalize and set self.inputs, self.outputs.
     if isinstance(inputs, (list, tuple)):
       self.inputs = list(inputs)  # Tensor or list of tensors.
@@ -294,19 +294,55 @@ class Network(base_layer.Layer):
   def _init_subclassed_network(self, name=None):
     self._base_init(name=name)
     self._is_graph_network = False
-    call_args = tf_inspect.getargspec(self.call).args
-    if 'training' in call_args:
+    call_argspec = tf_inspect.getargspec(self.call)
+    if 'training' in call_argspec.args:
       self._expects_training_arg = True
     else:
       self._expects_training_arg = False
-    if 'inputs' in call_args:
-      self._uses_inputs_arg = True
-    else:
-      self._uses_inputs_arg = False
+    self._call_convention = self._determine_call_convention(call_argspec)
     self.outputs = None
     self.inputs = None
     self.built = False
 
+  def _determine_call_convention(self, call_argspec):
+    """Decides how `self.call()` is invoked. See base_layer.CallConvention."""
+    if call_argspec.varargs:
+      may_take_single_argument = False
+    else:
+      try:
+        # Note: tf_inspect doesn't raise a TypeError when regular inspect would,
+        # so we need to keep in mind that "getcallargs" may have returned
+        # something even though we under-specified positional arguments.
+        all_args = tf_inspect.getcallargs(self.call, None)
+        self_args = set()
+        for arg_name, obj in all_args.items():
+          if obj is self:
+            self_args.add(arg_name)
+        may_take_single_argument = True
+      except TypeError:
+        may_take_single_argument = False
+    if may_take_single_argument:
+      # A single positional argument (plus "self") is considered equivalent to
+      # an "inputs" argument.
+      all_positional_args = len(call_argspec.args)
+      if call_argspec.defaults is not None:
+        all_positional_args -= len(call_argspec.defaults)
+      non_self_positional_args = all_positional_args
+      for positional_arg_name in call_argspec.args[:all_positional_args]:
+        if positional_arg_name in self_args:
+          non_self_positional_args -= 1
+      if non_self_positional_args == 1:
+        if 'inputs' in call_argspec.args[all_positional_args:]:
+          raise TypeError(
+              "Model.call() takes a single positional argument (to which "
+              "inputs are passed by convention) and a separate 'inputs' "
+              "argument. Unable to determine which arguments are inputs.")
+        return base_layer.CallConvention.SINGLE_POSITIONAL_ARGUMENT
+    if 'inputs' in call_argspec.args:
+      return base_layer.CallConvention.EXPLICIT_INPUTS_ARGUMENT
+    else:
+      return base_layer.CallConvention.POSITIONAL_ARGUMENTS_ARE_INPUTS
+
   def _track_layers(self, layers):
     """Add Checkpointable dependencies on a list of Layers."""
     weight_layer_index = 0
diff --git a/tensorflow/python/keras/engine/training.py b/tensorflow/python/keras/engine/training.py
index 6d625f16c2..04a2aa7664 100644
--- a/tensorflow/python/keras/engine/training.py
+++ b/tensorflow/python/keras/engine/training.py
@@ -31,12 +31,11 @@ from tensorflow.python.keras import backend as K
 from tensorflow.python.keras import losses
 from tensorflow.python.keras import metrics as metrics_module
 from tensorflow.python.keras import optimizers
+from tensorflow.python.keras.engine import base_layer
 from tensorflow.python.keras.engine import training_arrays
 from tensorflow.python.keras.engine import training_eager
 from tensorflow.python.keras.engine import training_generator
 from tensorflow.python.keras.engine import training_utils
-from tensorflow.python.keras.engine.base_layer import DeferredTensor
-from tensorflow.python.keras.engine.base_layer import Layer
 from tensorflow.python.keras.engine.network import Network
 from tensorflow.python.keras.utils.generic_utils import slice_arrays
 from tensorflow.python.ops import array_ops
@@ -523,7 +522,7 @@ class Model(Network):
 
             # Keep track of state updates created by
             # stateful metrics (i.e. metrics layers).
-            if isinstance(metric_fn, Layer) and metric_fn.stateful:
+            if isinstance(metric_fn, base_layer.Layer) and metric_fn.stateful:
               self.stateful_metric_names.append(metric_name)
               self.stateful_metric_functions.append(metric_fn)
               self.metrics_updates += metric_fn.updates
@@ -959,11 +958,17 @@ class Model(Network):
         whether to build the model's graph in inference mode (False), training
         mode (True), or using the Keras learning phase (None).
     """
-    if not getattr(self, '_uses_inputs_arg', True):
+    call_convention = getattr(
+        self,
+        '_call_convention',
+        base_layer.CallConvention.EXPLICIT_INPUTS_ARGUMENT)
+    if call_convention not in (
+        base_layer.CallConvention.EXPLICIT_INPUTS_ARGUMENT,
+        base_layer.CallConvention.SINGLE_POSITIONAL_ARGUMENT):
       raise NotImplementedError(
-          'Subclassed Models without "inputs" in their call() signatures do '
-          'not yet support shape inference. File a feature request if this '
-          'limitation bothers you.')
+          'Subclassed Models without "inputs" (or single positional arguments) '
+          'in their call() signatures do not yet support shape inference. File '
+          'a feature request if this limitation bothers you.')
     if self.__class__.__name__ == 'Sequential':
       # Note: we can't test whether the model is `Sequential` via `isinstance`
       # since `Sequential` depends on `Model`.
@@ -1020,11 +1025,11 @@ class Model(Network):
     else:
       dummy_output_values = [dummy_output_values]
     self.outputs = [
-        DeferredTensor(shape=(None for _ in v.shape),
-                       dtype=v.dtype) for v in dummy_output_values]
+        base_layer.DeferredTensor(shape=(None for _ in v.shape),
+                                  dtype=v.dtype) for v in dummy_output_values]
     self.inputs = [
-        DeferredTensor(shape=(None for _ in v.shape),
-                       dtype=v.dtype) for v in dummy_input_values]
+        base_layer.DeferredTensor(shape=(None for _ in v.shape),
+                                  dtype=v.dtype) for v in dummy_input_values]
     self.input_names = [
         'input_%d' % (i + 1) for i in range(len(dummy_input_values))]
     self.output_names = [
diff --git a/tensorflow/python/keras/model_subclassing_test.py b/tensorflow/python/keras/model_subclassing_test.py
index 86f7e20bec..8fb957da43 100644
--- a/tensorflow/python/keras/model_subclassing_test.py
+++ b/tensorflow/python/keras/model_subclassing_test.py
@@ -56,8 +56,8 @@ class SimpleTestModel(keras.Model):
     if self.use_bn:
       self.bn = keras.layers.BatchNormalization(axis=-1)
 
-  def call(self, inputs):
-    x = self.dense1(inputs)
+  def call(self, x):
+    x = self.dense1(x)
     if self.use_dp:
       x = self.dp(x)
     if self.use_bn:
-- 
GitLab


From 8d1d8c1b436b84eeaede95c6ed53308a8a97cb08 Mon Sep 17 00:00:00 2001
From: Peter Hawkins <phawkins@google.com>
Date: Thu, 31 May 2018 19:23:17 -0700
Subject: [PATCH 147/610] Disable
 tensorflow/contrib/stat_summarizer:stat_summarizer_test from continuous build
 due to flakiness.

PiperOrigin-RevId: 198817129
---
 tensorflow/contrib/stat_summarizer/BUILD | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/stat_summarizer/BUILD b/tensorflow/contrib/stat_summarizer/BUILD
index 30be14c10c..0b8fc0cdc6 100644
--- a/tensorflow/contrib/stat_summarizer/BUILD
+++ b/tensorflow/contrib/stat_summarizer/BUILD
@@ -31,5 +31,8 @@ tf_py_test(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:variables",
     ],
-    tags = ["no_windows"],
+    tags = [
+        "no_windows",
+        "notap",  # TODO(b/80546574): test is flaky
+    ],
 )
-- 
GitLab


From 19ab879e55e7e41923f7999d2f12793d849b24d0 Mon Sep 17 00:00:00 2001
From: Pete Warden <pete@petewarden.com>
Date: Thu, 31 May 2018 19:44:05 -0700
Subject: [PATCH 148/610] Manual roll back of PR #19443, because it causes the
 Raspberry Pi build to fail (#19678)

---
 tensorflow/core/platform/default/build_config.bzl | 5 +----
 tensorflow/tensorflow.bzl                         | 4 ++--
 tensorflow/tools/api/generator/BUILD              | 8 +-------
 3 files changed, 4 insertions(+), 13 deletions(-)

diff --git a/tensorflow/core/platform/default/build_config.bzl b/tensorflow/core/platform/default/build_config.bzl
index 365f12196f..b9eb3d02c5 100644
--- a/tensorflow/core/platform/default/build_config.bzl
+++ b/tensorflow/core/platform/default/build_config.bzl
@@ -73,10 +73,7 @@ def pyx_library(
         outs = [filename.split(".")[0] + ".cpp"],
         # Optionally use PYTHON_BIN_PATH on Linux platforms so that python 3
         # works. Windows has issues with cython_binary so skip PYTHON_BIN_PATH.
-        cmd = "PYTHONHASHSEED=0 " + select({
-            "@bazel_tools//src/conditions:windows": "",
-            "//conditions:default": "$${PYTHON_BIN_PATH} ",
-        }) + "$(location @cython//:cython_binary) --cplus $(SRCS) --output-file $(OUTS)",
+        cmd = "PYTHONHASHSEED=0 $(location @cython//:cython_binary) --cplus $(SRCS) --output-file $(OUTS)",
         tools = ["@cython//:cython_binary"] + pxd_srcs,
     )
 
diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl
index 2354b7021f..b59f8e1f98 100644
--- a/tensorflow/tensorflow.bzl
+++ b/tensorflow/tensorflow.bzl
@@ -1710,7 +1710,7 @@ def tf_version_info_genrule():
       ],
       outs=["util/version_info.cc"],
       cmd=
-      "$${PYTHON_BIN_PATH} $(location //tensorflow/tools/git:gen_git_source.py) --generate $(SRCS) \"$@\" --git_tag_override=$${GIT_TAG_OVERRIDE:-}",
+      "$(location //tensorflow/tools/git:gen_git_source.py) --generate $(SRCS) \"$@\" --git_tag_override=$${GIT_TAG_OVERRIDE:-}",
       local=1,
       tools=[clean_dep("//tensorflow/tools/git:gen_git_source.py")],)
 
@@ -1719,7 +1719,7 @@ def tf_py_build_info_genrule():
       name="py_build_info_gen",
       outs=["platform/build_info.py"],
       cmd=
-      "$${PYTHON_BIN_PATH} $(location //tensorflow/tools/build_info:gen_build_info.py) --raw_generate \"$@\" --build_config " + if_cuda("cuda", "cpu"),
+     "$(location //tensorflow/tools/build_info:gen_build_info.py) --raw_generate \"$@\" --build_config " + if_cuda("cuda", "cpu"),
       local=1,
       tools=[clean_dep("//tensorflow/tools/build_info:gen_build_info.py")],)
 
diff --git a/tensorflow/tools/api/generator/BUILD b/tensorflow/tools/api/generator/BUILD
index 3259406858..f46bb4b5fc 100644
--- a/tensorflow/tools/api/generator/BUILD
+++ b/tensorflow/tools/api/generator/BUILD
@@ -122,13 +122,7 @@ genrule(
         "api/user_ops/__init__.py",
         # END GENERATED FILES
     ],
-    # Optionally use PYTHON_BIN_PATH on Linux platforms so that python 3
-    # works. Windows has issues with the command so skip PYTHON_BIN_PATH
-    # for now.
-    cmd = select({
-        "@bazel_tools//src/conditions:windows": "",
-        "//conditions:default": "$${PYTHON_BIN_PATH} ",
-    }) + "$(location create_python_api) $(OUTS)",
+    cmd = "$(location create_python_api) $(OUTS)",
     tools = ["create_python_api"],
 )
 
-- 
GitLab


From ae3456402ca15309a2fcb85adbaa8b464ca2d065 Mon Sep 17 00:00:00 2001
From: Felix Abecassis <felix.abecassis@gmail.com>
Date: Fri, 1 Jun 2018 04:45:15 +0200
Subject: [PATCH 149/610] docker: update cuDNN to 7.1.4.18 (#19636)

Signed-off-by: Felix Abecassis <fabecassis@nvidia.com>
---
 tensorflow/tools/docker/Dockerfile.devel-gpu | 4 ++--
 tensorflow/tools/docker/Dockerfile.gpu       | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/tensorflow/tools/docker/Dockerfile.devel-gpu b/tensorflow/tools/docker/Dockerfile.devel-gpu
index 2fe47f3356..e4dcce9cdd 100644
--- a/tensorflow/tools/docker/Dockerfile.devel-gpu
+++ b/tensorflow/tools/docker/Dockerfile.devel-gpu
@@ -13,8 +13,8 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         cuda-cusparse-dev-9-0 \
         curl \
         git \
-        libcudnn7=7.0.5.15-1+cuda9.0 \
-        libcudnn7-dev=7.0.5.15-1+cuda9.0 \
+        libcudnn7=7.1.4.18-1+cuda9.0 \
+        libcudnn7-dev=7.1.4.18-1+cuda9.0 \
         libcurl3-dev \
         libfreetype6-dev \
         libhdf5-serial-dev \
diff --git a/tensorflow/tools/docker/Dockerfile.gpu b/tensorflow/tools/docker/Dockerfile.gpu
index bff4a20392..9197651ff4 100644
--- a/tensorflow/tools/docker/Dockerfile.gpu
+++ b/tensorflow/tools/docker/Dockerfile.gpu
@@ -12,7 +12,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         cuda-cusolver-9-0 \
         cuda-cusparse-9-0 \
         curl \
-        libcudnn7=7.0.5.15-1+cuda9.0 \
+        libcudnn7=7.1.4.18-1+cuda9.0 \
         libfreetype6-dev \
         libhdf5-serial-dev \
         libpng12-dev \
-- 
GitLab


From 3d199b64300dcc736b51d7c57cb21837da4d191b Mon Sep 17 00:00:00 2001
From: Michael Case <mikecase@google.com>
Date: Thu, 31 May 2018 19:46:48 -0700
Subject: [PATCH 150/610] Fix sanity issues.

---
 tensorflow/tools/api/generator/BUILD                | 1 -
 tensorflow/tools/api/generator/create_python_api.py | 6 +-----
 2 files changed, 1 insertion(+), 6 deletions(-)

diff --git a/tensorflow/tools/api/generator/BUILD b/tensorflow/tools/api/generator/BUILD
index 5a9eb44b32..f0c5877a90 100644
--- a/tensorflow/tools/api/generator/BUILD
+++ b/tensorflow/tools/api/generator/BUILD
@@ -24,4 +24,3 @@ py_test(
         "//tensorflow/python:client_testlib",
     ],
 )
-
diff --git a/tensorflow/tools/api/generator/create_python_api.py b/tensorflow/tools/api/generator/create_python_api.py
index 4f3ca06539..9f210ad42b 100644
--- a/tensorflow/tools/api/generator/create_python_api.py
+++ b/tensorflow/tools/api/generator/create_python_api.py
@@ -296,18 +296,14 @@ def create_api_files(
       continue
     contents = ''
     if module or not root_init_template:
-      contents = _GENERATED_FILE_HEADER + text
+      contents = _GENERATED_FILE_HEADER + text + _GENERATED_FILE_FOOTER
     else:
       # Read base init file
       with open(root_init_template, 'r') as root_init_template_file:
         contents = root_init_template_file.read()
         contents = contents.replace('# API IMPORTS PLACEHOLDER', text)
     with open(module_name_to_file_path[module], 'w') as fp:
-<<<<<<< HEAD
-      fp.write(_GENERATED_FILE_HEADER + text + _GENERATED_FILE_FOOTER)
-=======
       fp.write(contents)
->>>>>>> 2e272dbca6600991599e55a7ff7cfa668b8403aa
 
   if missing_output_files:
     raise ValueError(
-- 
GitLab


From 21d4931fd05eeab82250b256854deb20185a41d1 Mon Sep 17 00:00:00 2001
From: Michael Case <mikecase@google.com>
Date: Thu, 31 May 2018 20:44:41 -0700
Subject: [PATCH 151/610] Add new line to make buildifier happy

---
 tensorflow/tools/api/generator/BUILD | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/tools/api/generator/BUILD b/tensorflow/tools/api/generator/BUILD
index a6b9ea7c7c..f0c5877a90 100644
--- a/tensorflow/tools/api/generator/BUILD
+++ b/tensorflow/tools/api/generator/BUILD
@@ -23,4 +23,4 @@ py_test(
         ":create_python_api",
         "//tensorflow/python:client_testlib",
     ],
-)
\ No newline at end of file
+)
-- 
GitLab


From 1039ff9ee8c8c7ed09f9bb106131a50285866dd4 Mon Sep 17 00:00:00 2001
From: Jason Zaman <jasonzaman@gmail.com>
Date: Fri, 1 Jun 2018 11:52:17 +0800
Subject: [PATCH 152/610] BUILD: don't force stripping (#19599)

* BUILD: don't force stripping

Build systems must not strip binaries: doing so makes it impossible for
distros to ship debugging symbols for packages.

`bazel build` has a --strip option that lets the user generate stripped
binaries in a configurable way; that should be used instead.

https://fedoraproject.org/wiki/Packaging:Debuginfo
https://wiki.gentoo.org/wiki/Project:Quality_Assurance/Backtraces#Stripping

Signed-off-by: Jason Zaman <jason@perfinion.com>

* configure: add --strip=always to bazelrc
---
 configure.py     | 5 +++++
 tensorflow/BUILD | 4 +---
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/configure.py b/configure.py
index b6c32543cf..96caa2e2dd 100644
--- a/configure.py
+++ b/configure.py
@@ -1427,6 +1427,10 @@ def set_grpc_build_flags():
   write_to_bazelrc('build --define grpc_no_ares=true')
 
 
+def set_build_strip_flag():
+  write_to_bazelrc('build --strip=always')
+
+
 def set_windows_build_flags():
   if is_windows():
     # The non-monolithic build is not supported yet
@@ -1549,6 +1553,7 @@ def main():
 
   set_grpc_build_flags()
   set_cc_opt_flags(environ_cp)
+  set_build_strip_flag()
   set_windows_build_flags()
 
   if workspace_has_any_android_rule():
diff --git a/tensorflow/BUILD b/tensorflow/BUILD
index f2ad16fa04..f4351f9dce 100644
--- a/tensorflow/BUILD
+++ b/tensorflow/BUILD
@@ -471,7 +471,7 @@ tf_cc_shared_object(
 # excludes all but a subset of function names.
 # On MacOS, the linker does not support version_script, but has an
 # an "-exported_symbols_list" command.  -z defs disallows undefined
-# symbols in object files and -s strips the output.
+# symbols in object files.
 
 tf_cc_shared_object(
     name = "libtensorflow.so",
@@ -485,7 +485,6 @@ tf_cc_shared_object(
         "//tensorflow:windows_msvc": [],
         "//conditions:default": [
             "-z defs",
-            "-s",
             "-Wl,--version-script",  #  This line must be directly followed by the version_script.lds file
             "$(location //tensorflow/c:version_script.lds)",
         ],
@@ -511,7 +510,6 @@ tf_cc_shared_object(
         "//tensorflow:windows_msvc": [],
         "//conditions:default": [
             "-z defs",
-            "-s",
             "-Wl,--version-script",  #  This line must be directly followed by the version_script.lds file
             "$(location //tensorflow:tf_version_script.lds)",
         ],
-- 
GitLab


From 54b20c4be0372fb14ec9a289e4d7de7f67c03ff6 Mon Sep 17 00:00:00 2001
From: Rohan Jain <rohanj@google.com>
Date: Thu, 31 May 2018 20:54:27 -0700
Subject: [PATCH 153/610] Making sure that weight_collections are respected for
 shared_embedding_columns

PiperOrigin-RevId: 198823349
---
 .../python/feature_column/feature_column.py   | 11 ++++
 .../feature_column/feature_column_test.py     | 66 +++++++++++++++++++
 2 files changed, 77 insertions(+)

diff --git a/tensorflow/python/feature_column/feature_column.py b/tensorflow/python/feature_column/feature_column.py
index 7aa46af828..59801efc26 100644
--- a/tensorflow/python/feature_column/feature_column.py
+++ b/tensorflow/python/feature_column/feature_column.py
@@ -1799,6 +1799,15 @@ class _EmbeddingColumnLayer(base.Layer):
     self._initializer = initializer
     self._weight_collections = weight_collections
 
+  def set_weight_collections(self, weight_collections):
+    """Sets the weight collections for the layer.
+
+    Args:
+      weight_collections: A list of collection names to which the Variable will
+        be added.
+    """
+    self._weight_collections = weight_collections
+
   def build(self, _):
     self._embedding_weight_var = self.add_variable(
         name='embedding_weights',
@@ -2604,6 +2613,7 @@ class _SharedEmbeddingColumn(
       sparse_ids = sparse_tensors.id_tensor
       sparse_weights = sparse_tensors.weight_tensor
 
+      self._layer.set_weight_collections(weight_collections)
       embedding_weights = self._layer(
           None, scope=variable_scope.get_variable_scope())
       # If we're in graph mode and this is called with a different graph,
@@ -2612,6 +2622,7 @@ class _SharedEmbeddingColumn(
           ops.get_default_graph() !=
           _get_graph_for_variable(embedding_weights)):
         self._reset_config()
+        self._layer.set_weight_collections(weight_collections)
         embedding_weights = self._layer(
             None, scope=variable_scope.get_variable_scope())
 
diff --git a/tensorflow/python/feature_column/feature_column_test.py b/tensorflow/python/feature_column/feature_column_test.py
index 0af7b9baa9..627430d6bc 100644
--- a/tensorflow/python/feature_column/feature_column_test.py
+++ b/tensorflow/python/feature_column/feature_column_test.py
@@ -5615,6 +5615,72 @@ class SharedEmbeddingColumnTest(test.TestCase):
       self.assertAllEqual(expected_lookups_a, embedding_lookup_a.eval())
       self.assertAllEqual(expected_lookups_b, embedding_lookup_b.eval())
 
+  def test_get_dense_tensor_weight_collections(self):
+    # Inputs.
+    vocabulary_size = 3
+    # -1 values are ignored.
+    input_a = np.array([
+        [2, -1, -1],  # example 0, ids [2]
+        [0, 1, -1]
+    ])  # example 1, ids [0, 1]
+    input_b = np.array([
+        [0, -1, -1],  # example 0, ids [0]
+        [-1, -1, -1]
+    ])  # example 1, ids []
+    input_features = {'aaa': input_a, 'bbb': input_b}
+
+    # Embedding variable.
+    embedding_dimension = 2
+    embedding_values = (
+        (1., 2.),  # id 0
+        (3., 5.),  # id 1
+        (7., 11.)  # id 2
+    )
+
+    def _initializer(shape, dtype, partition_info):
+      self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
+      self.assertEqual(dtypes.float32, dtype)
+      self.assertIsNone(partition_info)
+      return embedding_values
+
+    # Expected lookup result, using combiner='mean'.
+    expected_lookups_a = (
+        # example 0:
+        (7., 11.),  # ids [2], embedding = [7, 11]
+        # example 1:
+        (2., 3.5),  # ids [0, 1], embedding = mean([1, 2] + [3, 5]) = [2, 3.5]
+    )
+    expected_lookups_b = (
+        # example 0:
+        (1., 2.),  # ids [0], embedding = [1, 2]
+        # example 1:
+        (0., 0.),  # ids [], embedding = [0, 0]
+    )
+
+    # Build columns.
+    categorical_column_a = fc.categorical_column_with_identity(
+        key='aaa', num_buckets=vocabulary_size)
+    categorical_column_b = fc.categorical_column_with_identity(
+        key='bbb', num_buckets=vocabulary_size)
+    embedding_column_a, embedding_column_b = fc.shared_embedding_columns(
+        [categorical_column_a, categorical_column_b],
+        dimension=embedding_dimension,
+        initializer=_initializer)
+
+    fc.input_layer(
+        input_features, [embedding_column_a, embedding_column_b],
+        weight_collections=('my_vars',))
+
+    # Assert expected embedding variable and lookups.
+    global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
+    self.assertItemsEqual(
+        ('input_layer/aaa_bbb_shared_embedding/embedding_weights:0',),
+        tuple(v.name for v in global_vars))
+    my_vars = ops.get_collection('my_vars')
+    self.assertItemsEqual(
+        ('input_layer/aaa_bbb_shared_embedding/embedding_weights:0',),
+        tuple(v.name for v in my_vars))
+
   def test_get_dense_tensor_placeholder_inputs(self):
     # Inputs.
     vocabulary_size = 3
-- 
GitLab


From 1acaca5c2b033f2d51f7d2e97da0511b04420f1d Mon Sep 17 00:00:00 2001
From: Michael Case <mikecase@google.com>
Date: Thu, 31 May 2018 21:55:11 -0700
Subject: [PATCH 154/610] Potential fix to layout_optimizer_test.py

---
 tensorflow/python/grappler/layout_optimizer_test.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/grappler/layout_optimizer_test.py b/tensorflow/python/grappler/layout_optimizer_test.py
index 2d6925d1a8..af5d709f7e 100644
--- a/tensorflow/python/grappler/layout_optimizer_test.py
+++ b/tensorflow/python/grappler/layout_optimizer_test.py
@@ -1389,7 +1389,7 @@ class LayoutOptimizerTest(test.TestCase):
       expected_num_transposes = 3
       self.assertEqual(expected_num_transposes, num_transposes)
       self._assert_trans_nhwc_to_nchw('map/while/Conv2D-0', nodes)
-      self._assert_trans_nchw_to_nhwc('map/while/Add-0-2', nodes)
+      self._assert_trans_nchw_to_nhwc('map/while/Add_1-0-2', nodes)
       self.assertAllClose(output_val_ref, output_val, atol=1e-3)
 
   def testLoopWithVecAnd4D(self):
@@ -1413,7 +1413,7 @@ class LayoutOptimizerTest(test.TestCase):
       expected_num_transposes = 2
       self.assertEqual(expected_num_transposes, num_transposes)
       self._assert_trans_nhwc_to_nchw('map/while/Conv2D-0', nodes)
-      self._assert_trans_nchw_to_nhwc('map/while/Add-0-2', nodes)
+      self._assert_trans_nchw_to_nhwc('map/while/Add_1-0-2', nodes)
       self.assertAllClose(output_val_ref, output_val, atol=1e-3)
 
   def testBinaryOpSecondPort(self):
-- 
GitLab


From 8f79ab773fe44e4779138a77a3bda4b18245d658 Mon Sep 17 00:00:00 2001
From: Michael Case <mikecase@google.com>
Date: Thu, 31 May 2018 22:55:46 -0700
Subject: [PATCH 155/610] Fix import depth issue.

---
 tensorflow/contrib/lite/python/lite.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/lite/python/lite.py b/tensorflow/contrib/lite/python/lite.py
index 0fc7958d41..d595415b63 100644
--- a/tensorflow/contrib/lite/python/lite.py
+++ b/tensorflow/contrib/lite/python/lite.py
@@ -33,7 +33,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import six
+from six import PY3
 
 from google.protobuf import text_format as _text_format
 from google.protobuf.message import DecodeError
@@ -192,7 +192,7 @@ class TocoConverter(object):
           print("Ignore 'tcmalloc: large alloc' warnings.")
 
           if not isinstance(file_content, str):
-            if six.PY3:
+            if PY3:
               file_content = file_content.decode('utf-8')
             else:
               file_content = file_content.encode('utf-8')
-- 
GitLab


From 961a39346d8be33cff473f1e81498b887c155070 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 1 Jun 2018 00:18:19 -0700
Subject: [PATCH 156/610] Unify error handling in CudnnSupport.

PiperOrigin-RevId: 198836479
---
 tensorflow/stream_executor/cuda/cuda_dnn.cc  | 2902 ++++++++----------
 tensorflow/stream_executor/cuda/cuda_dnn.h   |  128 +-
 tensorflow/stream_executor/cuda/cuda_timer.h |    3 +-
 tensorflow/stream_executor/dnn.cc            |    4 +
 tensorflow/stream_executor/dnn.h             |    5 +-
 5 files changed, 1354 insertions(+), 1688 deletions(-)

diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc
index c2c0c283b3..55c1083a61 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.cc
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include <functional>
 #include <memory>
+#include <utility>
 
 #include "third_party/eigen3/Eigen/Core"
 #include "tensorflow/core/lib/core/errors.h"
@@ -55,6 +56,33 @@ namespace {
 
 static_assert(CUDNN_VERSION >= 6000, "cuDNN needs to be version 6.0 or higher");
 
+// Exits the program if 'expr' doesn't return CUDNN_STATUS_SUCCESS.
+#define CHECK_CUDNN_OK(expr) CHECK_EQ(expr, CUDNN_STATUS_SUCCESS)
+
+// If 'expr' doesn't return CUDNN_STATUS_SUCCESS, returns from the current
+// function with a non-successful port::Status.
+#define RETURN_IF_CUDNN_ERROR(expr)                                      \
+  do {                                                                   \
+    cudnnStatus_t _status = expr;                                        \
+    if (!SE_PREDICT_TRUE(_status == CUDNN_STATUS_SUCCESS)) {             \
+      std::ostringstream oss;                                            \
+      oss << ToString(_status) << "\nin " << __FILE__ << "(" << __LINE__ \
+          << "): '" << #expr << "'";                                     \
+      return port::Status(port::error::UNKNOWN, oss.str().c_str());      \
+    }                                                                    \
+  } while (false)
+
+// Returns whether status is 'ok'; on failure, logs it if report_error is set.
+bool IsStatusOk(const port::Status& status, bool report_error) {
+  if (status.ok()) {
+    return true;
+  }
+  if (report_error) {
+    LOG(ERROR) << status.error_message();
+  }
+  return false;
+}
+
 // Converts (via narrowing) a type T value to a type U, and checks that the
 // value has no value change due to the conversion.
 template <typename WideT, typename NarrowT>
@@ -89,26 +117,20 @@ string ToString(cudnnStatus_t status) {
       return "CUDNN_STATUS_NOT_SUPPORTED";
     case CUDNN_STATUS_LICENSE_ERROR:
       return "CUDNN_STATUS_LICENSE_ERROR";
+    case CUDNN_STATUS_RUNTIME_PREREQUISITE_MISSING:
+      return "CUDNN_STATUS_RUNTIME_PREREQUISITE_MISSING";
+#if CUDNN_VERSION >= 7000
+    case CUDNN_STATUS_RUNTIME_IN_PROGRESS:
+      return "CUDNN_STATUS_RUNTIME_IN_PROGRESS";
+    case CUDNN_STATUS_RUNTIME_FP_OVERFLOW:
+      return "CUDNN_STATUS_RUNTIME_FP_OVERFLOW";
+#endif
     default:
       return port::StrCat("<unknown cudnn status: ", static_cast<int>(status),
                           ">");
   }
 }
 
-string ToString(libraryPropertyType type) {
-  switch (type) {
-    case MAJOR_VERSION:
-      return "MAJOR_VERSION";
-    case MINOR_VERSION:
-      return "MINOR_VERSION";
-    case PATCH_LEVEL:
-      return "PATCH_LEVEL";
-    default:
-      return port::StrCat(
-          "<unknown libraryPropertyType: ", static_cast<int>(type), ">");
-  }
-}
-
 template <typename T>
 cudnnDataType_t GetCudnnDataType();
 
@@ -150,9 +172,9 @@ class CudnnHandle {
 
 }  // namespace
 
-// Wraps a cuDNN handle and provides access to it through CudnnHandle instances,
-// which also locks a mutex, acquires the CUDA context, and sets the stream
-// that cuDNN should use to enqueue any work.
+// Wraps a cuDNN handle and provides access to it through CudnnHandle
+// instances, which also locks a mutex, acquires the CUDA context, and sets
+// the stream that cuDNN should use to enqueue any work.
 //
 // Note: CudnnSupport::cudnn_ should be the only instantiation of this class.
 class CudnnAccess {
@@ -167,13 +189,13 @@ class CudnnAccess {
 
   // Creates a CudnnHandle instance for stream.
   //
-  // cuDNN API calls using the same handle instance need to be serialized across
-  // threads. This is guaranteed by CudnnHandle instances locking the mutex
-  // owned by this class.
+  // cuDNN API calls using the same handle instance need to be serialized
+  // across threads. This is guaranteed by CudnnHandle instances locking the
+  // mutex owned by this class.
   //
   // Most cuDNN APIs taking a handle perform work on a CUDA stream. The
-  // CudnnHandle instance acquires the executor's CUDA context and sets cuDNN to
-  // use the provided stream.
+  // CudnnHandle instance acquires the executor's CUDA context and sets cuDNN
+  // to use the provided stream.
   //
   // The stream argument may be null, which translates to the legacy default
   // stream. See
@@ -187,7 +209,6 @@ class CudnnAccess {
     CUstream cu_stream = stream ? AsCUDAStreamValue(stream) : cudaStreamLegacy;
     auto status = cudnnSetStream(handle_, cu_stream);
     CHECK_EQ(status, CUDNN_STATUS_SUCCESS) << "Failed to set cuDNN stream.";
-    using my_mutex_lock = mutex_lock;
     return CudnnHandle(std::move(context), std::move(lock), handle_);
   }
 
@@ -201,6 +222,8 @@ class CudnnAccess {
 
 namespace {
 
+// A helper function to return the internal compute type for
+// RNNs in cudnn.
 cudnnDataType_t GetRnnComputeType(dnn::DataType data_type);
 
 cudnnConvolutionFwdAlgo_t ToConvForwardAlgo(dnn::AlgorithmDesc algorithm) {
@@ -264,16 +287,10 @@ cudnnConvolutionBwdFilterAlgo_t ToConvBackwardFilterAlgo(
   }
 }
 
-port::Status GetCudnnProperty(libraryPropertyType type, int* value) {
-  cudnnStatus_t status = cudnnGetProperty(type, value);
-  if (status != CUDNN_STATUS_SUCCESS) {
-    const string error =
-        port::StrCat("cudnnGetProperty failed for type: ", ToString(type),
-                     " with status: ", ToString(status));
-    LOG(ERROR) << error;
-    return port::Status(port::error::INTERNAL, error);
-  }
-  return port::Status::OK();
+port::StatusOr<int> GetCudnnProperty(libraryPropertyType type) {
+  int value;
+  RETURN_IF_CUDNN_ERROR(cudnnGetProperty(type, &value));
+  return value;
 }
 
 cudnnRNNAlgo_t ToCudnnRNNAlgo(const dnn::AlgorithmDesc& algorithm) {
@@ -294,9 +311,9 @@ cudnnRNNAlgo_t ToCudnnRNNAlgo(const dnn::AlgorithmDesc& algorithm) {
 }
 
 port::Status GetLoadedCudnnVersion(CudnnVersion* version) {
-  TF_RETURN_IF_ERROR(GetCudnnProperty(MAJOR_VERSION, &version->major_version));
-  TF_RETURN_IF_ERROR(GetCudnnProperty(MINOR_VERSION, &version->minor_version));
-  TF_RETURN_IF_ERROR(GetCudnnProperty(PATCH_LEVEL, &version->patch_level));
+  SE_ASSIGN_OR_RETURN(version->major_version, GetCudnnProperty(MAJOR_VERSION));
+  SE_ASSIGN_OR_RETURN(version->minor_version, GetCudnnProperty(MINOR_VERSION));
+  SE_ASSIGN_OR_RETURN(version->patch_level, GetCudnnProperty(PATCH_LEVEL));
   return port::Status::OK();
 }
 
@@ -319,9 +336,11 @@ port::Status CudnnSupport::Init() {
           ".  CuDNN library major and minor version needs to match or have "
           "higher minor version in case of CuDNN 7.0 or later version. If "
           "using a binary install, upgrade your CuDNN library.  If building "
-          "from sources, make sure the library loaded at runtime is compatible "
+          "from sources, make sure the library loaded at runtime is "
+          "compatible "
           "with the version specified during compile configuration.");
       LOG(ERROR) << error;
+      cudnnDestroy(cudnn_handle);
       return port::Status(port::error::INTERNAL, error);
     }
 
@@ -329,23 +348,17 @@ port::Status CudnnSupport::Init() {
     return port::Status::OK();
   }
 
-  LOG(ERROR) << "could not create cudnn handle: " << ToString(status);
+  CHECK_EQ(cudnn_handle, nullptr);
+  LOG(ERROR) << "Could not create cudnn handle: " << ToString(status);
   if (status == CUDNN_STATUS_NOT_INITIALIZED) {
     auto result = cuda::Diagnostician::FindKernelDriverVersion();
     if (!result.ok()) {
-      LOG(ERROR) << "error retrieving driver version: "
+      LOG(ERROR) << "Error retrieving driver version: "
                  << DriverVersionStatusToString(result);
     } else {
       const auto& version = result.ValueOrDie();
-      LOG(ERROR) << "possibly insufficient driver version: "
+      LOG(ERROR) << "Possibly insufficient driver version: "
                  << DriverVersionToString(version);
-      // OS X kernel driver does not report version accurately
-#if !defined(__APPLE__)
-      if (std::get<0>(version) < 340) {
-        LOG(ERROR)
-            << "cudnn library is only supported on 340.XX+ driver versions";
-      }
-#endif
     }
   }
 
@@ -364,18 +377,129 @@ CudnnSupport::GetVersion() {
 
 namespace {
 
-// Turns a BatchDescriptor structure into a cudnn tensor handle within a scope.
+// Deleter functors that destroy cuDNN descriptors and persistent RNN plans.
+struct TensorDescriptorDeleter {
+  void operator()(cudnnTensorDescriptor_t descriptor) const {
+    CHECK_CUDNN_OK(cudnnDestroyTensorDescriptor(descriptor));
+  }
+};
+struct FilterDescriptorDeleter {
+  void operator()(cudnnFilterDescriptor_t descriptor) const {
+    CHECK_CUDNN_OK(cudnnDestroyFilterDescriptor(descriptor));
+  }
+};
+struct ConvolutionDescriptorDeleter {
+  void operator()(cudnnConvolutionDescriptor_t descriptor) const {
+    CHECK_CUDNN_OK(cudnnDestroyConvolutionDescriptor(descriptor));
+  }
+};
+struct PoolingDescriptorDeleter {
+  void operator()(cudnnPoolingDescriptor_t descriptor) const {
+    CHECK_CUDNN_OK(cudnnDestroyPoolingDescriptor(descriptor));
+  }
+};
+struct LrnDescriptorDeleter {
+  void operator()(cudnnLRNDescriptor_t descriptor) const {
+    CHECK_CUDNN_OK(cudnnDestroyLRNDescriptor(descriptor));
+  }
+};
+
+struct ActivationDescriptorDeleter {
+  void operator()(cudnnActivationDescriptor_t descriptor) const {
+    CHECK_CUDNN_OK(cudnnDestroyActivationDescriptor(descriptor));
+  }
+};
+struct DropoutDescriptorDeleter {
+  void operator()(cudnnDropoutDescriptor_t descriptor) const {
+    CHECK_CUDNN_OK(cudnnDestroyDropoutDescriptor(descriptor));
+  }
+};
+struct RnnDescriptorDeleter {
+  void operator()(cudnnRNNDescriptor_t descriptor) const {
+    CHECK_CUDNN_OK(cudnnDestroyRNNDescriptor(descriptor));
+  }
+};
+struct PersistentRnnPlanDeleter {
+  void operator()(cudnnPersistentRNNPlan_t plan) const {
+    CHECK_CUDNN_OK(cudnnDestroyPersistentRNNPlan(plan));
+  }
+};
+
+// RAII wrappers for cuDNN types.
+using TensorDescriptor =
+    std::unique_ptr<cudnnTensorStruct, TensorDescriptorDeleter>;
+using FilterDescriptor =
+    std::unique_ptr<cudnnFilterStruct, FilterDescriptorDeleter>;
+using ConvolutionDescriptor =
+    std::unique_ptr<cudnnConvolutionStruct, ConvolutionDescriptorDeleter>;
+using PoolingDescriptor =
+    std::unique_ptr<cudnnPoolingStruct, PoolingDescriptorDeleter>;
+using LrnDescriptor = std::unique_ptr<cudnnLRNStruct, LrnDescriptorDeleter>;
+using ActivationDescriptor =
+    std::unique_ptr<cudnnActivationStruct, ActivationDescriptorDeleter>;
+using DropoutDescriptor =
+    std::unique_ptr<cudnnDropoutStruct, DropoutDescriptorDeleter>;
+using RnnDescriptor = std::unique_ptr<cudnnRNNStruct, RnnDescriptorDeleter>;
+using PersistentRnnPlan =
+    std::unique_ptr<cudnnPersistentRNNPlan, PersistentRnnPlanDeleter>;
+
+// Factory functions for the RAII wrappers above; CHECK-fail on cuDNN error.
+TensorDescriptor CreateTensorDescriptor() {
+  cudnnTensorDescriptor_t result;
+  CHECK_CUDNN_OK(cudnnCreateTensorDescriptor(&result));
+  return TensorDescriptor(result);
+}
+FilterDescriptor CreateFilterDescriptor() {
+  cudnnFilterDescriptor_t result;
+  CHECK_CUDNN_OK(cudnnCreateFilterDescriptor(&result));
+  return FilterDescriptor(result);
+}
+ConvolutionDescriptor CreateConvolutionDescriptor() {
+  cudnnConvolutionDescriptor_t result;
+  CHECK_CUDNN_OK(cudnnCreateConvolutionDescriptor(&result));
+  return ConvolutionDescriptor(result);
+}
+PoolingDescriptor CreatePoolingDescriptor() {
+  cudnnPoolingDescriptor_t result;
+  CHECK_CUDNN_OK(cudnnCreatePoolingDescriptor(&result));
+  return PoolingDescriptor(result);
+}
+LrnDescriptor CreateLrnDescriptor() {
+  cudnnLRNDescriptor_t result;
+  CHECK_CUDNN_OK(cudnnCreateLRNDescriptor(&result));
+  return LrnDescriptor(result);
+}
+ActivationDescriptor CreateActivationDescriptor() {
+  cudnnActivationDescriptor_t result;
+  CHECK_CUDNN_OK(cudnnCreateActivationDescriptor(&result));
+  return ActivationDescriptor(result);
+}
+DropoutDescriptor CreateDropoutDescriptor() {
+  cudnnDropoutDescriptor_t result;
+  CHECK_CUDNN_OK(cudnnCreateDropoutDescriptor(&result));
+  return DropoutDescriptor(result);
+}
+RnnDescriptor CreateRnnDescriptor() {
+  cudnnRNNDescriptor_t result;
+  CHECK_CUDNN_OK(cudnnCreateRNNDescriptor(&result));
+  return RnnDescriptor(result);
+}
+PersistentRnnPlan CreatePersistentRnnPlan(cudnnRNNDescriptor_t rnn_desc,
+                                          int batch_size,
+                                          cudnnDataType_t data_type) {
+  cudnnPersistentRNNPlan_t result;
+  CHECK_CUDNN_OK(
+      cudnnCreatePersistentRNNPlan(rnn_desc, batch_size, data_type, &result));
+  return PersistentRnnPlan(result);
+}
+
+// Turns a BatchDescriptor structure into a cudnn tensor handle within a
+// scope.
 class ScopedTensorDescriptor {
  public:
   ScopedTensorDescriptor(const dnn::BatchDescriptor& batch_descriptor,
                          cudnnDataType_t elem_type)
-      : handle_(nullptr) {
-    cudnnStatus_t status = cudnnCreateTensorDescriptor(&handle_);
-    if (status != CUDNN_STATUS_SUCCESS) {
-      LOG(FATAL) << "could not create cudnn tensor descriptor: "
-                 << ToString(status);
-    }
-
+      : handle_(CreateTensorDescriptor()) {
     switch (batch_descriptor.layout()) {
       case dnn::DataLayout::kBatchYXDepth:
       case dnn::DataLayout::kBatchDepthYX: {
@@ -393,25 +517,16 @@ class ScopedTensorDescriptor {
                        &CheckedNarrowing<int64, int>);
         std::transform(dims64.cbegin(), dims64.cend(), dims.begin(),
                        &CheckedNarrowing<int64, int>);
-        status = cudnnSetTensorNdDescriptor(handle_, elem_type, nd, dims.data(),
-                                            strides.data());
-
-        if (status != CUDNN_STATUS_SUCCESS) {
-          LOG(FATAL) << "could not convert BatchDescriptor "
-                     << batch_descriptor.ToString()
-                     << " to cudnn tensor descriptor: " << ToString(status);
-        }
+        CHECK_CUDNN_OK(cudnnSetTensorNdDescriptor(handle_.get(), elem_type, nd,
+                                                  dims.data(), strides.data()))
+            << "batch_descriptor: " << batch_descriptor.ToString();
       } break;
       case dnn::DataLayout::kBatchDepthYX4: {
-        status = cudnnSetTensor4dDescriptor(
-            handle_, CUDNN_TENSOR_NCHW_VECT_C, elem_type,
+        CHECK_CUDNN_OK(cudnnSetTensor4dDescriptor(
+            handle_.get(), CUDNN_TENSOR_NCHW_VECT_C, elem_type,
             batch_descriptor.count(), batch_descriptor.feature_map_count(),
-            batch_descriptor.height(), batch_descriptor.width());
-        if (status != CUDNN_STATUS_SUCCESS) {
-          LOG(FATAL) << "could not convert BatchDescriptor "
-                     << batch_descriptor.ToString()
-                     << " to cudnn tensor descriptor: " << ToString(status);
-        }
+            batch_descriptor.height(), batch_descriptor.width()))
+            << "batch_descriptor: " << batch_descriptor.ToString();
       } break;
       default:
         LOG(FATAL) << "Unsupported tensor format "
@@ -420,37 +535,24 @@ class ScopedTensorDescriptor {
     }
   }
 
-  ~ScopedTensorDescriptor() {
-    cudnnStatus_t status = cudnnDestroyTensorDescriptor(handle_);
-    if (status != CUDNN_STATUS_SUCCESS) {
-      LOG(ERROR) << "could not destroy cudnn tensor descriptor: "
-                 << ToString(status);
-    }
-  }
-
-  cudnnTensorDescriptor_t handle() const { return handle_; }
+  cudnnTensorDescriptor_t handle() const { return handle_.get(); }
 
  private:
-  cudnnTensorDescriptor_t handle_;  // Owned.
+  TensorDescriptor handle_;
 
   SE_DISALLOW_COPY_AND_ASSIGN(ScopedTensorDescriptor);
 };
 
-// Turns a FilterDescriptor structure into a cudnn filter handle within a scope.
+// Turns a FilterDescriptor structure into a cudnn filter handle within a
+// scope.
 class ScopedFilterDescriptor {
  public:
   ScopedFilterDescriptor(const dnn::FilterDescriptor& filter_descriptor,
                          cudnnDataType_t elem_type)
-      : handle_(nullptr) {
-    cudnnStatus_t status = cudnnCreateFilterDescriptor(&handle_);
-    if (status != CUDNN_STATUS_SUCCESS) {
-      LOG(FATAL) << "could not create cudnn filter descriptor: "
-                 << ToString(status);
-    }
-
+      : handle_(CreateFilterDescriptor()) {
     // TODO(b/23032134): Even if the filter layout is not supported,
-    // cudnnSetFilter4DDescriptor_v4 will return CUDNN_STATUS_SUCCESS because it
-    // does not take layout as an input. Maybe force cuDNN by giving wrong
+    // cudnnSetFilter4DDescriptor_v4 will return CUDNN_STATUS_SUCCESS because
+    // it does not take layout as an input. Maybe force cuDNN by giving wrong
     // inputs intentionally?
     cudnnTensorFormat_t format;
     switch (filter_descriptor.layout()) {
@@ -475,32 +577,20 @@ class ScopedFilterDescriptor {
     const auto& spatial_dims = filter_descriptor.input_filter_dims();
     std::copy(spatial_dims.begin(), spatial_dims.end(), dims.begin() + 2);
 
-    status = cudnnSetFilterNdDescriptor(handle_, elem_type, format, dims.size(),
-                                        dims.data());
-    if (status != CUDNN_STATUS_SUCCESS) {
-      LOG(FATAL) << "could not set cudnn filter descriptor: "
-                 << ToString(status);
-    }
+    CHECK_CUDNN_OK(cudnnSetFilterNdDescriptor(handle_.get(), elem_type, format,
+                                              dims.size(), dims.data()));
   }
 
-  ~ScopedFilterDescriptor() {
-    cudnnStatus_t status = cudnnDestroyFilterDescriptor(handle_);
-    if (status != CUDNN_STATUS_SUCCESS) {
-      LOG(ERROR) << "could not destroy cudnn filter descriptor: "
-                 << ToString(status);
-    }
-  }
-
-  cudnnFilterDescriptor_t handle() const { return handle_; }
+  cudnnFilterDescriptor_t handle() const { return handle_.get(); }
 
  private:
-  cudnnFilterDescriptor_t handle_;  // Owned.
+  FilterDescriptor handle_;  // Owned.
 
   SE_DISALLOW_COPY_AND_ASSIGN(ScopedFilterDescriptor);
 };
 
 // A helper function to decide whether to enable the TENSOR_OP_MATH math type
-static bool TensorOpMathEnabled() {
+bool TensorOpMathEnabled() {
   static bool is_enabled = [] {
     bool is_disabled = false;
     TF_CHECK_OK(
@@ -513,7 +603,7 @@ static bool TensorOpMathEnabled() {
 
 // A helper function to decide whether to enable the TENSOR_OP_MATH math type
 // for RNNs.
-static bool RnnTensorOpMathEnabled() {
+bool RnnTensorOpMathEnabled() {
   static bool is_enabled = [] {
     bool is_disabled = false;
     TF_CHECK_OK(
@@ -524,15 +614,16 @@ static bool RnnTensorOpMathEnabled() {
   return is_enabled;
 }
 
-// A helper function to decide whether to use CUDNN_BATCHNORM_SPATIAL_PERSISTENT
-// in batchnorm. This mode can be faster in some tasks because an optimized path
-// may be selected for CUDNN_DATA_FLOAT and CUDNN_DATA_HALF data types, compute
-// capability 6.0 or higher. The reason we set it to false by default is that
-// this mode may use scaled atomic integer reduction that may cause a numerical
-// overflow for certain input data range.
+// A helper function to decide whether to use
+// CUDNN_BATCHNORM_SPATIAL_PERSISTENT in batchnorm. This mode can be faster in
+// some tasks because an optimized path may be selected for CUDNN_DATA_FLOAT
+// and CUDNN_DATA_HALF data types, compute capability 6.0 or higher. The
+// reason we set it to false by default is that this mode may use scaled
+// atomic integer reduction that may cause a numerical overflow for certain
+// input data range.
 // TODO(yangzihao): Use autotune to choose between this mode and
 // CUDNN_BATCHNORM_SPATIAL mode.
-static bool BatchnormSpatialPersistentEnabled() {
+bool BatchnormSpatialPersistentEnabled() {
   static bool is_enabled = [] {
     bool is_enabled = false;
     TF_CHECK_OK(tensorflow::ReadBoolFromEnvVar(
@@ -550,19 +641,13 @@ class ScopedConvolutionDescriptor {
   ScopedConvolutionDescriptor(
       const dnn::ConvolutionDescriptor& convolution_descriptor,
       cudnnDataType_t data_type)
-      : handle_(nullptr) {
-    cudnnStatus_t status = cudnnCreateConvolutionDescriptor(&handle_);
-    if (status != CUDNN_STATUS_SUCCESS) {
-      LOG(FATAL) << "could not create cudnn convolution descriptor: "
-                 << ToString(status);
-    }
+      : handle_(CreateConvolutionDescriptor()) {
     const auto& strides64 = convolution_descriptor.strides();
     const auto& padding64 = convolution_descriptor.padding();
     const auto& dilations64 = convolution_descriptor.dilations();
-    if (convolution_descriptor.pad_alignment() ==
-        dnn::PadAlignment::kTensorFlowPadding) {
-      LOG(ERROR) << "TensorFlow padding alignment is not supported.";
-    }
+    CHECK_NE(convolution_descriptor.pad_alignment(),
+             dnn::PadAlignment::kTensorFlowPadding)
+        << "TensorFlow padding alignment is not supported.";
 
     // cuDNN requires arrays of ints.
     std::vector<int> strides(convolution_descriptor.ndims());
@@ -577,18 +662,14 @@ class ScopedConvolutionDescriptor {
     std::transform(dilations64.cbegin(), dilations64.cend(), dilations.begin(),
                    &CheckedNarrowing<int64, int>);
 
-    status = cudnnSetConvolutionNdDescriptor(
-        handle_, convolution_descriptor.ndims(), padding.data(), strides.data(),
-        dilations.data(),
+    CHECK_CUDNN_OK(cudnnSetConvolutionNdDescriptor(
+        handle_.get(), convolution_descriptor.ndims(), padding.data(),
+        strides.data(), dilations.data(),
         // NOTE(keveman): cuDNN supports convolution and cross correlation.
         // However, almost all the use cases do cross correlation, so just
         // hard coding it here.
-        CUDNN_CROSS_CORRELATION, data_type);
+        CUDNN_CROSS_CORRELATION, data_type));
 
-    if (status != CUDNN_STATUS_SUCCESS) {
-      LOG(FATAL) << "could not set cudnn convolution descriptor: "
-                 << ToString(status);
-    }
     // NOTE(benbarsdell): This only applies if tensor op math is enabled
     //                      and algo selection is set to Default.
     this->set_use_tensor_op_math(true);
@@ -596,44 +677,28 @@ class ScopedConvolutionDescriptor {
 #if CUDNN_MAJOR >= 7
     VLOG(2) << "Requesting grouped convolution: "
             << convolution_descriptor.group_count();
-    status = cudnnSetConvolutionGroupCount(
-        handle_, convolution_descriptor.group_count());
-    if (status != CUDNN_STATUS_SUCCESS) {
-      LOG(FATAL) << "could not set cudnn convolution group count: "
-                 << ToString(status);
-    }
+    CHECK_CUDNN_OK(cudnnSetConvolutionGroupCount(
+        handle_.get(), convolution_descriptor.group_count()));
 #else
     CHECK_EQ(convolution_descriptor.group_count(), 1)
         << "Requested grouped convolution for cuDNN version < 7";
 #endif
   }
 
-  void set_use_tensor_op_math(bool use_tensor_op_math) {
+  void set_use_tensor_op_math(bool use_tensor_op_math) const {
 #if CUDNN_VERSION >= 7000
     cudnnMathType_t math_type =
         (use_tensor_op_math ? CUDNN_TENSOR_OP_MATH : CUDNN_DEFAULT_MATH);
     if (TensorOpMathEnabled()) {
-      cudnnStatus_t status = cudnnSetConvolutionMathType(handle_, math_type);
-      if (status != CUDNN_STATUS_SUCCESS) {
-        LOG(FATAL) << "could not set cudnn convolution math type: "
-                   << ToString(status);
-      }
+      CHECK_CUDNN_OK(cudnnSetConvolutionMathType(handle_.get(), math_type));
     }
 #endif
   }
 
-  ~ScopedConvolutionDescriptor() {
-    cudnnStatus_t status = cudnnDestroyConvolutionDescriptor(handle_);
-    if (status != CUDNN_STATUS_SUCCESS) {
-      LOG(ERROR) << "could not destroy cudnn convolution descriptor: "
-                 << ToString(status);
-    }
-  }
-
-  cudnnConvolutionDescriptor_t handle() const { return handle_; }
+  cudnnConvolutionDescriptor_t handle() const { return handle_.get(); }
 
  private:
-  cudnnConvolutionDescriptor_t handle_;  // Owned.
+  ConvolutionDescriptor handle_;  // Owned.
 
   SE_DISALLOW_COPY_AND_ASSIGN(ScopedConvolutionDescriptor);
 };
@@ -644,12 +709,7 @@ class ScopedPoolingDescriptor {
  public:
   explicit ScopedPoolingDescriptor(
       const dnn::PoolingDescriptor& pooling_descriptor)
-      : handle_(nullptr) {
-    cudnnStatus_t status = cudnnCreatePoolingDescriptor(&handle_);
-    if (status != CUDNN_STATUS_SUCCESS) {
-      LOG(FATAL) << "could not create cudnn pooling descriptor: "
-                 << ToString(status);
-    }
+      : handle_(CreatePoolingDescriptor()) {
     const std::vector<int64> strides64 = pooling_descriptor.strides();
     const std::vector<int64> padding64 = pooling_descriptor.padding();
     const std::vector<int64> shape64 = pooling_descriptor.window();
@@ -665,30 +725,19 @@ class ScopedPoolingDescriptor {
     std::transform(shape64.cbegin(), shape64.cend(), shape.begin(),
                    &CheckedNarrowing<int64, int>);
     bool propagate_nans = pooling_descriptor.propagate_nans();
-    status = cudnnSetPoolingNdDescriptor(
-        handle_,
+    CHECK_CUDNN_OK(cudnnSetPoolingNdDescriptor(
+        handle_.get(),
         (pooling_descriptor.mode() == dnn::PoolingMode::kMaximum
              ? CUDNN_POOLING_MAX
              : CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING),
         propagate_nans ? CUDNN_PROPAGATE_NAN : CUDNN_NOT_PROPAGATE_NAN, nd,
-        shape.data(), padding.data(), strides.data());
-    if (status != CUDNN_STATUS_SUCCESS) {
-      LOG(FATAL) << "could not set cudnn pooling descriptor: "
-                 << ToString(status);
-    }
-  }
-  ~ScopedPoolingDescriptor() {
-    cudnnStatus_t status = cudnnDestroyPoolingDescriptor(handle_);
-    if (status != CUDNN_STATUS_SUCCESS) {
-      LOG(ERROR) << "could not destroy cudnn pooling descriptor: "
-                 << ToString(status);
-    }
+        shape.data(), padding.data(), strides.data()));
   }
 
-  cudnnPoolingDescriptor_t handle() const { return handle_; }
+  cudnnPoolingDescriptor_t handle() const { return handle_.get(); }
 
  private:
-  cudnnPoolingDescriptor_t handle_;  // Owned.
+  PoolingDescriptor handle_;  // Owned.
 
   SE_DISALLOW_COPY_AND_ASSIGN(ScopedPoolingDescriptor);
 };
@@ -698,13 +747,7 @@ class ScopedNormalizeDescriptor {
  public:
   explicit ScopedNormalizeDescriptor(
       const dnn::NormalizeDescriptor& normalize_descriptor)
-      : handle_(nullptr) {
-    cudnnStatus_t status = cudnnCreateLRNDescriptor(&handle_);
-    if (status != CUDNN_STATUS_SUCCESS) {
-      LOG(FATAL) << "could not create cudnn LRN descriptor: "
-                 << ToString(status);
-    }
-
+      : handle_(CreateLrnDescriptor()) {
     // The range specifies that the indices in the closed range
     // [i - range, i + range] should be included in the normalization for index
     // i. The lrnN value is the total number of elements in the range, so
@@ -725,24 +768,14 @@ class ScopedNormalizeDescriptor {
 
     double lrnBeta = normalize_descriptor.beta();
     double lrnK = normalize_descriptor.bias();
-    status = cudnnSetLRNDescriptor(handle_, lrnN, lrnAlpha, lrnBeta, lrnK);
-    if (status != CUDNN_STATUS_SUCCESS) {
-      LOG(FATAL) << "could not set cudnn LRN descriptor: " << ToString(status);
-    }
-  }
-
-  ~ScopedNormalizeDescriptor() {
-    cudnnStatus_t status = cudnnDestroyLRNDescriptor(handle_);
-    if (status != CUDNN_STATUS_SUCCESS) {
-      LOG(ERROR) << "could not destroy cudnn LRN descriptor: "
-                 << ToString(status);
-    }
+    CHECK_CUDNN_OK(
+        cudnnSetLRNDescriptor(handle_.get(), lrnN, lrnAlpha, lrnBeta, lrnK));
   }
 
-  cudnnLRNDescriptor_t handle() const { return handle_; }
+  cudnnLRNDescriptor_t handle() const { return handle_.get(); }
 
  private:
-  cudnnLRNDescriptor_t handle_;  // Owned.
+  LrnDescriptor handle_;  // Owned.
 
   SE_DISALLOW_COPY_AND_ASSIGN(ScopedNormalizeDescriptor);
 };
@@ -754,13 +787,7 @@ class ScopedActivationDescriptor {
   ScopedActivationDescriptor(dnn::ActivationMode activation_mode,
                              cudnnNanPropagation_t nan_propagation,
                              double value_max)
-      : handle_(nullptr) {
-    cudnnStatus_t status = cudnnCreateActivationDescriptor(&handle_);
-    if (status != CUDNN_STATUS_SUCCESS) {
-      LOG(FATAL) << "could not create cudnn activation descriptor: "
-                 << ToString(status);
-    }
-
+      : handle_(CreateActivationDescriptor()) {
     double relu_ceiling = 0.0;
     cudnnActivationMode_t mode;
     switch (activation_mode) {
@@ -786,26 +813,14 @@ class ScopedActivationDescriptor {
                    << static_cast<int>(activation_mode);
     }
 
-    status = cudnnSetActivationDescriptor(handle_, mode, nan_propagation,
-                                          relu_ceiling);
-    if (status != CUDNN_STATUS_SUCCESS) {
-      LOG(FATAL) << "could not set cudnn activation descriptor: "
-                 << ToString(status);
-    }
-  }
-
-  ~ScopedActivationDescriptor() {
-    cudnnStatus_t status = cudnnDestroyActivationDescriptor(handle_);
-    if (status != CUDNN_STATUS_SUCCESS) {
-      LOG(ERROR) << "could not destroy cudnn activation descriptor: "
-                 << ToString(status);
-    }
+    CHECK_CUDNN_OK(cudnnSetActivationDescriptor(handle_.get(), mode,
+                                                nan_propagation, relu_ceiling));
   }
 
-  cudnnActivationDescriptor_t handle() const { return handle_; }
+  cudnnActivationDescriptor_t handle() const { return handle_.get(); }
 
  private:
-  cudnnActivationDescriptor_t handle_;  // Owned.
+  ActivationDescriptor handle_;  // Owned.
 
   SE_DISALLOW_COPY_AND_ASSIGN(ScopedActivationDescriptor);
 };
@@ -873,117 +888,74 @@ int CudnnDataTypeToByteSize(cudnnDataType_t data_type) {
   }
 }
 
-template <typename Base>
-class MixinBase : public Base {};
-template <>
-class MixinBase<void> {};
-
-#define CUDNN_RETURN_IF_FAIL(STATUS, ...)                                \
-  if (!SE_PREDICT_TRUE((STATUS) == CUDNN_STATUS_SUCCESS)) {              \
-    string error_msg = port::StrCat(ToString(STATUS), " ", __VA_ARGS__); \
-    SetFailure(port::Status(port::error::UNKNOWN, error_msg));           \
-    LOG(ERROR) << error_msg;                                             \
-    return;                                                              \
-  }
+class ScopedDropoutDescriptor {
+  explicit ScopedDropoutDescriptor(DropoutDescriptor handle)
+      : handle_(std::move(handle)) {}
 
-// TODO(csigg): Remove inheritance for code reuse.
-template <typename Base>
-class CudnnDescriptorCommon : public MixinBase<Base> {
  public:
-  bool ok() const { return status_.ok(); }
-  port::Status Status() const { return status_; }
+  ScopedDropoutDescriptor(ScopedDropoutDescriptor&&) = default;
 
- protected:
-  void SetFailure(const port::Status& status) { status_.Update(status); }
-  port::Status status_;
-};
+  static port::StatusOr<ScopedDropoutDescriptor> Create(
+      const CudnnHandle& cudnn, float dropout, uint64 seed,
+      ScratchAllocator* state_allocator) {
+    DropoutDescriptor handle = CreateDropoutDescriptor();
 
-class CudnnDropoutDescriptor : public CudnnDescriptorCommon<void> {
- public:
-  CudnnDropoutDescriptor(const CudnnHandle& cudnn, float dropout, uint64 seed,
-                         ScratchAllocator* state_allocator)
-      : handle_(nullptr) {
-    cudnnStatus_t status;
-    status = cudnnCreateDropoutDescriptor(&handle_);
-    CUDNN_RETURN_IF_FAIL(status, "Failed to create dropout descriptor");
-
-    if (dropout == 0.f) {
-      return;
+    if (dropout == 0.0f) {
+      // No dropout requested: return the descriptor without configuring it.
+      return ScopedDropoutDescriptor(std::move(handle));
     }
 
     DeviceMemory<uint8> state_memory;
     if (state_allocator) {
       size_t state_sizes_in_bytes = 0;
-      status = cudnnDropoutGetStatesSize(cudnn.handle(), &state_sizes_in_bytes);
-      CUDNN_RETURN_IF_FAIL(status, "Failed to query dropout state sizes");
-
-      auto allocated =
-          state_allocator->AllocateBytes(nullptr, state_sizes_in_bytes);
-      if (!allocated.ok() ||
-          (state_memory = allocated.ValueOrDie()) == nullptr) {
-        string error_msg =
-            port::StrCat("Failed to allocate Cudnn dropout state memory of ",
-                         state_sizes_in_bytes, " bytes.");
-        status_ = port::Status(port::error::UNKNOWN, error_msg);
-        LOG(ERROR) << error_msg;
-        return;
-      }
+      RETURN_IF_CUDNN_ERROR(
+          cudnnDropoutGetStatesSize(cudnn.handle(), &state_sizes_in_bytes));
+      SE_ASSIGN_OR_RETURN(state_memory, state_allocator->AllocateBytes(
+                                            nullptr, state_sizes_in_bytes));
     }
-    status = cudnnSetDropoutDescriptor(handle_, cudnn.handle(), dropout,
-                                       state_memory.opaque(),
-                                       state_memory.size(), seed);
-    CUDNN_RETURN_IF_FAIL(
-        status, port::StrCat(
-                    "Failed to set dropout descriptor with state memory size: ",
-                    state_memory.size(), " bytes."));
-  }
+    RETURN_IF_CUDNN_ERROR(cudnnSetDropoutDescriptor(
+        handle.get(), cudnn.handle(), dropout, state_memory.opaque(),
+        state_memory.size(), seed));
 
-  ~CudnnDropoutDescriptor() {
-    cudnnStatus_t status = cudnnDestroyDropoutDescriptor(handle_);
-    // TODO(csigg): This is a no-op (error is not reported). Same below.
-    CUDNN_RETURN_IF_FAIL(status, "Failed to destroy Cudnn dropout handle: ");
+    return ScopedDropoutDescriptor(std::move(handle));
   }
 
-  cudnnDropoutDescriptor_t handle() const {
-    if (!ok()) return nullptr;
-    return handle_;
-  }
+  cudnnDropoutDescriptor_t handle() const { return handle_.get(); }
 
  private:
-  cudnnDropoutDescriptor_t handle_;  // Owned.
-  float dropout_;
-  uint64 seed_;
-  SE_DISALLOW_COPY_AND_ASSIGN(CudnnDropoutDescriptor);
+  DropoutDescriptor handle_;  // Owned.
+  SE_DISALLOW_COPY_AND_ASSIGN(ScopedDropoutDescriptor);
 };
 
-class CudnnRnnParamsDescriptor : public CudnnDescriptorCommon<void> {
- public:
-  typedef dnn::RnnDescriptor::ParamsRegion ParamsRegion;
+class CudnnRnnParamsDescriptor {
   typedef dnn::RnnDescriptor::ParamsRegions ParamsRegions;
-  CudnnRnnParamsDescriptor(const CudnnHandle& cudnn,
-                           const CudnnRnnDescriptor& rnn_desc);
-  ~CudnnRnnParamsDescriptor() {
-    cudnnStatus_t status = cudnnDestroyFilterDescriptor(handle_);
-    CUDNN_RETURN_IF_FAIL(status, "Failed to destroy RNN filter descriptor");
-  }
-  cudnnFilterDescriptor_t handle() const {
-    if (!ok()) return nullptr;
-    return handle_;
-  }
+
+  CudnnRnnParamsDescriptor(FilterDescriptor handle, int64 params_size_in_bytes,
+                           ParamsRegions weights, ParamsRegions biases)
+      : handle_(std::move(handle)),
+        params_size_in_bytes_(params_size_in_bytes),
+        weights_(std::move(weights)),
+        biases_(std::move(biases)) {}
+
+ public:
+  CudnnRnnParamsDescriptor(CudnnRnnParamsDescriptor&&) = default;
+
+  static port::StatusOr<CudnnRnnParamsDescriptor> Create(
+      const CudnnHandle& cudnn, int input_size, cudnnDataType_t data_type,
+      cudnnRNNDescriptor_t rnn_desc, cudnnRNNMode_t rnn_mode,
+      cudnnDirectionMode_t direction_mode, int num_layers);
+
+  cudnnFilterDescriptor_t handle() const { return handle_.get(); }
   int64 params_size_in_bytes() const { return params_size_in_bytes_; }
   ParamsRegions params_weights() const {
-    if (!ok()) return ParamsRegions();
     return weights_;
   }
   ParamsRegions params_biases() const {
-    if (!ok()) return ParamsRegions();
     return biases_;
   }
 
  private:
-  int GetRegionCountPerLayer() const;
-  cudnnFilterDescriptor_t handle_;
-  const CudnnRnnDescriptor* rnn_desc_;
+  FilterDescriptor handle_;
   int64 params_size_in_bytes_;
   ParamsRegions weights_;
   ParamsRegions biases_;
@@ -992,97 +964,90 @@ class CudnnRnnParamsDescriptor : public CudnnDescriptorCommon<void> {
 
 }  // namespace
 
-class CudnnRnnDescriptor : public CudnnDescriptorCommon<dnn::RnnDescriptor> {
- public:
-  CudnnRnnDescriptor(const CudnnHandle& cudnn, int num_layers, int hidden_size,
-                     int input_size, int batch_size,
+class CudnnRnnDescriptor : public dnn::RnnDescriptor {
+  CudnnRnnDescriptor(const CudnnHandle& cudnn, cuda::RnnDescriptor rnn_desc,
+                     PersistentRnnPlan rnn_plan, int num_layers,
+                     int hidden_size, int input_size, int batch_size,
                      cudnnRNNInputMode_t input_mode,
                      cudnnDirectionMode_t direction_mode,
                      cudnnRNNMode_t rnn_mode, cudnnDataType_t data_type,
                      cudnnDataType_t compute_type,
                      const dnn::AlgorithmConfig& algorithm_config,
-                     float dropout, uint64 seed,
-                     ScratchAllocator* state_allocator)
-      : rnn_desc_(nullptr),
+                     ScopedDropoutDescriptor dropout_desc,
+                     CudnnRnnParamsDescriptor params_desc)
+      : rnn_desc_(std::move(rnn_desc)),
+        rnn_plan_(std::move(rnn_plan)),
         num_layers_(num_layers),
         hidden_size_(hidden_size),
         input_size_(input_size),
         batch_size_(batch_size),
-        rnn_plan_(nullptr),
+        rnn_algo_(ToCudnnRNNAlgo(algorithm_config.algorithm())),
         input_mode_(input_mode),
         direction_mode_(direction_mode),
         rnn_mode_(rnn_mode),
         data_type_(data_type),
         compute_type_(compute_type),
-        algorithm_config_(algorithm_config) {
-    // Create the dropout handle.
-    cudnn_dropout_desc_.reset(
-        new CudnnDropoutDescriptor(cudnn, dropout, seed, state_allocator));
-    if (!cudnn_dropout_desc_->ok()) {
-      SetFailure(cudnn_dropout_desc_->Status());
-      return;
-    }
+        algorithm_config_(algorithm_config),
+        dropout_desc_(std::move(dropout_desc)),
+        params_desc_(std::move(params_desc)) {}
+
+ public:
+  CudnnRnnDescriptor(CudnnRnnDescriptor&& other) = default;
+
+  static port::StatusOr<CudnnRnnDescriptor> Create(
+      const CudnnHandle& cudnn, int num_layers, int hidden_size, int input_size,
+      int batch_size, cudnnRNNInputMode_t input_mode,
+      cudnnDirectionMode_t direction_mode, cudnnRNNMode_t rnn_mode,
+      cudnnDataType_t data_type, cudnnDataType_t compute_type,
+      const dnn::AlgorithmConfig& algorithm_config, float dropout, uint64 seed,
+      ScratchAllocator* state_allocator) {
+    SE_ASSIGN_OR_RETURN(
+        ScopedDropoutDescriptor dropout_desc,
+        ScopedDropoutDescriptor::Create(cudnn, dropout, seed, state_allocator));
+
+    cuda::RnnDescriptor rnn_desc = CreateRnnDescriptor();
+    cudnnRNNAlgo_t rnn_algo = ToCudnnRNNAlgo(algorithm_config.algorithm());
 
-    // Create the RNN handle
-    cudnnStatus_t status = cudnnCreateRNNDescriptor(&rnn_desc_);
-    CUDNN_RETURN_IF_FAIL(status, "Unable to create RNN descriptor");
     // TODO: allow the user to choose an algorithm.
-    rnn_algo_ = ToCudnnRNNAlgo(algorithm_config_.algorithm());
-    status = cudnnSetRNNDescriptor_v6(
-        cudnn.handle(), /*rnnDesc=*/rnn_desc_, /*hiddenSize=*/hidden_size,
-        /*numLayers=*/num_layers, /*dropoutDesc=*/dropout_handle(),
+    RETURN_IF_CUDNN_ERROR(cudnnSetRNNDescriptor_v6(
+        cudnn.handle(), /*rnnDesc=*/rnn_desc.get(), /*hiddenSize=*/hidden_size,
+        /*numLayers=*/num_layers, /*dropoutDesc=*/dropout_desc.handle(),
         /*inputMode=*/input_mode, /*direction=*/direction_mode,
-        /*mode=*/rnn_mode, /*algo=*/rnn_algo_, /*dataType=*/compute_type);
-    CUDNN_RETURN_IF_FAIL(status, ::tensorflow::strings::Printf(
-                                     "Unable to update RNN descriptor with "
-                                     "algo_id: %d and compute_type: %d",
-                                     static_cast<int>(rnn_algo_),
-                                     static_cast<int>(compute_type)));
-
-    if (rnn_algo_ == CUDNN_RNN_ALGO_PERSIST_DYNAMIC) {
-      CHECK_GE(batch_size_, 0);
-      status = cudnnCreatePersistentRNNPlan(rnn_desc_, batch_size_, data_type_,
-                                            &rnn_plan_);
-      CUDNN_RETURN_IF_FAIL(status, "Unable to create persistent RNN plan.");
-      status = cudnnSetPersistentRNNPlan(rnn_desc_, rnn_plan_);
-      CUDNN_RETURN_IF_FAIL(status, "Unable to update persistent RNN plan.");
+        /*mode=*/rnn_mode, /*algo=*/rnn_algo,
+        /*dataType=*/compute_type));
+
+    PersistentRnnPlan rnn_plan;
+    if (rnn_algo == CUDNN_RNN_ALGO_PERSIST_DYNAMIC) {
+      CHECK_GE(batch_size, 0);
+      rnn_plan = CreatePersistentRnnPlan(rnn_desc.get(), batch_size, data_type);
+      RETURN_IF_CUDNN_ERROR(
+          cudnnSetPersistentRNNPlan(rnn_desc.get(), rnn_plan.get()));
     }
 
     // Create the params handle.
-    cudnn_params_desc_.reset(new CudnnRnnParamsDescriptor(cudnn, *this));
-    if (!cudnn_params_desc_->ok()) {
-      SetFailure(cudnn_params_desc_->Status());
-      return;
-    }
-    set_use_tensor_op_math(algorithm_config_.algorithm().tensor_ops_enabled());
-  }
-  ~CudnnRnnDescriptor() override {
-    if (rnn_desc_) {
-      cudnnStatus_t status;
-      if (rnn_algo_ == CUDNN_RNN_ALGO_PERSIST_DYNAMIC && rnn_plan_) {
-        status = cudnnDestroyPersistentRNNPlan(rnn_plan_);
-        CUDNN_RETURN_IF_FAIL(status, "Unable to destroy persistent RNN plan.");
-      }
-      status = cudnnDestroyRNNDescriptor(rnn_desc_);
-      CUDNN_RETURN_IF_FAIL(status, "Unable to destroy RNN descriptor");
-    }
-  }
-  void set_use_tensor_op_math(bool use_tensor_op_math) {
+    SE_ASSIGN_OR_RETURN(auto params_desc,
+                        CudnnRnnParamsDescriptor::Create(
+                            cudnn, input_size, data_type, rnn_desc.get(),
+                            rnn_mode, direction_mode, num_layers));
+
 #if CUDNN_VERSION >= 7000
-    cudnnMathType_t math_type =
-        (use_tensor_op_math ? CUDNN_TENSOR_OP_MATH : CUDNN_DEFAULT_MATH);
     if (RnnTensorOpMathEnabled()) {
-      cudnnStatus_t status = cudnnSetRNNMatrixMathType(rnn_desc_, math_type);
-      if (status != CUDNN_STATUS_SUCCESS) {
-        LOG(FATAL) << "could not set cudnn RNN math type: " << ToString(status);
-      }
+      cudnnMathType_t math_type =
+          algorithm_config.algorithm().tensor_ops_enabled()
+              ? CUDNN_TENSOR_OP_MATH
+              : CUDNN_DEFAULT_MATH;
+      CHECK_CUDNN_OK(cudnnSetRNNMatrixMathType(rnn_desc.get(), math_type));
     }
 #endif
+
+    return CudnnRnnDescriptor(cudnn, std::move(rnn_desc), std::move(rnn_plan),
+                              num_layers, hidden_size, input_size, batch_size,
+                              input_mode, direction_mode, rnn_mode, data_type,
+                              compute_type, algorithm_config,
+                              std::move(dropout_desc), std::move(params_desc));
   }
-  cudnnRNNDescriptor_t handle() const {
-    if (!ok()) return nullptr;
-    return rnn_desc_;
-  }
+
+  cudnnRNNDescriptor_t handle() const { return rnn_desc_.get(); }
   int num_layers() const { return num_layers_; }
   int hidden_size() const { return hidden_size_; }
   int input_size() const { return input_size_; }
@@ -1096,27 +1061,21 @@ class CudnnRnnDescriptor : public CudnnDescriptorCommon<dnn::RnnDescriptor> {
     return algorithm_config_;
   }
   int64 ParamsSizeInBytes() const override {
-    return cudnn_params_desc_->params_size_in_bytes();
-  }
-  cudnnDropoutDescriptor_t dropout_handle() const {
-    if (!cudnn_dropout_desc_) return nullptr;
-    return cudnn_dropout_desc_->handle();
+    return params_desc_.params_size_in_bytes();
   }
   cudnnFilterDescriptor_t params_handle() const {
-    if (!cudnn_params_desc_) return nullptr;
-    return cudnn_params_desc_->handle();
+    return params_desc_.handle();
   }
   ParamsRegions ParamsWeightRegions() const override {
-    if (!ok()) return ParamsRegions();
-    return cudnn_params_desc_->params_weights();
+    return params_desc_.params_weights();
   }
   ParamsRegions ParamsBiasRegions() const override {
-    if (!ok()) return ParamsRegions();
-    return cudnn_params_desc_->params_biases();
+    return params_desc_.params_biases();
   }
 
  private:
-  cudnnRNNDescriptor_t rnn_desc_;
+  cuda::RnnDescriptor rnn_desc_;
+  PersistentRnnPlan rnn_plan_;
   int num_layers_;
   int hidden_size_;
   int input_size_;
@@ -1124,180 +1083,142 @@ class CudnnRnnDescriptor : public CudnnDescriptorCommon<dnn::RnnDescriptor> {
   // algorithm.
   int batch_size_;
   cudnnRNNAlgo_t rnn_algo_;
-  cudnnPersistentRNNPlan_t rnn_plan_;
   cudnnRNNInputMode_t input_mode_;
   cudnnDirectionMode_t direction_mode_;
   cudnnRNNMode_t rnn_mode_;
   cudnnDataType_t data_type_;
   cudnnDataType_t compute_type_;
   dnn::AlgorithmConfig algorithm_config_;
-  std::unique_ptr<CudnnDropoutDescriptor> cudnn_dropout_desc_;
-  std::unique_ptr<CudnnRnnParamsDescriptor> cudnn_params_desc_;
+  ScopedDropoutDescriptor dropout_desc_;
+  CudnnRnnParamsDescriptor params_desc_;
   SE_DISALLOW_COPY_AND_ASSIGN(CudnnRnnDescriptor);
 };
 
 namespace {
 
-CudnnRnnParamsDescriptor::CudnnRnnParamsDescriptor(
-    const CudnnHandle& cudnn, const CudnnRnnDescriptor& rnn_desc)
-    : handle_(nullptr), rnn_desc_(&rnn_desc), params_size_in_bytes_(0) {
-  cudnnTensorDescriptor_t input_desc = nullptr;
-  {
-    // Query the params size.
-    auto status = cudnnCreateTensorDescriptor(&input_desc);
-    CUDNN_RETURN_IF_FAIL(status, "Cudnn fails to create tensor descriptor");
-    int dims[] = {1, rnn_desc.input_size(), 1};
-    int strides[] = {dims[1] * dims[2], dims[2], 1};
-    status = cudnnSetTensorNdDescriptor(
-        /*tensorDesc=*/input_desc, /*dataType=*/rnn_desc.data_type(),
-        /*nbDims=*/sizeof(dims) / sizeof(dims[0]), /*dimA=*/dims,
-        /*strideA=*/strides);
-    CUDNN_RETURN_IF_FAIL(status, "Cudnn fails to set tensor descriptor");
-
-    size_t params_size = 0;
-    status = cudnnGetRNNParamsSize(
-        /*handle=*/cudnn.handle(), /*rnnDesc=*/rnn_desc.handle(),
-        /*xDesc=*/input_desc, /*sizeInBytes=*/&params_size,
-        /*dataType=*/rnn_desc.data_type());
-    CUDNN_RETURN_IF_FAIL(status, "Cudnn fails to get RNN parameter size");
-    params_size_in_bytes_ = static_cast<int64>(params_size);
-  }
-
-  {
-    // Create the params descriptor.
-    auto status = cudnnCreateFilterDescriptor(&handle_);
-    CUDNN_RETURN_IF_FAIL(status, "Cudnn fails to create RNN filter descriptor");
-    int dims[] = {static_cast<int>(params_size_in_bytes_), 1, 1};
-    status = cudnnSetFilterNdDescriptor(
-        /*filterDesc=*/handle_, /*dataType=*/rnn_desc.data_type(),
-        /*format=*/CUDNN_TENSOR_NCHW, /*nbDims=*/sizeof(dims) / sizeof(dims[0]),
-        /*filterDimA=*/dims);
-    CUDNN_RETURN_IF_FAIL(status, "Cudnn fails to update RNN filter descriptor");
-  }
+port::StatusOr<CudnnRnnParamsDescriptor> CudnnRnnParamsDescriptor::Create(
+    const CudnnHandle& cudnn, int input_size, cudnnDataType_t data_type,
+    cudnnRNNDescriptor_t rnn_desc, cudnnRNNMode_t rnn_mode,
+    cudnnDirectionMode_t direction_mode, int num_layers) {
+  // Query the params size.
+  TensorDescriptor input_desc = CreateTensorDescriptor();
+  int tensor_dims[] = {1, input_size, 1};
+  int strides[] = {tensor_dims[1] * tensor_dims[2], tensor_dims[2], 1};
+  RETURN_IF_CUDNN_ERROR(cudnnSetTensorNdDescriptor(
+      /*tensorDesc=*/input_desc.get(), /*dataType=*/data_type,
+      /*nbDims=*/sizeof(tensor_dims) / sizeof(tensor_dims[0]),
+      /*dimA=*/tensor_dims,
+      /*strideA=*/strides));
+
+  size_t params_size = 0;
+  RETURN_IF_CUDNN_ERROR(cudnnGetRNNParamsSize(
+      /*handle=*/cudnn.handle(), /*rnnDesc=*/rnn_desc,
+      /*xDesc=*/input_desc.get(), /*sizeInBytes=*/&params_size,
+      /*dataType=*/data_type));
+  int64 params_size_in_bytes = static_cast<int64>(params_size);
+
+  FilterDescriptor filter_desc = CreateFilterDescriptor();
+  int filter_dims[] = {static_cast<int>(params_size_in_bytes), 1, 1};
+  RETURN_IF_CUDNN_ERROR(cudnnSetFilterNdDescriptor(
+      /*filterDesc=*/filter_desc.get(), /*dataType=*/data_type,
+      /*format=*/CUDNN_TENSOR_NCHW,
+      /*nbDims=*/sizeof(filter_dims) / sizeof(filter_dims[0]),
+      /*filterDimA=*/filter_dims));
+
+  // Locate each weight and bias region (offset, size) in the params buffer.
+  int region_count_per_layer = [&] {
+    switch (rnn_mode) {
+      case CUDNN_RNN_RELU:
+      case CUDNN_RNN_TANH:
+        return 2;
+      case CUDNN_LSTM:
+        return 8;
+      case CUDNN_GRU:
+        return 6;
+      default:
+        LOG(FATAL) << "Invalid RNN Mode: " << static_cast<int>(rnn_mode);
+        return 0;
+    }
+  }();
 
-  {
-    // Create the weights and biases into the params buffer
-    int region_count_per_layer = GetRegionCountPerLayer();
-    cudnnFilterDescriptor_t region_desc_handle = nullptr;
-    auto status = cudnnCreateFilterDescriptor(&region_desc_handle);
-    CUDNN_RETURN_IF_FAIL(status, "Cudnn fails to create filter descriptor");
-    const int layer_count = rnn_desc.direction_mode() == CUDNN_UNIDIRECTIONAL
-                                ? rnn_desc.num_layers()
-                                : 2 * rnn_desc.num_layers();
-    for (int layer = 0; layer < layer_count; layer++) {
-      for (int region = 0; region < region_count_per_layer; region++) {
-        for (int type = 0; type < 2; type++) {
-          void* offset = nullptr;
-          if (type == 0) {
-            status = cudnnGetRNNLinLayerMatrixParams(
-                /*handle=*/cudnn.handle(), /*rnnDesc=*/rnn_desc.handle(),
-                /*layer=*/layer, /*xDesc=*/input_desc, /*wDesc=*/handle_,
-                /*w=*/nullptr, /*linLayerID=*/region,
-                /*linLayerMatDesc=*/region_desc_handle,
-                /*linLayerMat=*/&offset);
-            CUDNN_RETURN_IF_FAIL(
-                status, "Cudnn fails to call cudnnGetRNNLinLayerMatrixParams");
-          } else {
-            status = cudnnGetRNNLinLayerBiasParams(
-                /*handle=*/cudnn.handle(), /*rnnDesc=*/rnn_desc.handle(),
-                /*layer=*/layer, /*xDesc=*/input_desc, /*wDesc=*/handle_,
-                /*w=*/nullptr, /*linLayerID=*/region,
-                /*linLayerBiasDesc=*/region_desc_handle,
-                /*linLayerBias=*/&offset);
-            CUDNN_RETURN_IF_FAIL(
-                status, "Cudnn fails to call cudnnGetRNNLinLayerBiasParams");
-          }
-          int dims[] = {1, 1, 1};
-          cudnnDataType_t data_type;
-          cudnnTensorFormat_t tensor_format;
-          int n_dims;
-          status = cudnnGetFilterNdDescriptor(
-              /*filterDesc=*/region_desc_handle,
-              /*nbDimsRequested=*/sizeof(dims) / sizeof(dims[0]),
-              /*dataType=*/&data_type, /*format=*/&tensor_format,
-              /*nbDims=*/&n_dims, /*filterDimA=*/dims);
-          CUDNN_RETURN_IF_FAIL(status, "Cudnn fails to get filter description");
-          int64 size = dims[0] * dims[1] * dims[2] *
-                       CudnnDataTypeToByteSize(rnn_desc.data_type());
-          ParamsRegion region = {reinterpret_cast<int64>(offset), size};
-          if (type == 0) {
-            weights_.push_back(region);
-          } else {
-            biases_.push_back(region);
-          }
-        }
+  FilterDescriptor region_desc_handle = CreateFilterDescriptor();
+  const int layer_count =
+      direction_mode == CUDNN_UNIDIRECTIONAL ? num_layers : 2 * num_layers;
+
+  ParamsRegions weights;
+  ParamsRegions biases;
+
+  for (int layer = 0; layer < layer_count; layer++) {
+    for (int region = 0; region < region_count_per_layer; region++) {
+      for (int type = 0; type < 2; type++) {
+        void* offset = nullptr;
+        RETURN_IF_CUDNN_ERROR((type == 0 ? cudnnGetRNNLinLayerMatrixParams
+                                         : cudnnGetRNNLinLayerBiasParams)(
+            /*handle=*/cudnn.handle(), /*rnnDesc=*/rnn_desc,
+            /*layer=*/layer, /*xDesc=*/input_desc.get(),
+            /*wDesc=*/filter_desc.get(),
+            /*w=*/nullptr, /*linLayerID=*/region,
+            /*linLayerMatDesc=*/region_desc_handle.get(),
+            /*linLayerMat or linLayerBias=*/&offset));
+        int dims[] = {1, 1, 1};
+        cudnnDataType_t data_type;
+        cudnnTensorFormat_t tensor_format;
+        int n_dims;
+        RETURN_IF_CUDNN_ERROR(cudnnGetFilterNdDescriptor(
+            /*filterDesc=*/region_desc_handle.get(),
+            /*nbDimsRequested=*/sizeof(dims) / sizeof(dims[0]),
+            /*dataType=*/&data_type, /*format=*/&tensor_format,
+            /*nbDims=*/&n_dims, /*filterDimA=*/dims));
+        int64 size =
+            dims[0] * dims[1] * dims[2] * CudnnDataTypeToByteSize(data_type);
+        dnn::RnnDescriptor::ParamsRegion region = {
+            reinterpret_cast<int64>(offset), size};
+        (type == 0 ? weights : biases).push_back(region);
       }
     }
-    status = cudnnDestroyFilterDescriptor(region_desc_handle);
-    CUDNN_RETURN_IF_FAIL(status, "Cudnn fails to destroy filter descriptor");
   }
 
-  {
-    // Release the dummy input tensor descriptor.
-    auto status = cudnnDestroyTensorDescriptor(input_desc);
-    CUDNN_RETURN_IF_FAIL(status, "Cudnn fails to destroy tensor descriptor");
-  }
-}
-
-int CudnnRnnParamsDescriptor::GetRegionCountPerLayer() const {
-  auto rnn_mode = rnn_desc_->rnn_mode();
-  switch (rnn_mode) {
-    case CUDNN_RNN_RELU:
-    case CUDNN_RNN_TANH:
-      return 2;
-    case CUDNN_LSTM:
-      return 8;
-    case CUDNN_GRU:
-      return 6;
-    default:
-      LOG(FATAL) << "Invalid RNN Mode: " << static_cast<int>(rnn_mode);
-  }
+  return CudnnRnnParamsDescriptor(std::move(filter_desc), params_size_in_bytes,
+                                  weights, biases);
 }
 
 }  // namespace
 
 class CudnnRnnSequenceTensorDescriptor
-    : public CudnnDescriptorCommon<dnn::RnnSequenceTensorDescriptor> {
- public:
+    : public dnn::RnnSequenceTensorDescriptor {
   CudnnRnnSequenceTensorDescriptor(CUDAExecutor* parent, int seq_length,
                                    int batch_size, int data_size,
-                                   cudnnDataType_t data_type)
+                                   cudnnDataType_t data_type,
+                                   TensorDescriptor handle)
       : parent_(parent),
         seq_length_(seq_length),
         batch_size_(batch_size),
         data_size_(data_size),
-        data_type_(data_type) {
-    cudnnTensorDescriptor_t handle = nullptr;
-    if (seq_length <= 0) {
-      string error_msg =
-          port::StrCat("sequence length must be positive: ", seq_length);
-      LOG(ERROR) << error_msg;
-      SetFailure(port::Status(port::error::UNKNOWN, error_msg));
-      return;
-    }
-    cudnnStatus_t status = cudnnCreateTensorDescriptor(&handle);
-    CUDNN_RETURN_IF_FAIL(status, "Failed to create tensor descriptor");
+        data_type_(data_type),
+        handle_(std::move(handle)),
+        handles_(seq_length, handle_.get()) {}
+
+ public:
+  CudnnRnnSequenceTensorDescriptor(CudnnRnnSequenceTensorDescriptor&&) =
+      default;
+
+  static port::StatusOr<CudnnRnnSequenceTensorDescriptor> Create(
+      CUDAExecutor* parent, int seq_length, int batch_size, int data_size,
+      cudnnDataType_t data_type) {
+    CHECK_GT(seq_length, 0);
     int dims[] = {batch_size, data_size, 1};
     int strides[] = {dims[1] * dims[2], dims[2], 1};
-    status = cudnnSetTensorNdDescriptor(
-        /*tensorDesc=*/handle, /*dataType=*/data_type,
+    TensorDescriptor tensor_desc = CreateTensorDescriptor();
+    RETURN_IF_CUDNN_ERROR(cudnnSetTensorNdDescriptor(
+        /*tensorDesc=*/tensor_desc.get(), /*dataType=*/data_type,
         /*nbDims=*/sizeof(dims) / sizeof(dims[0]), /*dimA=*/dims,
-        /*strideA=*/strides);
-    CUDNN_RETURN_IF_FAIL(status, "Failed to update tensor descriptor");
-    // Replicate handle across the number of steps.
-    handles_.assign(seq_length, handle);
-  }
-
-  ~CudnnRnnSequenceTensorDescriptor() override {
-    // Only the first one needs to be destroyed. All others are the same.
-    cudnnStatus_t status = cudnnDestroyTensorDescriptor(handles_[0]);
-    CUDNN_RETURN_IF_FAIL(status,
-                         "Failed to destroy sequence tensor descriptor");
+        /*strideA=*/strides));
+    return CudnnRnnSequenceTensorDescriptor(parent, seq_length, batch_size,
+                                            data_size, data_type,
+                                            std::move(tensor_desc));
   }
 
   const cudnnTensorDescriptor_t* handles() const {
-    if (!ok()) return nullptr;
-    CHECK(!handles_.empty()) << "handles cannot be empty";
     return handles_.data();
   }
 
@@ -1311,51 +1232,39 @@ class CudnnRnnSequenceTensorDescriptor
   int batch_size_;
   int data_size_;
   cudnnDataType_t data_type_;
-  std::vector<cudnnTensorDescriptor_t> handles_;
+  TensorDescriptor handle_;
+  std::vector<cudnnTensorDescriptor_t> handles_;  // Copies of handle_.
   SE_DISALLOW_COPY_AND_ASSIGN(CudnnRnnSequenceTensorDescriptor);
 };
 
-class CudnnRnnStateTensorDescriptor
-    : public CudnnDescriptorCommon<dnn::RnnStateTensorDescriptor> {
+class CudnnRnnStateTensorDescriptor : public dnn::RnnStateTensorDescriptor {
  public:
   CudnnRnnStateTensorDescriptor(CUDAExecutor* parent, int num_layers,
                                 int batch_size, int data_size,
                                 cudnnDataType_t data_type)
       : parent_(parent),
-        handle_(nullptr),
+        handle_(CreateTensorDescriptor()),
         num_layers_(num_layers),
         batch_size_(batch_size),
         data_size_(data_size),
         data_type_(data_type) {
-    cudnnStatus_t status = cudnnCreateTensorDescriptor(&handle_);
-    CUDNN_RETURN_IF_FAIL(status, "Failed to create tensor descriptor");
     int dims[] = {num_layers, batch_size, data_size};
     int strides[] = {dims[1] * dims[2], dims[2], 1};
-    status = cudnnSetTensorNdDescriptor(
-        /*tensorDesc=*/handle_, /*dataType=*/data_type,
+    CHECK_CUDNN_OK(cudnnSetTensorNdDescriptor(
+        /*tensorDesc=*/handle_.get(), /*dataType=*/data_type,
         /*nbDims=*/sizeof(dims) / sizeof(dims[0]), /*dimA=*/dims,
-        /*strideA=*/strides);
-    CUDNN_RETURN_IF_FAIL(status, "Failed to update tensor descriptor");
+        /*strideA=*/strides));
   }
 
-  ~CudnnRnnStateTensorDescriptor() override {
-    if (!handle_) {
-      cudnnStatus_t status = cudnnDestroyTensorDescriptor(handle_);
-      CUDNN_RETURN_IF_FAIL(status, "Unable to destroy RNN state tensor");
-    }
-  }
+  cudnnTensorDescriptor_t handle() const { return handle_.get(); }
 
-  cudnnTensorDescriptor_t handle() const {
-    if (!ok()) return nullptr;
-    return handle_;
-  }
   int num_layers() const { return num_layers_; }
   int batch_size() const { return batch_size_; }
   int data_size() const { return data_size_; }
 
  private:
   CUDAExecutor* parent_;
-  cudnnTensorDescriptor_t handle_;
+  TensorDescriptor handle_;
   int num_layers_;
   int batch_size_;
   int data_size_;
@@ -1375,7 +1284,7 @@ struct RnnModelDims {
 };
 
 template <class T>
-bool ExtractAndCheckRnnForward(
+port::StatusOr<RnnModelDims> ExtractAndCheckRnnForward(
     const CudnnRnnDescriptor& rnn_desc,
     const CudnnRnnSequenceTensorDescriptor& input_desc,
     const DeviceMemory<T>& input_data,
@@ -1388,103 +1297,89 @@ bool ExtractAndCheckRnnForward(
     const CudnnRnnStateTensorDescriptor& output_h_desc,
     const DeviceMemory<T>& output_h_data,
     const CudnnRnnStateTensorDescriptor& output_c_desc,
-    const DeviceMemory<T>& output_c_data, RnnModelDims* model_dims) {
+    const DeviceMemory<T>& output_c_data) {
   // extract model parameters
-  model_dims->num_layers = rnn_desc.num_layers();
-  model_dims->batch_size = input_desc.batch_size();
-  model_dims->seq_length = input_desc.seq_length();
-  model_dims->hidden_size = rnn_desc.hidden_size();
-  model_dims->input_size = input_desc.data_size();
-  model_dims->dir_count =
+  RnnModelDims model_dims;
+  model_dims.num_layers = rnn_desc.num_layers();
+  model_dims.batch_size = input_desc.batch_size();
+  model_dims.seq_length = input_desc.seq_length();
+  model_dims.hidden_size = rnn_desc.hidden_size();
+  model_dims.input_size = input_desc.data_size();
+  model_dims.dir_count =
       (rnn_desc.direction_mode() == CUDNN_BIDIRECTIONAL) ? 2 : 1;
 
   // check parameters
   if (!(input_h_desc.num_layers() ==
-            model_dims->num_layers * model_dims->dir_count &&
-        input_h_desc.batch_size() == model_dims->batch_size &&
-        input_h_desc.data_size() == model_dims->hidden_size)) {
-    LOG(ERROR) << "Invalid input_h shape";
-    return false;
+            model_dims.num_layers * model_dims.dir_count &&
+        input_h_desc.batch_size() == model_dims.batch_size &&
+        input_h_desc.data_size() == model_dims.hidden_size)) {
+    return port::Status(port::error::INVALID_ARGUMENT, "Invalid input_h shape");
   }
   if (!(input_h_desc.num_layers() == input_c_desc.num_layers() &&
         input_h_desc.batch_size() == input_c_desc.batch_size() &&
         input_h_desc.data_size() == input_c_desc.data_size())) {
-    LOG(ERROR) << "Invalid input_c shape";
-    return false;
+    return port::Status(port::error::INVALID_ARGUMENT, "Invalid input_c shape");
   }
-  if (!(output_desc.seq_length() == model_dims->seq_length &&
-        output_desc.batch_size() == model_dims->batch_size &&
+  if (!(output_desc.seq_length() == model_dims.seq_length &&
+        output_desc.batch_size() == model_dims.batch_size &&
         output_desc.data_size() ==
-            model_dims->hidden_size * model_dims->dir_count)) {
-    LOG(ERROR) << "Invalid output shape";
-    return false;
+            model_dims.hidden_size * model_dims.dir_count)) {
+    return port::Status(port::error::INVALID_ARGUMENT, "Invalid output shape");
   }
   if (!(input_h_desc.num_layers() == output_h_desc.num_layers() &&
         input_h_desc.batch_size() == output_h_desc.batch_size() &&
         input_h_desc.data_size() == output_h_desc.data_size())) {
-    LOG(ERROR) << "Invalid output_h shape";
-    return false;
+    return port::Status(port::error::INVALID_ARGUMENT,
+                        "Invalid output_h shape");
   }
   if (!(input_h_desc.num_layers() == output_c_desc.num_layers() &&
         input_h_desc.batch_size() == output_c_desc.batch_size() &&
         input_h_desc.data_size() == output_c_desc.data_size())) {
-    LOG(ERROR) << "Invalid output_h shape";
-    return false;
+    return port::Status(port::error::INVALID_ARGUMENT,
+                        "Invalid output_c shape");
   }
 
-  return true;
+  return model_dims;
 }
 
-bool CheckRNNParameterSize(const CudnnHandle& cudnn,
-                           const CudnnRnnDescriptor& rnn_desc,
-                           const CudnnRnnSequenceTensorDescriptor& input_desc) {
+port::Status CheckRNNParameterSize(
+    const CudnnHandle& cudnn, const CudnnRnnDescriptor& rnn_desc,
+    const CudnnRnnSequenceTensorDescriptor& input_desc) {
   size_t params_size_in_bytes = 0;
-  cudnnStatus_t status = cudnnGetRNNParamsSize(
+  RETURN_IF_CUDNN_ERROR(cudnnGetRNNParamsSize(
       /*handle=*/cudnn.handle(), /*rnnDesc=*/rnn_desc.handle(),
       /*xDesc=*/input_desc.handles()[0], /*sizeInBytes=*/&params_size_in_bytes,
-      /*dataType=*/rnn_desc.data_type());
-  if (status != CUDNN_STATUS_SUCCESS) {
-    LOG(ERROR) << "Unable to check RNN param size: " << ToString(status);
-    return false;
+      /*dataType=*/rnn_desc.data_type()));
+  if (static_cast<int64>(params_size_in_bytes) !=
+      rnn_desc.ParamsSizeInBytes()) {
+    return port::Status(port::error::INVALID_ARGUMENT,
+                        "Mismatching RNN parameter size");
   }
-  return static_cast<int64>(params_size_in_bytes) ==
-         rnn_desc.ParamsSizeInBytes();
+  return port::Status::OK();
 }
 
-bool CreateRnnWorkspace(Stream* stream, const CudnnHandle& cudnn,
-                        const CudnnRnnDescriptor& rnn_desc,
-                        const CudnnRnnSequenceTensorDescriptor& input_desc,
-                        ScratchAllocator* workspace_allocator,
-                        DeviceMemory<uint8>* workspace) {
+port::StatusOr<DeviceMemory<uint8>> CreateRnnWorkspace(
+    Stream* stream, const CudnnHandle& cudnn,
+    const CudnnRnnDescriptor& rnn_desc,
+    const CudnnRnnSequenceTensorDescriptor& input_desc,
+    ScratchAllocator* workspace_allocator) {
   // Query the workspace size.
   size_t workspace_size_in_bytes = 0;
-  cudnnStatus_t status = cudnnGetRNNWorkspaceSize(
+  RETURN_IF_CUDNN_ERROR(cudnnGetRNNWorkspaceSize(
       /*handle=*/cudnn.handle(), /*rnnDesc=*/rnn_desc.handle(),
       /*seqLength=*/input_desc.seq_length(), /*xDesc=*/input_desc.handles(),
-      /*sizeInBytes=*/&workspace_size_in_bytes);
-  if (status != CUDNN_STATUS_SUCCESS) {
-    LOG(ERROR) << "Unable to query workspace size: " << ToString(status);
-    return false;
-  }
+      /*sizeInBytes=*/&workspace_size_in_bytes));
   // Allocate the workspace.
-  if (workspace_size_in_bytes > 0) {
-    auto allocated =
-        workspace_allocator->AllocateBytes(stream, workspace_size_in_bytes);
-    if (!allocated.ok() || (*workspace = allocated.ValueOrDie()) == nullptr) {
-      LOG(ERROR) << port::StrCat("Failed to allocate RNN workspace of ",
-                                 workspace_size_in_bytes, " bytes.");
-      return false;
-    }
-  } else {
-    *workspace = DeviceMemory<uint8>();
+  if (workspace_size_in_bytes == 0) {
+    return DeviceMemory<uint8>();
   }
-  return true;
+  return workspace_allocator->AllocateBytes(stream, workspace_size_in_bytes);
 }
 
 }  // namespace
 
 template <class T>
-bool CudnnSupport::DoRnnForwardImpl(
+port::Status CudnnSupport::DoRnnForwardImpl(
     Stream* stream, const CudnnRnnDescriptor& rnn_desc,
     const CudnnRnnSequenceTensorDescriptor& input_desc,
     const DeviceMemory<T>& input_data,
@@ -1501,57 +1396,34 @@ bool CudnnSupport::DoRnnForwardImpl(
     ScratchAllocator* reserve_space_allocator,
     ScratchAllocator* workspace_allocator,
     dnn::ProfileResult* output_profile_result) {
-  // extract model parameters
-  RnnModelDims model_dims;
-  bool res = ExtractAndCheckRnnForward(
-      rnn_desc, input_desc, input_data, input_h_desc, input_h_data,
-      input_c_desc, input_c_data, params, output_desc, *output_data,
-      output_h_desc, *output_h_data, output_c_desc, *output_c_data,
-      &model_dims);
-  if (!res) {
-    LOG(ERROR) << "Invalid parameters for RNN Model";
-    return false;
-  }
+  SE_ASSIGN_OR_RETURN(
+      RnnModelDims model_dims,
+      ExtractAndCheckRnnForward(
+          rnn_desc, input_desc, input_data, input_h_desc, input_h_data,
+          input_c_desc, input_c_data, params, output_desc, *output_data,
+          output_h_desc, *output_h_data, output_c_desc, *output_c_data));
 
   auto cudnn = cudnn_->GetHandle(parent_, stream);
 
-  // check params size
-  if (!CheckRNNParameterSize(cudnn, rnn_desc, input_desc)) {
-    LOG(ERROR) << "Invalid parameters";
-    return false;
-  }
-
-  // create the workspace
-  DeviceMemory<uint8> workspace;
-  if (!CreateRnnWorkspace(stream, cudnn, rnn_desc, input_desc,
-                          workspace_allocator, &workspace)) {
-    LOG(ERROR) << "Unable to create rnn workspace";
-    return false;
-  }
+  SE_RETURN_IF_ERROR(CheckRNNParameterSize(cudnn, rnn_desc, input_desc));
+  SE_ASSIGN_OR_RETURN(DeviceMemory<uint8> workspace,
+                      CreateRnnWorkspace(stream, cudnn, rnn_desc, input_desc,
+                                         workspace_allocator))
 
   // query the reserve space size
   // allocate the reserve space
   DeviceMemory<uint8> reserve_space;
   if (is_training) {
     size_t reserve_space_size_in_bytes = 0;
-    cudnnStatus_t status = cudnnGetRNNTrainingReserveSize(
+    RETURN_IF_CUDNN_ERROR(cudnnGetRNNTrainingReserveSize(
         /*handle=*/cudnn.handle(), /*rnnDesc=*/rnn_desc.handle(),
         /*seqLength=*/model_dims.seq_length, /*xDesc=*/input_desc.handles(),
-        /*sizeInBytes=*/&reserve_space_size_in_bytes);
-    if (status != CUDNN_STATUS_SUCCESS) {
-      LOG(ERROR) << "Unable to query reserve space size: " << ToString(status);
-      return false;
-    }
+        /*sizeInBytes=*/&reserve_space_size_in_bytes));
 
     if (reserve_space_size_in_bytes > 0) {
-      auto allocated = reserve_space_allocator->AllocateBytes(
-          stream, reserve_space_size_in_bytes);
-      if (!allocated.ok() ||
-          (reserve_space = allocated.ValueOrDie()) == nullptr) {
-        LOG(ERROR) << "Failed to allocate RNN reserve space of "
-                   << reserve_space_size_in_bytes << " bytes.";
-        return false;
-      }
+      SE_ASSIGN_OR_RETURN(reserve_space,
+                          reserve_space_allocator->AllocateBytes(
+                              stream, reserve_space_size_in_bytes));
     }
   }
 
@@ -1559,20 +1431,16 @@ bool CudnnSupport::DoRnnForwardImpl(
   const bool is_profiling = output_profile_result != nullptr;
   if (is_profiling) {
     timer.reset(new CUDATimer(parent_));
-    if (!timer->Init()) {
-      return false;
-    }
     // The start and stop of the timer should be as close to the Cudnn call as
     // possible. It is still possible for other threads to issue workload on
     // to this stream. So it could take multiple profiling measurements.
-    if (!timer->Start(AsCUDAStream(stream))) {
-      return false;
+    if (!timer->Init() || !timer->Start(AsCUDAStream(stream))) {
+      return port::Status(port::error::INTERNAL, "Failed to start timer");
     }
   }
-  // make the forward call
-  cudnnStatus_t status;
+
   if (!is_training) {
-    status = cudnnRNNForwardInference(
+    RETURN_IF_CUDNN_ERROR(cudnnRNNForwardInference(
         /*handle=*/cudnn.handle(), /*rnnDesc=*/rnn_desc.handle(),
         /*seqLength=*/model_dims.seq_length, /*xDesc=*/input_desc.handles(),
         /*x=*/input_data.opaque(), /*hxDesc=*/input_h_desc.handle(),
@@ -1582,9 +1450,9 @@ bool CudnnSupport::DoRnnForwardImpl(
         /*y=*/output_data->opaque(), /*hyDesc=*/output_h_desc.handle(),
         /*hy=*/output_h_data->opaque(), /*cyDesc=*/output_c_desc.handle(),
         /*cy=*/output_c_data->opaque(), /*workspace=*/workspace.opaque(),
-        /*workSpaceSizeInBytes=*/workspace.size());
+        /*workSpaceSizeInBytes=*/workspace.size()));
   } else {
-    status = cudnnRNNForwardTraining(
+    RETURN_IF_CUDNN_ERROR(cudnnRNNForwardTraining(
         /*handle=*/cudnn.handle(), /*rnnDesc=*/rnn_desc.handle(),
         /*seqLength=*/model_dims.seq_length, /*xDesc=*/input_desc.handles(),
         /*x=*/input_data.opaque(), /*hxDesc=*/input_h_desc.handle(),
@@ -1596,35 +1464,24 @@ bool CudnnSupport::DoRnnForwardImpl(
         /*cy=*/output_c_data->opaque(), /*workspace=*/workspace.opaque(),
         /*workSpaceSizeInBytes=*/workspace.size(),
         /*reserveSpace=*/reserve_space.opaque(),
-        /*reserveSpaceSizeInBytes=*/reserve_space.size());
+        /*reserveSpaceSizeInBytes=*/reserve_space.size()));
   }
+
   if (is_profiling) {
     if (!timer->Stop(AsCUDAStream(stream))) {
-      return false;
-    }
-    if (status == CUDNN_STATUS_SUCCESS) {
-      auto algo_desc = rnn_desc.algorithm_config().algorithm();
-      output_profile_result->set_algorithm(algo_desc);
-      output_profile_result->set_elapsed_time_in_ms(
-          timer->GetElapsedMilliseconds());
-    }
-  }
-  if (status != CUDNN_STATUS_SUCCESS) {
-    // Silently return when we are profiling.
-    if (!is_profiling) {
-      LOG(ERROR) << "Failed to call "
-                 << (is_training ? "cudnnRNNForwardTraining "
-                                 : "cudnnRNNForwardInference ")
-                 << ToString(status);
-      return false;
+      return port::Status(port::error::INTERNAL, "Failed to stop timer");
     }
+    auto algo_desc = rnn_desc.algorithm_config().algorithm();
+    output_profile_result->set_algorithm(algo_desc);
+    output_profile_result->set_elapsed_time_in_ms(
+        timer->GetElapsedMilliseconds());
   }
 
-  return true;
+  return port::Status::OK();
 }
 
 template <class T>
-bool CudnnSupport::DoRnnBackwardImpl(
+port::Status CudnnSupport::DoRnnBackwardImpl(
     Stream* stream, const CudnnRnnDescriptor& rnn_desc,
     const CudnnRnnSequenceTensorDescriptor& input_desc,
     const DeviceMemory<T>& input_data,
@@ -1648,53 +1505,38 @@ bool CudnnSupport::DoRnnBackwardImpl(
     DeviceMemory<uint8>* reserve_space_data,
     ScratchAllocator* workspace_allocator,
     dnn::ProfileResult* output_profile_result) {
-  // extract model parameters
-  RnnModelDims model_dims;
-  bool res = ExtractAndCheckRnnForward(
-      rnn_desc, input_desc, input_data, input_h_desc, input_h_data,
-      input_c_desc, input_c_data, params, output_desc, output_data,
-      output_h_desc, output_h_data, output_c_desc, output_c_data, &model_dims);
-  if (!res) {
-    LOG(ERROR) << "Invalid parameters for RNN Model";
-    return false;
-  }
+  SE_ASSIGN_OR_RETURN(
+      RnnModelDims model_dims,
+      ExtractAndCheckRnnForward(rnn_desc, input_desc, input_data, input_h_desc,
+                                input_h_data, input_c_desc, input_c_data,
+                                params, output_desc, output_data, output_h_desc,
+                                output_h_data, output_c_desc, output_c_data));
 
   auto cudnn = cudnn_->GetHandle(parent_, stream);
 
-  // check params size
-  if (!CheckRNNParameterSize(cudnn, rnn_desc, input_desc)) {
-    LOG(ERROR) << "Invalid parameters";
-    return false;
-  }
-
-  // create the workspace
-  DeviceMemory<uint8> workspace;
-  if (!CreateRnnWorkspace(stream, cudnn, rnn_desc, input_desc,
-                          workspace_allocator, &workspace)) {
-    LOG(ERROR) << "Unable to create rnn workspace";
-    return false;
-  }
+  SE_RETURN_IF_ERROR(CheckRNNParameterSize(cudnn, rnn_desc, input_desc));
+  SE_ASSIGN_OR_RETURN(DeviceMemory<uint8> workspace,
+                      CreateRnnWorkspace(stream, cudnn, rnn_desc, input_desc,
+                                         workspace_allocator));
 
   std::unique_ptr<CUDATimer, TimerDeleter> timer;
   const bool is_profiling = output_profile_result != nullptr;
   if (is_profiling) {
     timer.reset(new CUDATimer(parent_));
-    if (!timer->Init()) {
-      return false;
-    }
     // The start and stop of the timer should be as close to the Cudnn call as
     // possible. It is still possible for other threads to issue workload on
     // to this stream. So it could take multiple profiling measurements.
-    if (!timer->Start(AsCUDAStream(stream))) {
-      return false;
+    if (!timer->Init() || !timer->Start(AsCUDAStream(stream))) {
+      return port::Status(port::error::INTERNAL, "Failed to start timer");
     }
   }
-  // make the backward data call
-  cudnnStatus_t status = cudnnRNNBackwardData(
+
+  RETURN_IF_CUDNN_ERROR(cudnnRNNBackwardData(
       /*handle=*/cudnn.handle(), /*rnnDesc=*/rnn_desc.handle(),
       /*seqLength=*/model_dims.seq_length, /*yDesc=*/output_desc.handles(),
       /*y=*/output_data.opaque(), /*dyDesc=*/output_desc.handles(),
-      /*dy=*/output_backprop_data.opaque(), /*dhyDesc=*/output_h_desc.handle(),
+      /*dy=*/output_backprop_data.opaque(),
+      /*dhyDesc=*/output_h_desc.handle(),
       /*dhy=*/output_h_backprop_data.opaque(),
       /*dcyDesc=*/output_c_desc.handle(),
       /*dcy=*/output_c_backprop_data.opaque(),
@@ -1705,24 +1547,17 @@ bool CudnnSupport::DoRnnBackwardImpl(
       /*dhxDesc=*/input_h_desc.handle(),
       /*dhx=*/input_h_backprop_data->opaque(),
       /*dcxDesc=*/input_c_desc.handle(),
-      /*dcx=*/input_c_backprop_data->opaque(), /*workspace=*/workspace.opaque(),
+      /*dcx=*/input_c_backprop_data->opaque(),
+      /*workspace=*/workspace.opaque(),
       /*workSpaceSizeInBytes=*/workspace.size(),
       /*reserveSpace=*/reserve_space_data->opaque(),
-      /*reserveSpaceSizeInBytes=*/reserve_space_data->size());
-
-  if (status != CUDNN_STATUS_SUCCESS) {
-    if (is_profiling) {
-      timer->Stop(AsCUDAStream(stream));
-    }
-    LOG(ERROR) << "Failed to call cudnnRNNBackwardData: " << ToString(status);
-    return false;
-  }
+      /*reserveSpaceSizeInBytes=*/reserve_space_data->size()));
 
   if (params_backprop_data != nullptr) {
     // Clear the dw to zeros.
     stream->ThenMemZero(params_backprop_data, params_backprop_data->size());
     // make the backward weight call
-    status = cudnnRNNBackwardWeights(
+    RETURN_IF_CUDNN_ERROR(cudnnRNNBackwardWeights(
         /*handle=*/cudnn.handle(), /*rnnDesc=*/rnn_desc.handle(),
         /*seqLength=*/model_dims.seq_length, /*xDesc=*/input_desc.handles(),
         /*x=*/input_data.opaque(), /*hxDesc=*/input_h_desc.handle(),
@@ -1732,19 +1567,12 @@ bool CudnnSupport::DoRnnBackwardImpl(
         /*dwDesc=*/rnn_desc.params_handle(),
         /*dw=*/params_backprop_data->opaque(),
         /*reserveSpace=*/reserve_space_data->opaque(),
-        /*reserveSpaceSizeInBytes=*/reserve_space_data->size());
-    if (status != CUDNN_STATUS_SUCCESS) {
-      if (is_profiling) {
-        timer->Stop(AsCUDAStream(stream));
-      }
-      LOG(ERROR) << "Failed to call cudnnRNNBackwardWeights: "
-                 << ToString(status);
-      return false;
-    }
+        /*reserveSpaceSizeInBytes=*/reserve_space_data->size()));
   }
+
   if (is_profiling) {
     if (!timer->Stop(AsCUDAStream(stream))) {
-      return false;
+      return port::Status(port::error::INTERNAL, "Failed to stop timer");
     }
     auto algo_desc = rnn_desc.algorithm_config().algorithm();
     output_profile_result->set_algorithm(algo_desc);
@@ -1752,7 +1580,7 @@ bool CudnnSupport::DoRnnBackwardImpl(
         timer->GetElapsedMilliseconds());
   }
 
-  return true;
+  return port::Status::OK();
 }
 
 port::StatusOr<std::unique_ptr<dnn::RnnDescriptor>>
@@ -1765,46 +1593,37 @@ CudnnSupport::createRnnDescriptor(
   // Setting up a cudnnRNNDescriptor requires a cuDNN handle, but because it's
   // not enqueueing anything into a stream, we pass in the null stream.
   auto cudnn = cudnn_->GetHandle(parent_, /*stream=*/nullptr);
-  std::unique_ptr<CudnnRnnDescriptor> rnn_desc(new CudnnRnnDescriptor(
-      cudnn, num_layers, hidden_size, input_size, batch_size,
-      ToCudnnRnnInputMode(input_mode), ToCudnnRnnDirectionMode(direction_mode),
-      ToCudnnRnnMode(rnn_mode), ToCudnnDataType(data_type),
-      GetRnnComputeType(data_type), algorithm_config, dropout, seed,
-      state_allocator));
-  if (!rnn_desc->ok()) {
-    return rnn_desc->Status();
-  }
-  return port::StatusOr<std::unique_ptr<dnn::RnnDescriptor>>(
-      std::move(rnn_desc));
+  SE_ASSIGN_OR_RETURN(
+      CudnnRnnDescriptor rnn_desc,
+      CudnnRnnDescriptor::Create(
+          cudnn, num_layers, hidden_size, input_size, batch_size,
+          ToCudnnRnnInputMode(input_mode),
+          ToCudnnRnnDirectionMode(direction_mode), ToCudnnRnnMode(rnn_mode),
+          ToCudnnDataType(data_type), GetRnnComputeType(data_type),
+          algorithm_config, dropout, seed, state_allocator));
+  return std::unique_ptr<dnn::RnnDescriptor>(
+      new CudnnRnnDescriptor(std::move(rnn_desc)));
 }
 
 port::StatusOr<std::unique_ptr<dnn::RnnSequenceTensorDescriptor>>
 CudnnSupport::createRnnSequenceTensorDescriptor(int seq_length, int batch_size,
                                                 int data_size,
                                                 dnn::DataType data_type) {
-  std::unique_ptr<CudnnRnnSequenceTensorDescriptor> seq_desc(
-      new CudnnRnnSequenceTensorDescriptor(parent_, seq_length, batch_size,
-                                           data_size,
-                                           ToCudnnDataType(data_type)));
-  if (!seq_desc->ok()) {
-    return seq_desc->Status();
-  }
-  return port::StatusOr<std::unique_ptr<dnn::RnnSequenceTensorDescriptor>>(
-      std::move(seq_desc));
+  SE_ASSIGN_OR_RETURN(CudnnRnnSequenceTensorDescriptor descriptor,
+                      CudnnRnnSequenceTensorDescriptor::Create(
+                          parent_, seq_length, batch_size, data_size,
+                          ToCudnnDataType(data_type)));
+  return std::unique_ptr<dnn::RnnSequenceTensorDescriptor>(
+      new CudnnRnnSequenceTensorDescriptor(std::move(descriptor)));
 }
 
 port::StatusOr<std::unique_ptr<dnn::RnnStateTensorDescriptor>>
 CudnnSupport::createRnnStateTensorDescriptor(int num_layer, int batch_size,
                                              int data_size,
                                              dnn::DataType data_type) {
-  std::unique_ptr<CudnnRnnStateTensorDescriptor> state_desc(
+  return std::unique_ptr<dnn::RnnStateTensorDescriptor>(
       new CudnnRnnStateTensorDescriptor(parent_, num_layer, batch_size,
                                         data_size, ToCudnnDataType(data_type)));
-  if (!state_desc->ok()) {
-    return state_desc->Status();
-  }
-  return port::StatusOr<std::unique_ptr<dnn::RnnStateTensorDescriptor>>(
-      std::move(state_desc));
 }
 
 bool CudnnSupport::DoRnnForward(
@@ -1840,12 +1659,14 @@ bool CudnnSupport::DoRnnForward(
   const CudnnRnnStateTensorDescriptor& cudnn_output_c_desc =
       static_cast<const CudnnRnnStateTensorDescriptor&>(output_c_desc);
 
-  return DoRnnForwardImpl<Eigen::half>(
-      stream, cudnn_rnn_desc, cudnn_input_desc, input_data, cudnn_input_h_desc,
-      input_h_data, cudnn_input_c_desc, input_c_data, params, cudnn_output_desc,
-      output_data, cudnn_output_h_desc, output_h_data, cudnn_output_c_desc,
-      output_c_data, is_training, reserve_space_allocator, workspace_allocator,
-      output_profile_result);
+  return IsStatusOk(
+      DoRnnForwardImpl<Eigen::half>(
+          stream, cudnn_rnn_desc, cudnn_input_desc, input_data,
+          cudnn_input_h_desc, input_h_data, cudnn_input_c_desc, input_c_data,
+          params, cudnn_output_desc, output_data, cudnn_output_h_desc,
+          output_h_data, cudnn_output_c_desc, output_c_data, is_training,
+          reserve_space_allocator, workspace_allocator, output_profile_result),
+      /*report_error=*/!output_profile_result);
 }
 
 bool CudnnSupport::DoRnnForward(
@@ -1880,12 +1701,14 @@ bool CudnnSupport::DoRnnForward(
   const CudnnRnnStateTensorDescriptor& cudnn_output_c_desc =
       static_cast<const CudnnRnnStateTensorDescriptor&>(output_c_desc);
 
-  return DoRnnForwardImpl<float>(
-      stream, cudnn_rnn_desc, cudnn_input_desc, input_data, cudnn_input_h_desc,
-      input_h_data, cudnn_input_c_desc, input_c_data, params, cudnn_output_desc,
-      output_data, cudnn_output_h_desc, output_h_data, cudnn_output_c_desc,
-      output_c_data, is_training, reserve_space_allocator, workspace_allocator,
-      output_profile_result);
+  return IsStatusOk(
+      DoRnnForwardImpl<float>(
+          stream, cudnn_rnn_desc, cudnn_input_desc, input_data,
+          cudnn_input_h_desc, input_h_data, cudnn_input_c_desc, input_c_data,
+          params, cudnn_output_desc, output_data, cudnn_output_h_desc,
+          output_h_data, cudnn_output_c_desc, output_c_data, is_training,
+          reserve_space_allocator, workspace_allocator, output_profile_result),
+      /*report_error=*/!output_profile_result);
 }
 
 bool CudnnSupport::DoRnnForward(
@@ -1921,12 +1744,14 @@ bool CudnnSupport::DoRnnForward(
   const CudnnRnnStateTensorDescriptor& cudnn_output_c_desc =
       static_cast<const CudnnRnnStateTensorDescriptor&>(output_c_desc);
 
-  return DoRnnForwardImpl<double>(
-      stream, cudnn_rnn_desc, cudnn_input_desc, input_data, cudnn_input_h_desc,
-      input_h_data, cudnn_input_c_desc, input_c_data, params, cudnn_output_desc,
-      output_data, cudnn_output_h_desc, output_h_data, cudnn_output_c_desc,
-      output_c_data, is_training, reserve_space_allocator, workspace_allocator,
-      output_profile_result);
+  return IsStatusOk(
+      DoRnnForwardImpl<double>(
+          stream, cudnn_rnn_desc, cudnn_input_desc, input_data,
+          cudnn_input_h_desc, input_h_data, cudnn_input_c_desc, input_c_data,
+          params, cudnn_output_desc, output_data, cudnn_output_h_desc,
+          output_h_data, cudnn_output_c_desc, output_c_data, is_training,
+          reserve_space_allocator, workspace_allocator, output_profile_result),
+      /*report_error=*/!output_profile_result);
 }
 
 bool CudnnSupport::DoRnnBackward(
@@ -1969,14 +1794,17 @@ bool CudnnSupport::DoRnnBackward(
   const CudnnRnnStateTensorDescriptor& cudnn_output_c_desc =
       static_cast<const CudnnRnnStateTensorDescriptor&>(output_c_desc);
 
-  return DoRnnBackwardImpl<Eigen::half>(
-      stream, cudnn_rnn_desc, cudnn_input_desc, input_data, cudnn_input_h_desc,
-      input_h_data, cudnn_input_c_desc, input_c_data, params, cudnn_output_desc,
-      output_data, cudnn_output_h_desc, output_h_data, cudnn_output_c_desc,
-      output_c_data, output_backprop_data, output_h_backprop_data,
-      output_c_backprop_data, input_backprop_data, input_h_backprop_data,
-      input_c_backprop_data, params_backprop_data, reserve_space_data,
-      workspace_allocator, output_profile_result);
+  return IsStatusOk(
+      DoRnnBackwardImpl<Eigen::half>(
+          stream, cudnn_rnn_desc, cudnn_input_desc, input_data,
+          cudnn_input_h_desc, input_h_data, cudnn_input_c_desc, input_c_data,
+          params, cudnn_output_desc, output_data, cudnn_output_h_desc,
+          output_h_data, cudnn_output_c_desc, output_c_data,
+          output_backprop_data, output_h_backprop_data, output_c_backprop_data,
+          input_backprop_data, input_h_backprop_data, input_c_backprop_data,
+          params_backprop_data, reserve_space_data, workspace_allocator,
+          output_profile_result),
+      /*report_error=*/!output_profile_result);
 }
 
 bool CudnnSupport::DoRnnBackward(
@@ -2018,14 +1846,17 @@ bool CudnnSupport::DoRnnBackward(
   const CudnnRnnStateTensorDescriptor& cudnn_output_c_desc =
       static_cast<const CudnnRnnStateTensorDescriptor&>(output_c_desc);
 
-  return DoRnnBackwardImpl<float>(
-      stream, cudnn_rnn_desc, cudnn_input_desc, input_data, cudnn_input_h_desc,
-      input_h_data, cudnn_input_c_desc, input_c_data, params, cudnn_output_desc,
-      output_data, cudnn_output_h_desc, output_h_data, cudnn_output_c_desc,
-      output_c_data, output_backprop_data, output_h_backprop_data,
-      output_c_backprop_data, input_backprop_data, input_h_backprop_data,
-      input_c_backprop_data, params_backprop_data, reserve_space_data,
-      workspace_allocator, output_profile_result);
+  return IsStatusOk(
+      DoRnnBackwardImpl<float>(
+          stream, cudnn_rnn_desc, cudnn_input_desc, input_data,
+          cudnn_input_h_desc, input_h_data, cudnn_input_c_desc, input_c_data,
+          params, cudnn_output_desc, output_data, cudnn_output_h_desc,
+          output_h_data, cudnn_output_c_desc, output_c_data,
+          output_backprop_data, output_h_backprop_data, output_c_backprop_data,
+          input_backprop_data, input_h_backprop_data, input_c_backprop_data,
+          params_backprop_data, reserve_space_data, workspace_allocator,
+          output_profile_result),
+      /*report_error=*/!output_profile_result);
 }
 
 bool CudnnSupport::DoRnnBackward(
@@ -2068,121 +1899,358 @@ bool CudnnSupport::DoRnnBackward(
   const CudnnRnnStateTensorDescriptor& cudnn_output_c_desc =
       static_cast<const CudnnRnnStateTensorDescriptor&>(output_c_desc);
 
-  return DoRnnBackwardImpl<double>(
-      stream, cudnn_rnn_desc, cudnn_input_desc, input_data, cudnn_input_h_desc,
-      input_h_data, cudnn_input_c_desc, input_c_data, params, cudnn_output_desc,
-      output_data, cudnn_output_h_desc, output_h_data, cudnn_output_c_desc,
-      output_c_data, output_backprop_data, output_h_backprop_data,
-      output_c_backprop_data, input_backprop_data, input_h_backprop_data,
-      input_c_backprop_data, params_backprop_data, reserve_space_data,
-      workspace_allocator, output_profile_result);
+  return IsStatusOk(
+      DoRnnBackwardImpl<double>(
+          stream, cudnn_rnn_desc, cudnn_input_desc, input_data,
+          cudnn_input_h_desc, input_h_data, cudnn_input_c_desc, input_c_data,
+          params, cudnn_output_desc, output_data, cudnn_output_h_desc,
+          output_h_data, cudnn_output_c_desc, output_c_data,
+          output_backprop_data, output_h_backprop_data, output_c_backprop_data,
+          input_backprop_data, input_h_backprop_data, input_c_backprop_data,
+          params_backprop_data, reserve_space_data, workspace_allocator,
+          output_profile_result),
+      /*report_error=*/!output_profile_result);
 }
 
-namespace {
+namespace {
+
+// TODO(csigg): Merge a lot of duplicate code below for forward, backward data,
+// and backward filter.
+
+port::StatusOr<cudnnConvolutionFwdAlgo_t> GetCudnnConvolutionForwardAlgo(
+    const CudnnHandle& cudnn, const ScopedTensorDescriptor& input_nd,
+    const ScopedFilterDescriptor& filter,
+    const ScopedConvolutionDescriptor& conv,
+    const ScopedTensorDescriptor& output_nd, bool specify_workspace_limit,
+    size_t memory_limit_bytes) {
+  cudnnConvolutionFwdPreference_t preference =
+      specify_workspace_limit ? CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT
+                              : CUDNN_CONVOLUTION_FWD_NO_WORKSPACE;
+  cudnnConvolutionFwdAlgo_t algo_to_use;
+  RETURN_IF_CUDNN_ERROR(cudnnGetConvolutionForwardAlgorithm(
+      cudnn.handle(), input_nd.handle(), filter.handle(), conv.handle(),
+      output_nd.handle(), preference, memory_limit_bytes, &algo_to_use));
+  return algo_to_use;
+}
+
+port::StatusOr<cudnnConvolutionBwdDataAlgo_t>
+GetCudnnConvolutionBackwardDataAlgo(const CudnnHandle& cudnn,
+                                    const ScopedTensorDescriptor& input_nd,
+                                    const ScopedFilterDescriptor& filter,
+                                    const ScopedConvolutionDescriptor& conv,
+                                    const ScopedTensorDescriptor& output_nd,
+                                    bool specify_workspace_limit,
+                                    size_t memory_limit_bytes) {
+  cudnnConvolutionBwdDataPreference_t preference =
+      specify_workspace_limit
+          ? CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT
+          : CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE;
+  cudnnConvolutionBwdDataAlgo_t algo_to_use;
+  RETURN_IF_CUDNN_ERROR(cudnnGetConvolutionBackwardDataAlgorithm(
+      cudnn.handle(), filter.handle(), output_nd.handle(), conv.handle(),
+      input_nd.handle(), preference, memory_limit_bytes, &algo_to_use));
+  return algo_to_use;
+}
+
+port::StatusOr<cudnnConvolutionBwdFilterAlgo_t>
+GetCudnnConvolutionBackwardFilterAlgo(const CudnnHandle& cudnn,
+                                      const ScopedTensorDescriptor& input_nd,
+                                      const ScopedFilterDescriptor& filter,
+                                      const ScopedConvolutionDescriptor& conv,
+                                      const ScopedTensorDescriptor& output_nd,
+                                      bool specify_workspace_limit,
+                                      size_t memory_limit_bytes) {
+  cudnnConvolutionBwdFilterPreference_t preference =
+      specify_workspace_limit
+          ? CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT
+          : CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE;
+  cudnnConvolutionBwdFilterAlgo_t algo_to_use;
+  RETURN_IF_CUDNN_ERROR(cudnnGetConvolutionBackwardFilterAlgorithm(
+      cudnn.handle(), input_nd.handle(), output_nd.handle(), conv.handle(),
+      filter.handle(), preference, memory_limit_bytes, &algo_to_use));
+  return algo_to_use;
+}
+
+port::StatusOr<DeviceMemory<uint8>> AllocateCudnnConvolutionForwardWorkspace(
+    Stream* stream, const CudnnHandle& cudnn,
+    const dnn::AlgorithmDesc& algorithm_desc,
+    const ScopedTensorDescriptor& input_nd,
+    const ScopedFilterDescriptor& filter,
+    const ScopedConvolutionDescriptor& conv,
+    const ScopedTensorDescriptor& output_nd,
+    ScratchAllocator* scratch_allocator) {
+  // TODO(csigg): This has side effects on the convolution descriptor. It is
+  // functionally correct because the convolution is run with the algorithm of
+  // the last call to this function, but should be fixed anyway.
+  conv.set_use_tensor_op_math(algorithm_desc.tensor_ops_enabled());
+
+  // Query the size of the workspace and allocate it.
+  size_t size_in_bytes;
+  RETURN_IF_CUDNN_ERROR(cudnnGetConvolutionForwardWorkspaceSize(
+      cudnn.handle(),
+      /*xDesc=*/input_nd.handle(),
+      /*wDesc=*/filter.handle(), /*convDesc=*/conv.handle(),
+      /*yDesc=*/output_nd.handle(), /*algo=*/ToConvForwardAlgo(algorithm_desc),
+      /*sizeInBytes=*/&size_in_bytes));
+  int64 size_in_bytes_int64 = size_in_bytes;
+
+  if (TF_PREDICT_FALSE(size_in_bytes_int64 < 0)) {
+    return port::Status(
+        port::error::INTERNAL,
+        "cudnnGetConvolutionForwardWorkspaceSize() returned "
+        "negative sizeInBytes value. This could be a cudnn bug.");
+  }
+
+  if (size_in_bytes_int64 == 0) {
+    return DeviceMemory<uint8>();
+  }
+
+  if (TF_PREDICT_FALSE(!scratch_allocator)) {
+    return port::Status(port::error::INVALID_ARGUMENT,
+                        "No scratch allocator provided");
+  }
+
+  return scratch_allocator->AllocateBytes(stream, size_in_bytes);
+}
+
+port::StatusOr<DeviceMemory<uint8>>
+AllocateCudnnConvolutionBackwardDataWorkspace(
+    Stream* stream, const CudnnHandle& cudnn,
+    const dnn::AlgorithmDesc& algorithm_desc,
+    const ScopedTensorDescriptor& input_nd,
+    const ScopedFilterDescriptor& filter,
+    const ScopedConvolutionDescriptor& conv,
+    const ScopedTensorDescriptor& output_nd,
+    ScratchAllocator* scratch_allocator) {
+  // TODO(csigg): This has side effects on the convolution descriptor. It is
+  // functionally correct because the convolution is run with the algorithm of
+  // the last call to this function, but should be fixed anyway.
+  conv.set_use_tensor_op_math(algorithm_desc.tensor_ops_enabled());
+
+  // Query the size of the workspace and allocate it.
+  size_t size_in_bytes;
+  RETURN_IF_CUDNN_ERROR(cudnnGetConvolutionBackwardDataWorkspaceSize(
+      cudnn.handle(),
+      /*wDesc=*/filter.handle(),
+      /*dyDesc=*/output_nd.handle(),
+      /*convDesc=*/conv.handle(),
+      /*dxDesc=*/input_nd.handle(),
+      /*algo=*/ToConvBackwardDataAlgo(algorithm_desc),
+      /*sizeInBytes=*/&size_in_bytes));
+  int64 size_in_bytes_int64 = size_in_bytes;
+
+  if (TF_PREDICT_FALSE(size_in_bytes_int64 < 0)) {
+    return port::Status(
+        port::error::INTERNAL,
+        "cudnnGetConvolutionBackwardDataWorkspaceSize() returned "
+        "negative sizeInBytes value. This could be a cudnn bug.");
+  }
+
+  if (size_in_bytes_int64 == 0) {
+    return DeviceMemory<uint8>();
+  }
+
+  if (TF_PREDICT_FALSE(!scratch_allocator)) {
+    return port::Status(port::error::INVALID_ARGUMENT,
+                        "No scratch allocator provided");
+  }
+
+  return scratch_allocator->AllocateBytes(stream, size_in_bytes);
+}
+
+port::StatusOr<DeviceMemory<uint8>>
+AllocateCudnnConvolutionBackwardFilterWorkspace(
+    Stream* stream, const CudnnHandle& cudnn,
+    const dnn::AlgorithmDesc& algorithm_desc,
+    const ScopedTensorDescriptor& input_nd,
+    const ScopedFilterDescriptor& filter,
+    const ScopedConvolutionDescriptor& conv,
+    const ScopedTensorDescriptor& output_nd,
+    ScratchAllocator* scratch_allocator) {
+  // TODO(csigg): This has side effects on the convolution descriptor. It is
+  // functionally correct because the convolution is run with the algorithm of
+  // the last call to this function, but should be fixed anyway.
+  conv.set_use_tensor_op_math(algorithm_desc.tensor_ops_enabled());
+
+  // Query the size of the workspace and allocate it.
+  size_t size_in_bytes;
+  RETURN_IF_CUDNN_ERROR(cudnnGetConvolutionBackwardFilterWorkspaceSize(
+      cudnn.handle(),
+      /*xDesc=*/input_nd.handle(),
+      /*dyDesc=*/output_nd.handle(),
+      /*convDesc=*/conv.handle(),
+      /*gradDesc=*/filter.handle(),
+      /*algo=*/ToConvBackwardFilterAlgo(algorithm_desc),
+      /*sizeInBytes=*/&size_in_bytes));
+  int64 size_in_bytes_int64 = size_in_bytes;
+
+  if (TF_PREDICT_FALSE(size_in_bytes_int64 < 0)) {
+    return port::Status(
+        port::error::INTERNAL,
+        "cudnnGetConvolutionBackwardFilterWorkspaceSize() returned "
+        "negative sizeInBytes value. This could be a cudnn bug.");
+  }
+
+  if (size_in_bytes_int64 == 0) {
+    return DeviceMemory<uint8>();
+  }
+
+  if (TF_PREDICT_FALSE(!scratch_allocator)) {
+    return port::Status(port::error::INVALID_ARGUMENT,
+                        "No scratch allocator provided");
+  }
+
+  return scratch_allocator->AllocateBytes(stream, size_in_bytes);
+}
+
+port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionForwardAlgorithm(
+    Stream* stream, const CudnnHandle& cudnn,
+    const dnn::AlgorithmConfig& algorithm_config,
+    const ScopedTensorDescriptor& input_nd,
+    const ScopedFilterDescriptor& filter,
+    const ScopedConvolutionDescriptor& conv,
+    const ScopedTensorDescriptor& output_nd,
+    ScratchAllocator* scratch_allocator, DeviceMemory<uint8>* scratch) {
+  dnn::AlgorithmDesc algo_desc = algorithm_config.algorithm();
+  if (algorithm_config.algorithm().is_default()) {
+    // Pick fastest algorithm within memory limit according to cuDNN's
+    // heuristics.
+    bool specify_workspace_limit = scratch_allocator != nullptr;
+    auto memory_limit_bytes =
+        specify_workspace_limit
+            ? std::max(scratch_allocator->GetMemoryLimitInBytes(stream), 0ll)
+            : 0ll;
+    SE_ASSIGN_OR_RETURN(cudnnConvolutionFwdAlgo_t algo,
+                        GetCudnnConvolutionForwardAlgo(
+                            cudnn, input_nd, filter, conv, output_nd,
+                            specify_workspace_limit, memory_limit_bytes));
+    algo_desc = dnn::AlgorithmDesc(
+        algo, algorithm_config.algorithm().tensor_ops_enabled());
+  }
+
+  auto scratch_or = AllocateCudnnConvolutionForwardWorkspace(
+      stream, cudnn, algo_desc, input_nd, filter, conv, output_nd,
+      scratch_allocator);
+
+  if (scratch_or.ok()) {
+    *scratch = scratch_or.ValueOrDie();
+    return algo_desc;
+  }
+
+  // Failed to allocate workspace for the first algorithm, fall back to the
+  // no_scratch algorithm.
+  if (algorithm_config.algorithm_no_scratch().is_default()) {
+    return port::Status(
+        port::error::INVALID_ARGUMENT,
+        "The primary convolution algorithm failed memory allocation, "
+        "while a secondary algorithm is not provided.");
+  }
+
+  SE_ASSIGN_OR_RETURN(
+      *scratch, AllocateCudnnConvolutionForwardWorkspace(
+                    stream, cudnn, algorithm_config.algorithm_no_scratch(),
+                    input_nd, filter, conv, output_nd, scratch_allocator));
+  return algorithm_config.algorithm_no_scratch();
+}
+
+port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionBackwardDataAlgorithm(
+    Stream* stream, const CudnnHandle& cudnn,
+    const dnn::AlgorithmConfig& algorithm_config,
+    const ScopedTensorDescriptor& input_nd,
+    const ScopedFilterDescriptor& filter,
+    const ScopedConvolutionDescriptor& conv,
+    const ScopedTensorDescriptor& output_nd,
+    ScratchAllocator* scratch_allocator, DeviceMemory<uint8>* scratch) {
+  dnn::AlgorithmDesc algo_desc = algorithm_config.algorithm();
+  if (algorithm_config.algorithm().is_default()) {
+    // Pick fastest algorithm within memory limit according to cuDNN's
+    // heuristics.
+    bool specify_workspace_limit = scratch_allocator != nullptr;
+    auto memory_limit_bytes =
+        specify_workspace_limit
+            ? std::max(scratch_allocator->GetMemoryLimitInBytes(stream), 0ll)
+            : 0ll;
+    SE_ASSIGN_OR_RETURN(cudnnConvolutionBwdDataAlgo_t algo,
+                        GetCudnnConvolutionBackwardDataAlgo(
+                            cudnn, input_nd, filter, conv, output_nd,
+                            specify_workspace_limit, memory_limit_bytes));
+    algo_desc = dnn::AlgorithmDesc(
+        algo, algorithm_config.algorithm().tensor_ops_enabled());
+  }
+
+  auto scratch_or = AllocateCudnnConvolutionBackwardDataWorkspace(
+      stream, cudnn, algo_desc, input_nd, filter, conv, output_nd,
+      scratch_allocator);
+
+  if (scratch_or.ok()) {
+    *scratch = scratch_or.ValueOrDie();
+    return algo_desc;
+  }
 
-inline cudnnConvolutionFwdAlgo_t GetCudnnConvolutionForwardAlgo(
-    const CudnnHandle& cudnn, const ScopedTensorDescriptor& input_nd,
-    const ScopedFilterDescriptor& filter,
-    const ScopedConvolutionDescriptor& conv,
-    const ScopedTensorDescriptor& output_nd, bool specify_workspace_limit,
-    size_t memory_limit_bytes) {
-  cudnnConvolutionFwdPreference_t preference =
-      specify_workspace_limit ? CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT
-                              : CUDNN_CONVOLUTION_FWD_NO_WORKSPACE;
+  // Failed to allocate workspace for the first algorithm, fall back to the
+  // no_scratch algorithm.
+  if (algorithm_config.algorithm_no_scratch().is_default()) {
+    return port::Status(
+        port::error::INVALID_ARGUMENT,
+        "The primary convolution algorithm failed memory allocation, "
+        "while a secondary algorithm is not provided.");
+  }
 
-  cudnnConvolutionFwdAlgo_t algo_to_use;
-  auto status = cudnnGetConvolutionForwardAlgorithm(
-      cudnn.handle(), input_nd.handle(), filter.handle(), conv.handle(),
-      output_nd.handle(), preference, memory_limit_bytes, &algo_to_use);
-  CHECK_EQ(status, CUDNN_STATUS_SUCCESS)
-      << "Unable to find a suitable algorithm for doing forward convolution";
-  return algo_to_use;
+  SE_ASSIGN_OR_RETURN(
+      *scratch, AllocateCudnnConvolutionBackwardDataWorkspace(
+                    stream, cudnn, algorithm_config.algorithm_no_scratch(),
+                    input_nd, filter, conv, output_nd, scratch_allocator));
+  return algorithm_config.algorithm_no_scratch();
 }
 
-dnn::AlgorithmDesc GetCudnnConvolutionForwardAlgorithm(
+port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionBackwardFilterAlgorithm(
     Stream* stream, const CudnnHandle& cudnn,
-    const dnn::AlgorithmConfig& algorithm_config, bool is_profiling,
+    const dnn::AlgorithmConfig& algorithm_config,
     const ScopedTensorDescriptor& input_nd,
     const ScopedFilterDescriptor& filter,
     const ScopedConvolutionDescriptor& conv,
     const ScopedTensorDescriptor& output_nd,
     ScratchAllocator* scratch_allocator, DeviceMemory<uint8>* scratch) {
-  cudnnConvolutionFwdAlgo_t algo;
-  bool use_tensor_ops;
+  dnn::AlgorithmDesc algo_desc = algorithm_config.algorithm();
   if (algorithm_config.algorithm().is_default()) {
-    use_tensor_ops = true;
-
+    // Pick fastest algorithm within memory limit according to cuDNN's
+    // heuristics.
+    bool specify_workspace_limit = scratch_allocator != nullptr;
     auto memory_limit_bytes =
-        scratch_allocator == nullptr
-            ? 0
-            : scratch_allocator->GetMemoryLimitInBytes(stream);
-    if (memory_limit_bytes < 0) {
-      memory_limit_bytes = 0;
-    }
-
-    algo = GetCudnnConvolutionForwardAlgo(
-        cudnn, input_nd, filter, conv, output_nd,
-        /*specify_workspace_limit=*/scratch_allocator != nullptr,
-        memory_limit_bytes);
-  } else {
-    use_tensor_ops = algorithm_config.algorithm().tensor_ops_enabled();
-    algo = ToConvForwardAlgo(algorithm_config.algorithm());
+        specify_workspace_limit
+            ? std::max(scratch_allocator->GetMemoryLimitInBytes(stream), 0ll)
+            : 0ll;
+    SE_ASSIGN_OR_RETURN(cudnnConvolutionBwdFilterAlgo_t algo,
+                        GetCudnnConvolutionBackwardFilterAlgo(
+                            cudnn, input_nd, filter, conv, output_nd,
+                            specify_workspace_limit, memory_limit_bytes));
+    algo_desc = dnn::AlgorithmDesc(
+        algo, algorithm_config.algorithm().tensor_ops_enabled());
   }
-  size_t size_in_bytes;
-  auto status = cudnnGetConvolutionForwardWorkspaceSize(
-      cudnn.handle(),
-      /*xDesc=*/input_nd.handle(),
-      /*wDesc=*/filter.handle(), /*convDesc=*/conv.handle(),
-      /*yDesc=*/output_nd.handle(), /*algo=*/algo,
-      /*sizeInBytes=*/&size_in_bytes);
-  int64 size_in_bytes_int64 = size_in_bytes;
-  if (TF_PREDICT_FALSE(status != CUDNN_STATUS_SUCCESS)) {
-    CHECK(is_profiling) << "Cannot query the size of workspace needed "
-                           "for the specified algorithm: "
-                        << algorithm_config.algorithm().algo_id() << " "
-                        << ToString(status);
-    // Silently return when we are profiling.
-    return dnn::AlgorithmDesc();
+
+  auto scratch_or = AllocateCudnnConvolutionBackwardFilterWorkspace(
+      stream, cudnn, algo_desc, input_nd, filter, conv, output_nd,
+      scratch_allocator);
+
+  if (scratch_or.ok()) {
+    *scratch = scratch_or.ValueOrDie();
+    return algo_desc;
   }
-  if (TF_PREDICT_FALSE(size_in_bytes_int64 < 0)) {
-    LOG(WARNING) << "cudnnGetConvolutionForwardWorkspaceSize() returned "
-                    "negative sizeInBytes value. This could be a cudnn bug.";
-    if (TF_PREDICT_TRUE(is_profiling)) {
-      return dnn::AlgorithmDesc();
-    }
-  } else if (size_in_bytes_int64 > 0) {
-    port::StatusOr<DeviceMemory<uint8>> allocated;
-    if (TF_PREDICT_TRUE(scratch_allocator)) {
-      allocated = scratch_allocator->AllocateBytes(stream, size_in_bytes);
-      if (TF_PREDICT_TRUE(allocated.ok())) {
-        *scratch = allocated.ValueOrDie();
-      } else {
-        if (TF_PREDICT_TRUE(is_profiling)) {
-          // Silently return when we are profiling.
-          return dnn::AlgorithmDesc();
-        }
-        LOG(WARNING) << allocated.status().error_message();
-        // For the int8 case, we fail at this point since the no_scratch
-        // algorithm should be set to dnn::kDefaultAlgorithm.
-        CHECK(!algorithm_config.algorithm_no_scratch().is_default())
-            << "The primary convolution algorithm failed memory allocation, "
-               "while a secondary algorithm is not provided.";
-      }
-    }
-    if (TF_PREDICT_FALSE(!allocated.ok())) {
-      if (algorithm_config.algorithm_no_scratch().is_default()) {
-        use_tensor_ops = true;
-        algo = GetCudnnConvolutionForwardAlgo(
-            cudnn, input_nd, filter, conv, output_nd,
-            /*specify_workspace_limit=*/false, 0);
-      } else {
-        use_tensor_ops = algorithm_config.algorithm().tensor_ops_enabled();
-        algo = ToConvForwardAlgo(algorithm_config.algorithm_no_scratch());
-      }
-    }
+
+  // Failed to allocate workspace for the first algorithm, fall back to the
+  // no_scratch algorithm.
+  if (algorithm_config.algorithm_no_scratch().is_default()) {
+    return port::Status(
+        port::error::INVALID_ARGUMENT,
+        "The primary convolution algorithm failed memory allocation, "
+        "while a secondary algorithm is not provided.");
   }
 
-  return dnn::AlgorithmDesc(algo, use_tensor_ops);
+  SE_ASSIGN_OR_RETURN(
+      *scratch, AllocateCudnnConvolutionBackwardFilterWorkspace(
+                    stream, cudnn, algorithm_config.algorithm_no_scratch(),
+                    input_nd, filter, conv, output_nd, scratch_allocator));
+  return algorithm_config.algorithm_no_scratch();
 }
 
 // A helper class to set env-vars and choose options for cudnn-related
@@ -2282,8 +2350,6 @@ struct RnnDoFP32ComputationFP16Input {
   static constexpr bool kDefaultFlag = false;
 };
 
-// A helper function to return the internal compute type for
-// RNNs in cudnn.
 cudnnDataType_t GetRnnComputeType(dnn::DataType data_type) {
   switch (data_type) {
     case dnn::DataType::kFloat:
@@ -2304,7 +2370,7 @@ cudnnDataType_t GetRnnComputeType(dnn::DataType data_type) {
 }  // namespace
 
 template <class T>
-bool CudnnSupport::DoConvolveImpl(
+port::Status CudnnSupport::DoConvolveImpl(
     Stream* stream, const dnn::BatchDescriptor& input_descriptor,
     const DeviceMemory<T>& input_data,
     const dnn::FilterDescriptor& filter_descriptor,
@@ -2334,177 +2400,48 @@ bool CudnnSupport::DoConvolveImpl(
                                                : static_cast<void*>(&fbeta);
 
   const bool is_profiling = output_profile_result != nullptr;
-  cudnnConvolutionFwdAlgo_t algo;
-  bool use_tensor_ops;
-  DeviceMemory<uint8> scratch;
-
-  // TODO(pauldonnelly): Replace the following code with a call to
-  //   GetCudnnConvolutionForwardAlgorithm().
-  if (algorithm_config.algorithm().is_default()) {
-    // With the default algorithm, use Cudnn's heuristics.
-    auto get_algorithm = [&](bool specify_limit) {
-      cudnnConvolutionFwdPreference_t preference =
-          specify_limit ? CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT
-                        : CUDNN_CONVOLUTION_FWD_NO_WORKSPACE;
-
-      auto memory_limit_bytes =
-          scratch_allocator == nullptr
-              ? 0
-              : scratch_allocator->GetMemoryLimitInBytes(stream);
-      if (memory_limit_bytes < 0) {
-        memory_limit_bytes = 0;
-      }
-
-      cudnnConvolutionFwdAlgo_t algo_to_use;
-      auto status = cudnnGetConvolutionForwardAlgorithm(
-          cudnn.handle(), input_nd.handle(), filter.handle(), conv.handle(),
-          output_nd.handle(),
-          /*preference=*/preference,
-          /*memoryLimitInBytes=*/memory_limit_bytes,
-          /*algo=*/&algo_to_use);
-      CHECK_EQ(status, CUDNN_STATUS_SUCCESS) << "Unable to find a suitable "
-                                                "algorithm for doing forward "
-                                                "convolution";
-      return algo_to_use;
-    };
 
-    algo = get_algorithm(/*specify_limit=*/scratch_allocator != nullptr);
-    use_tensor_ops = true;
-    if (scratch_allocator != nullptr) {
-      size_t size_in_bytes;
-      auto status = cudnnGetConvolutionForwardWorkspaceSize(
-          cudnn.handle(),
-          /*xDesc=*/input_nd.handle(),
-          /*wDesc=*/filter.handle(), /*convDesc=*/conv.handle(),
-          /*yDesc=*/output_nd.handle(), /*algo=*/algo,
-          /*sizeInBytes=*/&size_in_bytes);
-      int64 size_in_bytes_int64 = size_in_bytes;
-      if (status == CUDNN_STATUS_SUCCESS && size_in_bytes_int64 != 0) {
-        if (size_in_bytes_int64 > 0) {
-          auto allocated =
-              scratch_allocator->AllocateBytes(stream, size_in_bytes);
-          if (allocated.ok()) {
-            scratch = allocated.ValueOrDie();
-          } else {
-            LOG(WARNING) << allocated.status().error_message();
-          }
-        } else {
-          LOG(WARNING)
-              << "cudnnGetConvolutionForwardWorkspaceSize() returned "
-                 "negative sizeInBytes value. This could be a cudnn bug.";
-        }
-      }
-    }
+  DeviceMemory<uint8> scratch;
+  SE_ASSIGN_OR_RETURN(dnn::AlgorithmDesc algo_desc,
+                      GetCudnnConvolutionForwardAlgorithm(
+                          stream, cudnn, algorithm_config, input_nd, filter,
+                          conv, output_nd, scratch_allocator, &scratch));
 
-    // If we didn't allocate any scratch space (perhaps because of failed
-    // allocation), we force a switch back to the "no workspace" algorithm.
-    if (scratch == nullptr) {
-      algo = get_algorithm(/*specify_limit=*/false);
-    }
-  } else {
-    // An algorithm has been specified.
-    dnn::AlgorithmDesc algotype = algorithm_config.algorithm();
-    algo = ToConvForwardAlgo(algotype);
-    use_tensor_ops = algotype.tensor_ops_enabled();
-    conv.set_use_tensor_op_math(use_tensor_ops);
-    size_t size_in_bytes;
-    auto status = cudnnGetConvolutionForwardWorkspaceSize(
-        cudnn.handle(),
-        /*xDesc=*/input_nd.handle(),
-        /*wDesc=*/filter.handle(), /*convDesc=*/conv.handle(),
-        /*yDesc=*/output_nd.handle(), /*algo=*/algo,
-        /*sizeInBytes=*/&size_in_bytes);
-    if (status != CUDNN_STATUS_SUCCESS) {
-      if (is_profiling) {
-        // Silently return when we are profiling.
-        return false;
-      }
-      LOG(FATAL) << "Cannot query the size of workspace needed for the given "
-                    "algorithm: "
-                 << algorithm_config.algorithm().algo_id();
-    }
-    int64 size_in_bytes_int64 = size_in_bytes;
-    if (size_in_bytes_int64 > 0) {
-      if (scratch_allocator == nullptr) {
-        LOG(FATAL) << "An allocator must be specified when scratch memory is "
-                      "needed";
-      }
-      auto allocated = scratch_allocator->AllocateBytes(stream, size_in_bytes);
-      if (is_profiling && !allocated.ok()) {
-        // Silently return when we are profiling.
-        return false;
-      }
-      if (allocated.ok()) {
-        scratch = allocated.ValueOrDie();
-      } else {
-        LOG(WARNING) << allocated.status().error_message();
-      }
-      if (scratch == nullptr) {
-        CHECK(!algorithm_config.algorithm_no_scratch().is_default())
-            << "The primary convolution algorithm failed memory allocation, "
-               "while a secondary algorithm is not provided.";
-        dnn::AlgorithmDesc algotype = algorithm_config.algorithm_no_scratch();
-        algo = ToConvForwardAlgo(algotype);
-        use_tensor_ops = algotype.tensor_ops_enabled();
-        conv.set_use_tensor_op_math(use_tensor_ops);
-      }
-    } else if (size_in_bytes_int64 < 0) {
-      LOG(WARNING) << "cudnnGetConvolutionForwardWorkspaceSize() returned "
-                      "negative sizeInBytes value. This could be a cudnn bug.";
-    }
-  }
-  std::unique_ptr<CUDATimer> timer;
+  std::unique_ptr<CUDATimer, TimerDeleter> timer;
   if (is_profiling) {
     timer.reset(new CUDATimer(parent_));  // NOLINT
-    if (!timer->Init()) {
-      return false;
-    }
     // The start and stop of the timer should be as close to the Cudnn call as
     // possible. It is still possible for other threads to issue workload on
     // to this stream. So it could take multiple profiling measurements.
-    if (!timer->Start(AsCUDAStream(stream))) {
-      timer->Destroy();
-      return false;
+    if (!timer->Init() || !timer->Start(AsCUDAStream(stream))) {
+      return port::Status(port::error::INTERNAL, "Failed to start timer");
     }
   }
-  auto status = cudnnConvolutionForward(
+
+  RETURN_IF_CUDNN_ERROR(cudnnConvolutionForward(
       cudnn.handle(),
       /*alpha=*/alpha, /*srcDesc=*/input_nd.handle(),
       /*srcData=*/input_data.opaque(), /*filterDesc=*/filter.handle(),
       /*filterData=*/filter_data.opaque(), /*convDesc=*/conv.handle(),
-      /*algo=*/algo, /*workSpace=*/scratch.opaque(),
+      /*algo=*/ToConvForwardAlgo(algo_desc), /*workSpace=*/scratch.opaque(),
       /*workSpaceSizeInBytes=*/scratch.size(), /*beta=*/beta,
-      /*destDesc=*/output_nd.handle(), /*destData=*/output_data->opaque());
+      /*yDesc=*/output_nd.handle(), /*y=*/output_data->opaque()));
 
   if (is_profiling) {
     if (!timer->Stop(AsCUDAStream(stream))) {
-      timer->Destroy();
-      return false;
-    }
-    if (status == CUDNN_STATUS_SUCCESS) {
-      dnn::AlgorithmDesc algotype(algo, use_tensor_ops);
-      output_profile_result->set_algorithm(algotype);
-      output_profile_result->set_elapsed_time_in_ms(
-          timer->GetElapsedMilliseconds());
+      return port::Status(port::error::INTERNAL, "Failed to stop timer");
     }
-    timer->Destroy();
-  }
-
-  if (status != CUDNN_STATUS_SUCCESS) {
-    // Silently return when we are profiling.
-    if (!is_profiling) {
-      LOG(ERROR) << "failed to enqueue convolution on stream: "
-                 << ToString(status);
-    }
-    return false;
+    output_profile_result->set_algorithm(algo_desc);
+    output_profile_result->set_elapsed_time_in_ms(
+        timer->GetElapsedMilliseconds());
   }
 
-  return true;
+  return port::Status::OK();
 }
 
 template <typename Type, typename BiasType, typename ScaleType,
           int cudnn_data_type, int cudnn_compute_type>
-bool CudnnSupport::DoFusedConvolveImpl(
+port::Status CudnnSupport::DoFusedConvolveImpl(
     Stream* stream, const dnn::BatchDescriptor& conv_input_descriptor,
     const DeviceMemory<Type>& conv_input_data, ScaleType conv_input_scale,
     const dnn::FilterDescriptor& filter_descriptor,
@@ -2517,6 +2454,12 @@ bool CudnnSupport::DoFusedConvolveImpl(
     DeviceMemory<Type>* output_data, ScratchAllocator* scratch_allocator,
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
+  if (activation_mode != dnn::ActivationMode::kRelu) {
+    return port::Status(port::error::INVALID_ARGUMENT,
+                        "cudnnConvolutionBiasActivationForward() only supports "
+                        "Relu activation.");
+  }
+
   ScopedTensorDescriptor conv_input_nd(
       conv_input_descriptor, static_cast<cudnnDataType_t>(cudnn_data_type));
   ScopedTensorDescriptor output_nd(
@@ -2528,38 +2471,24 @@ bool CudnnSupport::DoFusedConvolveImpl(
       convolution_descriptor, static_cast<cudnnDataType_t>(cudnn_compute_type));
 
   auto cudnn = cudnn_->GetHandle(parent_, stream);
+
   const bool is_profiling = output_profile_result != nullptr;
-  DeviceMemory<uint8> scratch;
-  dnn::AlgorithmDesc algotype = GetCudnnConvolutionForwardAlgorithm(
-      stream, cudnn, algorithm_config, is_profiling, conv_input_nd, filter,
-      conv, output_nd, scratch_allocator, &scratch);
-  if (algotype.is_default()) {
-    if (!is_profiling) {
-      LOG(ERROR) << "No suitable algorithm found";
-    }
-    return false;
-  }
-  auto algo = static_cast<cudnnConvolutionFwdAlgo_t>(algotype.algo_id());
-  conv.set_use_tensor_op_math(algotype.tensor_ops_enabled());
 
-  if (activation_mode != dnn::ActivationMode::kRelu) {
-    LOG(ERROR) << "cudnnConvolutionBiasActivationForward() only supports Relu "
-                  "activation.";
-    return false;
-  }
+  DeviceMemory<uint8> scratch;
+  SE_ASSIGN_OR_RETURN(
+      dnn::AlgorithmDesc algo_desc,
+      GetCudnnConvolutionForwardAlgorithm(
+          stream, cudnn, algorithm_config, conv_input_nd, filter, conv,
+          output_nd, scratch_allocator, &scratch));
 
-  std::unique_ptr<CUDATimer> timer;
+  std::unique_ptr<CUDATimer, TimerDeleter> timer;
   if (is_profiling) {
     timer.reset(new CUDATimer(parent_));  // NOLINT
-    if (!timer->Init()) {
-      return false;
-    }
     // The start and stop of the timer should be as close to the Cudnn call as
     // possible. It is still possible for other threads to issue workload on
     // to this stream. So it could take multiple profiling measurements.
-    if (!timer->Start(AsCUDAStream(stream))) {
-      timer->Destroy();
-      return false;
+    if (!timer->Init() || !timer->Start(AsCUDAStream(stream))) {
+      return port::Status(port::error::INTERNAL, "Failed to start timer");
     }
   }
   // CUDNN v6 only supports CUDNN_NOT_PROPAGATE_NAN as the reluNanOpt for
@@ -2576,7 +2505,8 @@ bool CudnnSupport::DoFusedConvolveImpl(
           << "\nconv_input_data.opaque() = " << conv_input_data.opaque()
           << "\nfilter.handle() = " << filter.handle()
           << "\nfilter_data.opaque() = " << filter_data.opaque()
-          << "\nconv.handle() = " << conv.handle() << "\nalgo = " << algo
+          << "\nconv.handle() = " << conv.handle()
+          << "\nalgo = " << algo_desc.algo_id()
           << "\nscratch.opaque() = " << scratch.opaque()
           << "\nscratch.size() = " << scratch.size()
           << "\nside_input_scale = " << side_input_scale
@@ -2588,41 +2518,29 @@ bool CudnnSupport::DoFusedConvolveImpl(
           << "\noutput_nd.handle() = " << output_nd.handle()
           << "\noutput_data->opaque() = " << output_data->opaque();
 
-  auto status = cudnnConvolutionBiasActivationForward(
+  RETURN_IF_CUDNN_ERROR(cudnnConvolutionBiasActivationForward(
       cudnn.handle(),
       /*alpha1=*/&conv_input_scale,
       /*srcDesc=*/conv_input_nd.handle(), /*srcData=*/conv_input_data.opaque(),
       /*filterDesc=*/filter.handle(), /*filterData=*/filter_data.opaque(),
-      /*convDesc=*/conv.handle(), algo, /*workSpace=*/scratch.opaque(),
+      /*convDesc=*/conv.handle(), ToConvForwardAlgo(algo_desc),
+      /*workSpace=*/scratch.opaque(),
       /*workSpaceSizeInBytes=*/scratch.size(), /*alpha2=*/&side_input_scale,
       /*zDesc=*/output_nd.handle(), /*z=*/side_input_data_ptr,
       /*biasDesc=*/bias_nd.handle(), /*bias=*/biases.opaque(),
       /*activationDesc=*/activation_desc.handle(),
-      /*destDesc=*/output_nd.handle(), /*destData=*/output_data->opaque());
+      /*yDesc=*/output_nd.handle(), /*y=*/output_data->opaque()));
 
   if (is_profiling) {
     if (!timer->Stop(AsCUDAStream(stream))) {
-      timer->Destroy();
-      return false;
-    }
-    if (status == CUDNN_STATUS_SUCCESS) {
-      output_profile_result->set_algorithm(algotype);
-      output_profile_result->set_elapsed_time_in_ms(
-          timer->GetElapsedMilliseconds());
+      return port::Status(port::error::INTERNAL, "Failed to stop timer");
     }
-    timer->Destroy();
-  }
-
-  if (status != CUDNN_STATUS_SUCCESS) {
-    // Silently return when we are profiling.
-    if (!is_profiling) {
-      LOG(ERROR) << "failed to enqueue convolution on stream: "
-                 << ToString(status);
-    }
-    return false;
+    output_profile_result->set_algorithm(algo_desc);
+    output_profile_result->set_elapsed_time_in_ms(
+        timer->GetElapsedMilliseconds());
   }
 
-  return true;
+  return port::Status::OK();
 }
 
 bool CudnnSupport::GetConvolveAlgorithms(
@@ -2746,11 +2664,13 @@ bool CudnnSupport::DoBatchNormalizationForward(
     DeviceMemory<float>* saved_inv_var, bool is_training,
     std::function<const DeviceMemory<float>&()> var_to_inv_var,
     std::function<void()> inv_var_to_var) {
-  return DoBatchNormalizationForwardImpl<float, float>(
-      stream, dnn::DataType::kFloat, dnn::DataType::kFloat, x, scale, offset,
-      estimated_mean, estimated_variance, x_desc, scale_offset_desc, epsilon, y,
-      batch_mean, batch_var, saved_mean, saved_inv_var, is_training,
-      std::move(var_to_inv_var), std::move(inv_var_to_var));
+  return IsStatusOk(
+      DoBatchNormalizationForwardImpl<float, float>(
+          stream, dnn::DataType::kFloat, dnn::DataType::kFloat, x, scale,
+          offset, estimated_mean, estimated_variance, x_desc, scale_offset_desc,
+          epsilon, y, batch_mean, batch_var, saved_mean, saved_inv_var,
+          is_training, std::move(var_to_inv_var), std::move(inv_var_to_var)),
+      /*report_error=*/true);
 }
 
 bool CudnnSupport::DoBatchNormalizationForward(
@@ -2765,15 +2685,17 @@ bool CudnnSupport::DoBatchNormalizationForward(
     DeviceMemory<float>* saved_inv_var, bool is_training,
     std::function<const DeviceMemory<float>&()> var_to_inv_var,
     std::function<void()> inv_var_to_var) {
-  return DoBatchNormalizationForwardImpl<Eigen::half, float>(
-      stream, dnn::DataType::kHalf, dnn::DataType::kFloat, x, scale, offset,
-      estimated_mean, estimated_variance, x_desc, scale_offset_desc, epsilon, y,
-      batch_mean, batch_var, saved_mean, saved_inv_var, is_training,
-      std::move(var_to_inv_var), std::move(inv_var_to_var));
+  return IsStatusOk(
+      DoBatchNormalizationForwardImpl<Eigen::half, float>(
+          stream, dnn::DataType::kHalf, dnn::DataType::kFloat, x, scale, offset,
+          estimated_mean, estimated_variance, x_desc, scale_offset_desc,
+          epsilon, y, batch_mean, batch_var, saved_mean, saved_inv_var,
+          is_training, std::move(var_to_inv_var), std::move(inv_var_to_var)),
+      /*report_error=*/true);
 }
 
 template <class T, class U>
-bool CudnnSupport::DoBatchNormalizationForwardImpl(
+port::Status CudnnSupport::DoBatchNormalizationForwardImpl(
     Stream* stream, dnn::DataType input_data_type,
     dnn::DataType scale_data_type, const DeviceMemory<T>& x,
     const DeviceMemory<U>& scale, const DeviceMemory<U>& offset,
@@ -2798,7 +2720,6 @@ bool CudnnSupport::DoBatchNormalizationForwardImpl(
   float zero = 0.0;
   auto cudnn = cudnn_->GetHandle(parent_, stream);
 
-  auto status = CUDNN_STATUS_SUCCESS;
   if (is_training) {
     CHECK_EQ(batch_mean->is_null(), batch_var->is_null())
         << "batch_mean and batch_var must both be null or both be non-null";
@@ -2815,26 +2736,21 @@ bool CudnnSupport::DoBatchNormalizationForwardImpl(
       batch_var_opaque = nullptr;
     }
 
-    status = cudnnBatchNormalizationForwardTraining(
+    RETURN_IF_CUDNN_ERROR(cudnnBatchNormalizationForwardTraining(
         cudnn.handle(), mode, &one, &zero, x_descriptor.handle(), x.opaque(),
         x_descriptor.handle(), y->opaque(), scale_offset_descriptor.handle(),
         scale.opaque(), offset.opaque(), 1.0, batch_mean_opaque,
         batch_var_opaque, epsilon, saved_mean->opaque(),
-        saved_inv_var->opaque());
+        saved_inv_var->opaque()));
   } else {
     const void* maybe_inv_var = estimated_variance.opaque();
-    status = cudnnBatchNormalizationForwardInference(
+    RETURN_IF_CUDNN_ERROR(cudnnBatchNormalizationForwardInference(
         cudnn.handle(), mode, &one, &zero, x_descriptor.handle(), x.opaque(),
         x_descriptor.handle(), y->opaque(), scale_offset_descriptor.handle(),
         scale.opaque(), offset.opaque(), estimated_mean.opaque(), maybe_inv_var,
-        epsilon);
-  }
-  if (status != CUDNN_STATUS_SUCCESS) {
-    LOG(ERROR) << "failed to enqueue forward batch normalization on stream: "
-               << ToString(status);
-    return false;
+        epsilon));
   }
-  return true;
+  return port::Status::OK();
 }
 
 bool CudnnSupport::DoBatchNormalizationBackward(
@@ -2845,10 +2761,11 @@ bool CudnnSupport::DoBatchNormalizationBackward(
     const dnn::BatchDescriptor& scale_offset_desc, const double epsilon,
     DeviceMemory<float>* x_backprop, DeviceMemory<float>* scale_backprop,
     DeviceMemory<float>* offset_backprop) {
-  return DoBatchNormalizationBackwardImpl(
-      stream, CUDNN_DATA_FLOAT, CUDNN_DATA_FLOAT, y_backprop, x, scale, mean,
-      inv_var, x_desc, scale_offset_desc, epsilon, x_backprop, scale_backprop,
-      offset_backprop);
+  return IsStatusOk(DoBatchNormalizationBackwardImpl(
+                        stream, CUDNN_DATA_FLOAT, CUDNN_DATA_FLOAT, y_backprop,
+                        x, scale, mean, inv_var, x_desc, scale_offset_desc,
+                        epsilon, x_backprop, scale_backprop, offset_backprop),
+                    /*report_error=*/true);
 }
 
 bool CudnnSupport::DoBatchNormalizationBackward(
@@ -2859,14 +2776,15 @@ bool CudnnSupport::DoBatchNormalizationBackward(
     const dnn::BatchDescriptor& scale_offset_desc, const double epsilon,
     DeviceMemory<Eigen::half>* x_backprop, DeviceMemory<float>* scale_backprop,
     DeviceMemory<float>* offset_backprop) {
-  return DoBatchNormalizationBackwardImpl(
-      stream, CUDNN_DATA_HALF, CUDNN_DATA_FLOAT, y_backprop, x, scale, mean,
-      inv_var, x_desc, scale_offset_desc, epsilon, x_backprop, scale_backprop,
-      offset_backprop);
+  return IsStatusOk(DoBatchNormalizationBackwardImpl(
+                        stream, CUDNN_DATA_HALF, CUDNN_DATA_FLOAT, y_backprop,
+                        x, scale, mean, inv_var, x_desc, scale_offset_desc,
+                        epsilon, x_backprop, scale_backprop, offset_backprop),
+                    /*report_error=*/true);
 }
 
 template <class T, class U>
-bool CudnnSupport::DoBatchNormalizationBackwardImpl(
+port::Status CudnnSupport::DoBatchNormalizationBackwardImpl(
     Stream* stream, int cudnn_input_type, int cudnn_scale_type,
     const DeviceMemory<T>& y_backprop, const DeviceMemory<T>& x,
     const DeviceMemory<U>& scale, const DeviceMemory<U>& mean,
@@ -2889,19 +2807,14 @@ bool CudnnSupport::DoBatchNormalizationBackwardImpl(
 
   auto cudnn = cudnn_->GetHandle(parent_, stream);
 
-  auto status = cudnnBatchNormalizationBackward(
+  RETURN_IF_CUDNN_ERROR(cudnnBatchNormalizationBackward(
       cudnn.handle(), mode, &one, &zero, &one, &zero, x_descriptor.handle(),
       x.opaque(), x_descriptor.handle(), y_backprop.opaque(),
       x_descriptor.handle(), x_backprop->opaque(),
       scale_offset_descriptor.handle(), scale.opaque(),
       scale_backprop->opaque(), offset_backprop->opaque(), epsilon,
-      mean.opaque(), inv_var.opaque());
-  if (status != CUDNN_STATUS_SUCCESS) {
-    LOG(ERROR) << "failed to enqueue backward batch normalization on stream: "
-               << ToString(status);
-    return false;
-  }
-  return true;
+      mean.opaque(), inv_var.opaque()));
+  return port::Status::OK();
 }
 
 bool CudnnSupport::DoConvolve(
@@ -2914,10 +2827,12 @@ bool CudnnSupport::DoConvolve(
     DeviceMemory<float>* output_data, ScratchAllocator* scratch_allocator,
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
-  return DoConvolveImpl<float>(
-      stream, batch_descriptor, input_data, filter_descriptor, filter_data,
-      convolution_descriptor, output_descriptor, output_data, scratch_allocator,
-      algorithm_config, output_profile_result);
+  return IsStatusOk(
+      DoConvolveImpl<float>(
+          stream, batch_descriptor, input_data, filter_descriptor, filter_data,
+          convolution_descriptor, output_descriptor, output_data,
+          scratch_allocator, algorithm_config, output_profile_result),
+      /*report_error=*/!output_profile_result);
 }
 
 bool CudnnSupport::DoConvolve(
@@ -2930,10 +2845,12 @@ bool CudnnSupport::DoConvolve(
     DeviceMemory<double>* output_data, ScratchAllocator* scratch_allocator,
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
-  return DoConvolveImpl<double>(
-      stream, batch_descriptor, input_data, filter_descriptor, filter_data,
-      convolution_descriptor, output_descriptor, output_data, scratch_allocator,
-      algorithm_config, output_profile_result);
+  return IsStatusOk(
+      DoConvolveImpl<double>(
+          stream, batch_descriptor, input_data, filter_descriptor, filter_data,
+          convolution_descriptor, output_descriptor, output_data,
+          scratch_allocator, algorithm_config, output_profile_result),
+      /*report_error=*/!output_profile_result);
 }
 
 bool CudnnSupport::DoConvolve(
@@ -2946,10 +2863,12 @@ bool CudnnSupport::DoConvolve(
     DeviceMemory<Eigen::half>* output_data, ScratchAllocator* scratch_allocator,
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
-  return DoConvolveImpl<Eigen::half>(
-      stream, batch_descriptor, input_data, filter_descriptor, filter_data,
-      convolution_descriptor, output_descriptor, output_data, scratch_allocator,
-      algorithm_config, output_profile_result);
+  return IsStatusOk(
+      DoConvolveImpl<Eigen::half>(
+          stream, batch_descriptor, input_data, filter_descriptor, filter_data,
+          convolution_descriptor, output_descriptor, output_data,
+          scratch_allocator, algorithm_config, output_profile_result),
+      /*report_error=*/!output_profile_result);
 }
 
 bool CudnnSupport::DoFusedConvolve(
@@ -2965,13 +2884,15 @@ bool CudnnSupport::DoFusedConvolve(
     DeviceMemory<double>* output_data, ScratchAllocator* scratch_allocator,
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
-  return DoFusedConvolveImpl<double, double, double, CUDNN_DATA_DOUBLE,
-                             CUDNN_DATA_DOUBLE>(
-      stream, conv_input_descriptor, conv_input_data, conv_input_scale,
-      filter_descriptor, filter_data, convolution_descriptor, side_input_data,
-      side_input_scale, bias_descriptor, biases, activation_mode,
-      output_descriptor, output_data, scratch_allocator, algorithm_config,
-      output_profile_result);
+  return IsStatusOk(
+      DoFusedConvolveImpl<double, double, double, CUDNN_DATA_DOUBLE,
+                          CUDNN_DATA_DOUBLE>(
+          stream, conv_input_descriptor, conv_input_data, conv_input_scale,
+          filter_descriptor, filter_data, convolution_descriptor,
+          side_input_data, side_input_scale, bias_descriptor, biases,
+          activation_mode, output_descriptor, output_data, scratch_allocator,
+          algorithm_config, output_profile_result),
+      /*report_error=*/!output_profile_result);
 }
 
 bool CudnnSupport::DoFusedConvolve(
@@ -2987,13 +2908,15 @@ bool CudnnSupport::DoFusedConvolve(
     DeviceMemory<float>* output_data, ScratchAllocator* scratch_allocator,
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
-  return DoFusedConvolveImpl<float, float, float, CUDNN_DATA_FLOAT,
-                             CUDNN_DATA_FLOAT>(
-      stream, conv_input_descriptor, conv_input_data, conv_input_scale,
-      filter_descriptor, filter_data, convolution_descriptor, side_input_data,
-      side_input_scale, bias_descriptor, biases, activation_mode,
-      output_descriptor, output_data, scratch_allocator, algorithm_config,
-      output_profile_result);
+  return IsStatusOk(
+      DoFusedConvolveImpl<float, float, float, CUDNN_DATA_FLOAT,
+                          CUDNN_DATA_FLOAT>(
+          stream, conv_input_descriptor, conv_input_data, conv_input_scale,
+          filter_descriptor, filter_data, convolution_descriptor,
+          side_input_data, side_input_scale, bias_descriptor, biases,
+          activation_mode, output_descriptor, output_data, scratch_allocator,
+          algorithm_config, output_profile_result),
+      /*report_error=*/!output_profile_result);
 }
 
 bool CudnnSupport::DoFusedConvolve(
@@ -3010,13 +2933,15 @@ bool CudnnSupport::DoFusedConvolve(
     DeviceMemory<Eigen::half>* output_data, ScratchAllocator* scratch_allocator,
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
-  return DoFusedConvolveImpl<Eigen::half, Eigen::half, float, CUDNN_DATA_HALF,
-                             CUDNN_DATA_FLOAT>(
-      stream, conv_input_descriptor, conv_input_data, conv_input_scale,
-      filter_descriptor, filter_data, convolution_descriptor, side_input_data,
-      side_input_scale, bias_descriptor, biases, activation_mode,
-      output_descriptor, output_data, scratch_allocator, algorithm_config,
-      output_profile_result);
+  return IsStatusOk(
+      DoFusedConvolveImpl<Eigen::half, Eigen::half, float, CUDNN_DATA_HALF,
+                          CUDNN_DATA_FLOAT>(
+          stream, conv_input_descriptor, conv_input_data, conv_input_scale,
+          filter_descriptor, filter_data, convolution_descriptor,
+          side_input_data, side_input_scale, bias_descriptor, biases,
+          activation_mode, output_descriptor, output_data, scratch_allocator,
+          algorithm_config, output_profile_result),
+      /*report_error=*/!output_profile_result);
 }
 
 bool CudnnSupport::DoFusedConvolve(
@@ -3040,13 +2965,15 @@ bool CudnnSupport::DoFusedConvolve(
                     "supported on GPUs with compute capability 6.1 or later.";
     return false;
   }
-  return DoFusedConvolveImpl<int8, float, float, CUDNN_DATA_INT8x4,
-                             CUDNN_DATA_INT32>(
-      stream, conv_input_descriptor, conv_input_data, conv_input_scale,
-      filter_descriptor, filter_data, convolution_descriptor, side_input_data,
-      side_input_scale, bias_descriptor, biases, activation_mode,
-      output_descriptor, output_data, scratch_allocator, algorithm_config,
-      output_profile_result);
+  return IsStatusOk(
+      DoFusedConvolveImpl<int8, float, float, CUDNN_DATA_INT8x4,
+                          CUDNN_DATA_INT32>(
+          stream, conv_input_descriptor, conv_input_data, conv_input_scale,
+          filter_descriptor, filter_data, convolution_descriptor,
+          side_input_data, side_input_scale, bias_descriptor, biases,
+          activation_mode, output_descriptor, output_data, scratch_allocator,
+          algorithm_config, output_profile_result),
+      /*report_error=*/!output_profile_result);
 }
 
 bool CudnnSupport::DoTransformTensor(Stream* stream,
@@ -3062,22 +2989,17 @@ bool CudnnSupport::DoTransformTensor(Stream* stream,
   ScopedTensorDescriptor output_tensor_desc(
       output_desc, ToCudnnDataType(output_type, output_desc.layout()));
   auto cudnn = cudnn_->GetHandle(parent_, stream);
-  auto status = cudnnTransformTensor(
-      cudnn.handle(), &scale, input_tensor_desc.handle(), input_data.opaque(),
-      &beta, output_tensor_desc.handle(), output_data->opaque());
-  if (status != CUDNN_STATUS_SUCCESS) {
-    LOG(ERROR) << "Could not transform a tensor with layout "
-               << input_desc.ToString() << " and data type "
-               << static_cast<int>(input_type) << " to another with layout "
-               << output_desc.ToString() << " and data type "
-               << static_cast<int>(output_type) << ": " << ToString(status);
-    return false;
-  }
-  return true;
+  auto status = [&] {
+    RETURN_IF_CUDNN_ERROR(cudnnTransformTensor(
+        cudnn.handle(), &scale, input_tensor_desc.handle(), input_data.opaque(),
+        &beta, output_tensor_desc.handle(), output_data->opaque()));
+    return port::Status::OK();
+  }();
+  return IsStatusOk(status, /*report_error=*/true);
 }
 
 template <class T>
-bool CudnnSupport::DoConvolveBackwardDataImpl(
+port::Status CudnnSupport::DoConvolveBackwardDataImpl(
     Stream* stream, const dnn::FilterDescriptor& filter_descriptor,
     const DeviceMemory<T>& filter_data,
     const dnn::BatchDescriptor& output_descriptor,
@@ -3108,139 +3030,25 @@ bool CudnnSupport::DoConvolveBackwardDataImpl(
                                    GetConvComputeType<T>());
 
   const bool is_profiling = output_profile_result != nullptr;
-  cudnnConvolutionBwdDataAlgo_t algo;
-  DeviceMemory<uint8> scratch;
-
-  if (algorithm_config.algorithm().is_default()) {
-    // With the default algorithm, use Cudnn's heuristics.
-    auto get_algorithm =
-        [&](bool specify_limit) -> cudnnConvolutionBwdDataAlgo_t {
-      cudnnConvolutionBwdDataPreference_t preference =
-          specify_limit ? CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT
-                        : CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE;
-
-      auto memory_limit_bytes =
-          scratch_allocator == nullptr
-              ? 0
-              : scratch_allocator->GetMemoryLimitInBytes(stream);
-      if (memory_limit_bytes < 0) {
-        memory_limit_bytes = 0;
-      }
-      cudnnConvolutionBwdDataAlgo_t algo_to_use;
-      cudnnStatus_t status = cudnnGetConvolutionBackwardDataAlgorithm(
-          cudnn.handle(),
-          /*filterDesc=*/filter.handle(),
-          /*diffDesc=*/out_back_nd.handle(),
-          /*convDesc=*/conv.handle(),
-          /*gradDesc=*/in_back_nd.handle(),
-          /*preference=*/preference,
-          /*memoryLimitInBytes=*/memory_limit_bytes,
-          /*algo=*/&algo_to_use);
-      CHECK_EQ(status, CUDNN_STATUS_SUCCESS) << "Unable to find a suitable "
-                                                "algorithm for doing backward "
-                                                "data convolution";
-      return algo_to_use;
-    };
-
-    algo = get_algorithm(/*specify_limit=*/scratch_allocator != nullptr);
-
-    if (scratch_allocator != nullptr) {
-      size_t size_in_bytes;
-      auto status = cudnnGetConvolutionBackwardDataWorkspaceSize(
-          cudnn.handle(),
-          /*filterDesc=*/filter.handle(),
-          /*diffDesc=*/out_back_nd.handle(),
-          /*convDesc=*/conv.handle(),
-          /*gradDesc=*/in_back_nd.handle(),
-          /*algo=*/algo,
-          /*sizeInBytes=*/&size_in_bytes);
-      int64 size_in_bytes_int64 = size_in_bytes;
-      if (status == CUDNN_STATUS_SUCCESS && size_in_bytes_int64 != 0) {
-        if (size_in_bytes_int64 > 0) {
-          auto allocated =
-              scratch_allocator->AllocateBytes(stream, size_in_bytes);
-          if (allocated.ok()) {
-            scratch = allocated.ValueOrDie();
-          } else {
-            LOG(WARNING) << allocated.status().error_message();
-          }
-        } else {
-          LOG(WARNING)
-              << "cudnnGetConvolutionBackwardDataWorkspaceSize() returned "
-                 "negative sizeInBytes value. This could be a cudnn bug.";
-        }
-      }
-    }
 
-    // If we didn't allocate any scratch space (perhaps because of failed
-    // allocation), we force a switch back to the "no workspace" algorithm.
-    if (scratch == nullptr) {
-      algo = get_algorithm(/*specify_limit=*/false);
-    }
-  } else {
-    // An algorithm has been specified.
-    dnn::AlgorithmDesc algotype = algorithm_config.algorithm();
-    algo = ToConvBackwardDataAlgo(algotype);
-    conv.set_use_tensor_op_math(algotype.tensor_ops_enabled());
-    size_t size_in_bytes;
-    auto status = cudnnGetConvolutionBackwardDataWorkspaceSize(
-        cudnn.handle(),
-        /*filterDesc=*/filter.handle(),
-        /*diffDesc=*/out_back_nd.handle(),
-        /*convDesc=*/conv.handle(),
-        /*gradDesc=*/in_back_nd.handle(),
-        /*algo=*/algo,
-        /*sizeInBytes=*/&size_in_bytes);
-    if (status != CUDNN_STATUS_SUCCESS) {
-      if (is_profiling) {
-        // Silently return when we are profiling.
-        return false;
-      }
-      LOG(FATAL) << "Cannot query the size of workspace needed for the given "
-                    "algorithm: "
-                 << algorithm_config.algorithm().algo_id();
-    }
-    int64 size_in_bytes_int64 = size_in_bytes;
-    if (size_in_bytes_int64 > 0) {
-      if (scratch_allocator == nullptr) {
-        LOG(FATAL) << "An allocator must be specified when scratch memory is "
-                      "needed";
-      }
-      auto allocated = scratch_allocator->AllocateBytes(stream, size_in_bytes);
-      if (is_profiling && !allocated.ok()) {
-        // Silently return when we are profiling.
-        return false;
-      }
-      if (allocated.ok()) {
-        scratch = allocated.ValueOrDie();
-      } else {
-        LOG(WARNING) << allocated.status().error_message();
-      }
-      if (scratch == nullptr) {
-        CHECK(!algorithm_config.algorithm_no_scratch().is_default())
-            << "The primary convolution algorithm failed memory allocation, "
-               "while a secondary algorithm is not provided.";
-        dnn::AlgorithmDesc algotype = algorithm_config.algorithm_no_scratch();
-        algo = ToConvBackwardDataAlgo(algotype);
-        conv.set_use_tensor_op_math(algotype.tensor_ops_enabled());
-      }
-    } else if (size_in_bytes_int64 < 0) {
-      LOG(WARNING) << "cudnnGetConvolutionBackwardDataWorkspaceSize() returned "
-                      "negative sizeInBytes value. This could be a cudnn bug.";
-    }
-  }
+  DeviceMemory<uint8> scratch;
+  SE_ASSIGN_OR_RETURN(dnn::AlgorithmDesc algo_desc,
+                      GetCudnnConvolutionBackwardDataAlgorithm(
+                          stream, cudnn, algorithm_config, in_back_nd, filter,
+                          conv, out_back_nd, scratch_allocator, &scratch));
 
-  std::unique_ptr<CUDATimer> timer;
+  std::unique_ptr<CUDATimer, TimerDeleter> timer;
   if (is_profiling) {
     timer.reset(new CUDATimer(parent_));  // NOLINT
-    timer->Init();
     // The start and stop of the timer should be as close to the Cudnn call as
     // possible. It is still possible for other threads to issue workload on
     // to this stream. So it could take multiple profiling measurements.
-    timer->Start(AsCUDAStream(stream));
+    if (!timer->Init() || !timer->Start(AsCUDAStream(stream))) {
+      return port::Status(port::error::INTERNAL, "Failed to start timer");
+    }
   }
 
-  auto status =
+  RETURN_IF_CUDNN_ERROR(
       cudnnConvolutionBackwardData(cudnn.handle(),
                                    /*alpha=*/alpha,
                                    /*wDesc=*/filter.handle(),
@@ -3248,32 +3056,22 @@ bool CudnnSupport::DoConvolveBackwardDataImpl(
                                    /*dyDesc=*/out_back_nd.handle(),
                                    /*dy=*/backward_output_data.opaque(),
                                    /*convDesc=*/conv.handle(),
-                                   /*algo=*/algo,
+                                   /*algo=*/ToConvBackwardDataAlgo(algo_desc),
                                    /*workSpace=*/scratch.opaque(),
                                    /*workSpaceSizeInBytes=*/scratch.size(),
                                    /*beta=*/beta,
                                    /*dxDesc=*/in_back_nd.handle(),
-                                   /*dx=*/backward_input_data->opaque());
+                                   /*dx=*/backward_input_data->opaque()));
   if (is_profiling) {
-    timer->Stop(AsCUDAStream(stream));
-    if (status == CUDNN_STATUS_SUCCESS) {
-      bool use_tensor_ops = algorithm_config.algorithm().tensor_ops_enabled();
-      dnn::AlgorithmDesc algotype(algo, use_tensor_ops);
-      output_profile_result->set_algorithm(algotype);
-      output_profile_result->set_elapsed_time_in_ms(
-          timer->GetElapsedMilliseconds());
-    }
-    timer->Destroy();
-  }
-  if (status != CUDNN_STATUS_SUCCESS) {
-    // Silently return when we are profiling.
-    if (!is_profiling) {
-      LOG(ERROR) << "failed to enqueue convolution on stream: "
-                 << ToString(status);
+    if (!timer->Stop(AsCUDAStream(stream))) {
+      return port::Status(port::error::INTERNAL, "Failed to stop timer");
     }
-    return false;
+    output_profile_result->set_algorithm(algo_desc);
+    output_profile_result->set_elapsed_time_in_ms(
+        timer->GetElapsedMilliseconds());
   }
-  return true;
+
+  return port::Status::OK();
 }
 
 bool CudnnSupport::DoConvolveBackwardData(
@@ -3287,11 +3085,13 @@ bool CudnnSupport::DoConvolveBackwardData(
     ScratchAllocator* scratch_allocator,
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
-  return DoConvolveBackwardDataImpl(stream, filter_descriptor, filter_data,
-                                    output_descriptor, backward_output_data,
-                                    convolution_descriptor, input_descriptor,
-                                    backward_input_data, scratch_allocator,
-                                    algorithm_config, output_profile_result);
+  return IsStatusOk(
+      DoConvolveBackwardDataImpl(stream, filter_descriptor, filter_data,
+                                 output_descriptor, backward_output_data,
+                                 convolution_descriptor, input_descriptor,
+                                 backward_input_data, scratch_allocator,
+                                 algorithm_config, output_profile_result),
+      /*report_error=*/!output_profile_result);
 }
 
 bool CudnnSupport::DoConvolveBackwardData(
@@ -3305,11 +3105,13 @@ bool CudnnSupport::DoConvolveBackwardData(
     ScratchAllocator* scratch_allocator,
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
-  return DoConvolveBackwardDataImpl(stream, filter_descriptor, filter_data,
-                                    output_descriptor, backward_output_data,
-                                    convolution_descriptor, input_descriptor,
-                                    backward_input_data, scratch_allocator,
-                                    algorithm_config, output_profile_result);
+  return IsStatusOk(
+      DoConvolveBackwardDataImpl(stream, filter_descriptor, filter_data,
+                                 output_descriptor, backward_output_data,
+                                 convolution_descriptor, input_descriptor,
+                                 backward_input_data, scratch_allocator,
+                                 algorithm_config, output_profile_result),
+      /*report_error=*/!output_profile_result);
 }
 
 bool CudnnSupport::DoConvolveBackwardData(
@@ -3323,15 +3125,17 @@ bool CudnnSupport::DoConvolveBackwardData(
     ScratchAllocator* scratch_allocator,
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
-  return DoConvolveBackwardDataImpl(stream, filter_descriptor, filter_data,
-                                    output_descriptor, backward_output_data,
-                                    convolution_descriptor, input_descriptor,
-                                    backward_input_data, scratch_allocator,
-                                    algorithm_config, output_profile_result);
+  return IsStatusOk(
+      DoConvolveBackwardDataImpl(stream, filter_descriptor, filter_data,
+                                 output_descriptor, backward_output_data,
+                                 convolution_descriptor, input_descriptor,
+                                 backward_input_data, scratch_allocator,
+                                 algorithm_config, output_profile_result),
+      /*report_error=*/!output_profile_result);
 }
 
 template <class T>
-bool CudnnSupport::DoConvolveBackwardFilterImpl(
+port::Status CudnnSupport::DoConvolveBackwardFilterImpl(
     Stream* stream, const dnn::BatchDescriptor& input_descriptor,
     const DeviceMemory<T>& input_data,
     const dnn::BatchDescriptor& output_descriptor,
@@ -3362,141 +3166,25 @@ bool CudnnSupport::DoConvolveBackwardFilterImpl(
                                    GetConvComputeType<T>());
 
   const bool is_profiling = output_profile_result != nullptr;
-  cudnnConvolutionBwdFilterAlgo_t algo;
-  DeviceMemory<uint8> scratch;
-
-  if (algorithm_config.algorithm().is_default()) {
-    // With the default algorithm, use Cudnn's heuristics.
-
-    // Lambda that retrieves the algorithm.
-    // specify_limit will occur when we have a scratch allocator and it succeeds
-    // in allocating; otherwise, we'll fall back to the "no workspace" version.
-    auto get_algorithm = [&](bool specify_limit) {
-      cudnnConvolutionBwdFilterPreference_t preference =
-          specify_limit ? CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT
-                        : CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE;
-
-      auto memory_limit_bytes =
-          scratch_allocator == nullptr
-              ? 0
-              : scratch_allocator->GetMemoryLimitInBytes(stream);
-      if (memory_limit_bytes < 0) {
-        memory_limit_bytes = 0;
-      }
-
-      cudnnConvolutionBwdFilterAlgo_t algo_to_use;
-      cudnnStatus_t status = cudnnGetConvolutionBackwardFilterAlgorithm(
-          cudnn.handle(),
-          /*srcDesc=*/input_nd.handle(),
-          /*diffDesc=*/out_back_nd.handle(),
-          /*convDesc=*/conv.handle(),
-          /*gradDesc=*/filter.handle(),
-          /*preference=*/preference,
-          /*memoryLimitInBytes=*/memory_limit_bytes,
-          /*algo=*/&algo_to_use);
-      CHECK_EQ(status, CUDNN_STATUS_SUCCESS) << "Unable to find a suitable "
-                                                "algorithm for doing backward "
-                                                "filter convolution";
-      return algo_to_use;
-    };
-
-    algo = get_algorithm(/*specify_limit=*/scratch_allocator != nullptr);
-
-    if (scratch_allocator != nullptr) {
-      size_t size_in_bytes;
-      auto status = cudnnGetConvolutionBackwardFilterWorkspaceSize(
-          cudnn.handle(),
-          /*xDesc=*/input_nd.handle(),
-          /*dyDesc=*/out_back_nd.handle(), /*convDesc=*/conv.handle(),
-          /*gradDesc=*/filter.handle(), /*algo=*/algo,
-          /*sizeInBytes=*/&size_in_bytes);
-      int64 size_in_bytes_int64 = size_in_bytes;
-      if (status == CUDNN_STATUS_SUCCESS && size_in_bytes_int64 != 0) {
-        if (size_in_bytes_int64 > 0) {
-          auto allocated =
-              scratch_allocator->AllocateBytes(stream, size_in_bytes);
-          if (allocated.ok()) {
-            scratch = allocated.ValueOrDie();
-          } else {
-            LOG(WARNING) << allocated.status().error_message();
-          }
-        } else {
-          LOG(WARNING)
-              << "cudnnGetConvolutionBackwardFilterWorkspaceSize() returned "
-                 "negative sizeInBytes value. This could be a cudnn bug.";
-        }
-      }
-    }
 
-    // If we didn't allocate any scratch space (perhaps because of failed
-    // allocation), we force a switch back to the "no workspace" algorithm.
-    if (scratch == nullptr) {
-      algo = get_algorithm(/*specify_limit=*/false);
-    }
-  } else {
-    // An algorithm has been specified.
-    dnn::AlgorithmDesc algotype = algorithm_config.algorithm();
-    algo = ToConvBackwardFilterAlgo(algotype);
-    conv.set_use_tensor_op_math(algotype.tensor_ops_enabled());
-
-    size_t size_in_bytes;
-    auto status = cudnnGetConvolutionBackwardFilterWorkspaceSize(
-        cudnn.handle(),
-        /*xDesc=*/input_nd.handle(),
-        /*dyDesc=*/out_back_nd.handle(), /*convDesc=*/conv.handle(),
-        /*gradDesc=*/filter.handle(), /*algo=*/algo,
-        /*sizeInBytes=*/&size_in_bytes);
-    if (status != CUDNN_STATUS_SUCCESS) {
-      if (is_profiling) {
-        // Silently return when we are profiling.
-        return false;
-      }
-      LOG(FATAL) << "Cannot query the size of workspace needed for the given "
-                    "algorithm: "
-                 << algorithm_config.algorithm().algo_id();
-    }
-    int64 size_in_bytes_int64 = size_in_bytes;
-    if (size_in_bytes_int64 > 0) {
-      if (scratch_allocator == nullptr) {
-        LOG(FATAL) << "An allocator must be specified when scratch memory is "
-                      "needed";
-      }
-      auto allocated = scratch_allocator->AllocateBytes(stream, size_in_bytes);
-      if (is_profiling && !allocated.ok()) {
-        // Silently return when we are profiling.
-        return false;
-      }
-      if (allocated.ok()) {
-        scratch = allocated.ValueOrDie();
-      } else {
-        LOG(WARNING) << allocated.status().error_message();
-      }
-      if (scratch == nullptr) {
-        CHECK(!algorithm_config.algorithm_no_scratch().is_default())
-            << "The primary convolution algorithm failed memory allocation, "
-               "while a secondary algorithm is not provided.";
-        dnn::AlgorithmDesc algotype = algorithm_config.algorithm_no_scratch();
-        algo = ToConvBackwardFilterAlgo(algotype);
-        conv.set_use_tensor_op_math(algotype.tensor_ops_enabled());
-      }
-    } else if (size_in_bytes_int64 < 0) {
-      LOG(WARNING)
-          << "cudnnGetConvolutionBackwardFilterWorkspaceSize() returned "
-             "negative sizeInBytes value. This could be a cudnn bug.";
-    }
-  }
+  DeviceMemory<uint8> scratch;
+  SE_ASSIGN_OR_RETURN(dnn::AlgorithmDesc algo_desc,
+                      GetCudnnConvolutionBackwardFilterAlgorithm(
+                          stream, cudnn, algorithm_config, input_nd, filter,
+                          conv, out_back_nd, scratch_allocator, &scratch));
 
-  std::unique_ptr<CUDATimer> timer;
+  std::unique_ptr<CUDATimer, TimerDeleter> timer;
   if (is_profiling) {
     timer.reset(new CUDATimer(parent_));  // NOLINT
-    timer->Init();
     // The start and stop of the timer should be as close to the Cudnn call as
     // possible. It is still possible for other threads to issue workload on
     // to this stream. So it could take multiple profiling measurements.
-    timer->Start(AsCUDAStream(stream));
+    if (!timer->Init() || !timer->Start(AsCUDAStream(stream))) {
+      return port::Status(port::error::INTERNAL, "Failed to start timer");
+    }
   }
 
-  auto status = cudnnConvolutionBackwardFilter(
+  RETURN_IF_CUDNN_ERROR(cudnnConvolutionBackwardFilter(
       cudnn.handle(),
       /*alpha=*/alpha,
       /*srcDesc=*/input_nd.handle(),
@@ -3504,33 +3192,22 @@ bool CudnnSupport::DoConvolveBackwardFilterImpl(
       /*diffDesc=*/out_back_nd.handle(),
       /*diffData=*/backward_output_data.opaque(),
       /*convDesc=*/conv.handle(),
-      /*algo=*/algo,
+      /*algo=*/ToConvBackwardFilterAlgo(algo_desc),
       /*workSpace=*/scratch.opaque(),
       /*workSpaceSizeInBytes=*/scratch.size(),
       /*beta=*/beta,
       /*gradDesc=*/filter.handle(),
-      /*gradData=*/backward_filter_data->opaque());
-
+      /*dw=*/backward_filter_data->opaque()));
   if (is_profiling) {
-    timer->Stop(AsCUDAStream(stream));
-    if (status == CUDNN_STATUS_SUCCESS) {
-      bool use_tensor_ops = algorithm_config.algorithm().tensor_ops_enabled();
-      dnn::AlgorithmDesc algotype(algo, use_tensor_ops);
-      output_profile_result->set_algorithm(algotype);
-      output_profile_result->set_elapsed_time_in_ms(
-          timer->GetElapsedMilliseconds());
-    }
-    timer->Destroy();
-  }
-  if (status != CUDNN_STATUS_SUCCESS) {
-    // Silently return when we are profiling.
-    if (!is_profiling) {
-      LOG(ERROR) << "failed to enqueue convolution on stream: "
-                 << ToString(status);
+    if (!timer->Stop(AsCUDAStream(stream))) {
+      return port::Status(port::error::INTERNAL, "Failed to stop timer");
     }
-    return false;
+    output_profile_result->set_algorithm(algo_desc);
+    output_profile_result->set_elapsed_time_in_ms(
+        timer->GetElapsedMilliseconds());
   }
-  return true;
+
+  return port::Status::OK();
 }
 
 bool CudnnSupport::DoConvolveBackwardFilter(
@@ -3544,11 +3221,13 @@ bool CudnnSupport::DoConvolveBackwardFilter(
     ScratchAllocator* scratch_allocator,
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
-  return DoConvolveBackwardFilterImpl(stream, input_descriptor, input_data,
-                                      output_descriptor, backward_output_data,
-                                      convolution_descriptor, filter_descriptor,
-                                      backward_filter_data, scratch_allocator,
-                                      algorithm_config, output_profile_result);
+  return IsStatusOk(
+      DoConvolveBackwardFilterImpl(stream, input_descriptor, input_data,
+                                   output_descriptor, backward_output_data,
+                                   convolution_descriptor, filter_descriptor,
+                                   backward_filter_data, scratch_allocator,
+                                   algorithm_config, output_profile_result),
+      /*report_error=*/!output_profile_result);
 }
 
 bool CudnnSupport::DoConvolveBackwardFilter(
@@ -3562,11 +3241,13 @@ bool CudnnSupport::DoConvolveBackwardFilter(
     ScratchAllocator* scratch_allocator,
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
-  return DoConvolveBackwardFilterImpl(stream, input_descriptor, input_data,
-                                      output_descriptor, backward_output_data,
-                                      convolution_descriptor, filter_descriptor,
-                                      backward_filter_data, scratch_allocator,
-                                      algorithm_config, output_profile_result);
+  return IsStatusOk(
+      DoConvolveBackwardFilterImpl(stream, input_descriptor, input_data,
+                                   output_descriptor, backward_output_data,
+                                   convolution_descriptor, filter_descriptor,
+                                   backward_filter_data, scratch_allocator,
+                                   algorithm_config, output_profile_result),
+      /*report_error=*/!output_profile_result);
 }
 
 bool CudnnSupport::DoConvolveBackwardFilter(
@@ -3580,15 +3261,17 @@ bool CudnnSupport::DoConvolveBackwardFilter(
     ScratchAllocator* scratch_allocator,
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
-  return DoConvolveBackwardFilterImpl(stream, input_descriptor, input_data,
-                                      output_descriptor, backward_output_data,
-                                      convolution_descriptor, filter_descriptor,
-                                      backward_filter_data, scratch_allocator,
-                                      algorithm_config, output_profile_result);
+  return IsStatusOk(
+      DoConvolveBackwardFilterImpl(stream, input_descriptor, input_data,
+                                   output_descriptor, backward_output_data,
+                                   convolution_descriptor, filter_descriptor,
+                                   backward_filter_data, scratch_allocator,
+                                   algorithm_config, output_profile_result),
+      /*report_error=*/!output_profile_result);
 }
 
 template <class T>
-bool CudnnSupport::DoConvolveBackwardBiasImpl(
+port::Status CudnnSupport::DoConvolveBackwardBiasImpl(
     Stream* stream, const dnn::BatchDescriptor& input_descriptor,
     const DeviceMemory<T>& input_data,
     const dnn::BatchDescriptor& bias_descriptor,
@@ -3603,15 +3286,10 @@ bool CudnnSupport::DoConvolveBackwardBiasImpl(
   float beta = 0.0;
 
   auto cudnn = cudnn_->GetHandle(parent_, stream);
-  auto status = cudnnConvolutionBackwardBias(
+  RETURN_IF_CUDNN_ERROR(cudnnConvolutionBackwardBias(
       cudnn.handle(), &alpha, input_nd.handle(), input_data.opaque(), &beta,
-      bias_nd.handle(), backward_bias_data->opaque());
-  if (status != CUDNN_STATUS_SUCCESS) {
-    LOG(ERROR) << "failed to enqueue backward convolution on stream: "
-               << ToString(status);
-    return false;
-  }
-  return true;
+      bias_nd.handle(), backward_bias_data->opaque()));
+  return port::Status::OK();
 }
 
 bool CudnnSupport::DoConvolveBackwardBias(
@@ -3619,8 +3297,10 @@ bool CudnnSupport::DoConvolveBackwardBias(
     const DeviceMemory<double>& input_data,
     const dnn::BatchDescriptor& bias_descriptor,
     DeviceMemory<double>* backward_bias_data) {
-  return DoConvolveBackwardBiasImpl(stream, input_descriptor, input_data,
-                                    bias_descriptor, backward_bias_data);
+  return IsStatusOk(
+      DoConvolveBackwardBiasImpl(stream, input_descriptor, input_data,
+                                 bias_descriptor, backward_bias_data),
+      /*report_error=*/true);
 }
 
 bool CudnnSupport::DoConvolveBackwardBias(
@@ -3628,8 +3308,10 @@ bool CudnnSupport::DoConvolveBackwardBias(
     const DeviceMemory<float>& input_data,
     const dnn::BatchDescriptor& bias_descriptor,
     DeviceMemory<float>* backward_bias_data) {
-  return DoConvolveBackwardBiasImpl(stream, input_descriptor, input_data,
-                                    bias_descriptor, backward_bias_data);
+  return IsStatusOk(
+      DoConvolveBackwardBiasImpl(stream, input_descriptor, input_data,
+                                 bias_descriptor, backward_bias_data),
+      /*report_error=*/true);
 }
 
 bool CudnnSupport::DoConvolveBackwardBias(
@@ -3637,8 +3319,10 @@ bool CudnnSupport::DoConvolveBackwardBias(
     const DeviceMemory<Eigen::half>& input_data,
     const dnn::BatchDescriptor& bias_descriptor,
     DeviceMemory<Eigen::half>* backward_bias_data) {
-  return DoConvolveBackwardBiasImpl(stream, input_descriptor, input_data,
-                                    bias_descriptor, backward_bias_data);
+  return IsStatusOk(
+      DoConvolveBackwardBiasImpl(stream, input_descriptor, input_data,
+                                 bias_descriptor, backward_bias_data),
+      /*report_error=*/true);
 }
 
 bool CudnnSupport::DoMatMul(Stream* stream,
@@ -3810,16 +3494,13 @@ bool CudnnSupport::DoBiasAdd(Stream* stream,
 
   auto cudnn = cudnn_->GetHandle(parent_, stream);
 
-  auto status = cudnnAddTensor(
-      cudnn.handle(), &alpha, bias_descriptor.handle(), biases.opaque(), &beta,
-      input_descriptor.handle(), output_data->opaque());
-
-  if (status != CUDNN_STATUS_SUCCESS) {
-    LOG(ERROR) << "stream " << stream << " could not enqueue bias addition.";
-    return false;
-  }
-
-  return true;
+  auto status = [&] {
+    RETURN_IF_CUDNN_ERROR(cudnnAddTensor(
+        cudnn.handle(), &alpha, bias_descriptor.handle(), biases.opaque(),
+        &beta, input_descriptor.handle(), output_data->opaque()));
+    return port::Status::OK();
+  }();
+  return IsStatusOk(status, /*report_error=*/true);
 }
 
 bool CudnnSupport::DoActivate(Stream* stream,
@@ -3838,16 +3519,13 @@ bool CudnnSupport::DoActivate(Stream* stream,
   float beta = 0.0;
 
   auto cudnn = cudnn_->GetHandle(parent_, stream);
-  auto status = cudnnActivationForward(
-      cudnn.handle(), activation_desc.handle(), &alpha, input_nd.handle(),
-      input_data.opaque(), &beta, input_nd.handle(), output_data->opaque());
-  if (status != CUDNN_STATUS_SUCCESS) {
-    LOG(ERROR) << "stream " << stream
-               << " could not enqueue activation: " << ToString(status);
-    return false;
-  }
-
-  return true;
+  auto status = [&] {
+    RETURN_IF_CUDNN_ERROR(cudnnActivationForward(
+        cudnn.handle(), activation_desc.handle(), &alpha, input_nd.handle(),
+        input_data.opaque(), &beta, input_nd.handle(), output_data->opaque()));
+    return port::Status::OK();
+  }();
+  return IsStatusOk(status, /*report_error=*/true);
 }
 
 bool CudnnSupport::DoPoolForward(
@@ -3866,15 +3544,13 @@ bool CudnnSupport::DoPoolForward(
   ScopedPoolingDescriptor pooling_desc(pooling_dimensions);
 
   auto cudnn = cudnn_->GetHandle(parent_, stream);
-  auto status = cudnnPoolingForward(
-      cudnn.handle(), pooling_desc.handle(), &alpha, src_desc.handle(),
-      input_data.opaque(), &beta, dest_desc.handle(), output_data->opaque());
-  if (status != CUDNN_STATUS_SUCCESS) {
-    LOG(ERROR) << "failed to enqueue forward pooling on stream: "
-               << ToString(status);
-    return false;
-  }
-  return true;
+  auto status = [&] {
+    RETURN_IF_CUDNN_ERROR(cudnnPoolingForward(
+        cudnn.handle(), pooling_desc.handle(), &alpha, src_desc.handle(),
+        input_data.opaque(), &beta, dest_desc.handle(), output_data->opaque()));
+    return port::Status::OK();
+  }();
+  return IsStatusOk(status, /*report_error=*/true);
 }
 
 bool CudnnSupport::DoPoolForward(
@@ -3893,15 +3569,13 @@ bool CudnnSupport::DoPoolForward(
   ScopedPoolingDescriptor pooling_desc(pooling_dimensions);
 
   auto cudnn = cudnn_->GetHandle(parent_, stream);
-  auto status = cudnnPoolingForward(
-      cudnn.handle(), pooling_desc.handle(), &alpha, src_desc.handle(),
-      input_data.opaque(), &beta, dest_desc.handle(), output_data->opaque());
-  if (status != CUDNN_STATUS_SUCCESS) {
-    LOG(ERROR) << "failed to enqueue forward pooling on stream: "
-               << ToString(status);
-    return false;
-  }
-  return true;
+  auto status = [&] {
+    RETURN_IF_CUDNN_ERROR(cudnnPoolingForward(
+        cudnn.handle(), pooling_desc.handle(), &alpha, src_desc.handle(),
+        input_data.opaque(), &beta, dest_desc.handle(), output_data->opaque()));
+    return port::Status::OK();
+  }();
+  return IsStatusOk(status, /*report_error=*/true);
 }
 
 bool CudnnSupport::DoPoolForward(
@@ -3919,15 +3593,13 @@ bool CudnnSupport::DoPoolForward(
   ScopedTensorDescriptor dest_desc(output_dimensions, CUDNN_DATA_HALF);
   ScopedPoolingDescriptor pooling_desc(pooling_dimensions);
   auto cudnn = cudnn_->GetHandle(parent_, stream);
-  auto status = cudnnPoolingForward(
-      cudnn.handle(), pooling_desc.handle(), &alpha, src_desc.handle(),
-      input_data.opaque(), &beta, dest_desc.handle(), output_data->opaque());
-  if (status != CUDNN_STATUS_SUCCESS) {
-    LOG(ERROR) << "failed to enqueue forward pooling on stream: "
-               << ToString(status);
-    return false;
-  }
-  return true;
+  auto status = [&] {
+    RETURN_IF_CUDNN_ERROR(cudnnPoolingForward(
+        cudnn.handle(), pooling_desc.handle(), &alpha, src_desc.handle(),
+        input_data.opaque(), &beta, dest_desc.handle(), output_data->opaque()));
+    return port::Status::OK();
+  }();
+  return IsStatusOk(status, /*report_error=*/true);
 }
 
 bool CudnnSupport::DoPoolBackward(
@@ -3948,17 +3620,15 @@ bool CudnnSupport::DoPoolBackward(
   ScopedPoolingDescriptor pooling_desc(pooling_dimensions);
 
   auto cudnn = cudnn_->GetHandle(parent_, stream);
-  auto status = cudnnPoolingBackward(
-      cudnn.handle(), pooling_desc.handle(), &alpha, dest_desc.handle(),
-      output_data.opaque(), dest_desc.handle(), input_diff_data.opaque(),
-      src_desc.handle(), input_data.opaque(), &beta, src_desc.handle(),
-      output_diff_data->opaque());
-  if (status != CUDNN_STATUS_SUCCESS) {
-    LOG(ERROR) << "failed to enqueue backward pooling on stream: "
-               << ToString(status);
-    return false;
-  }
-  return true;
+  auto status = [&] {
+    RETURN_IF_CUDNN_ERROR(cudnnPoolingBackward(
+        cudnn.handle(), pooling_desc.handle(), &alpha, dest_desc.handle(),
+        output_data.opaque(), dest_desc.handle(), input_diff_data.opaque(),
+        src_desc.handle(), input_data.opaque(), &beta, src_desc.handle(),
+        output_diff_data->opaque()));
+    return port::Status::OK();
+  }();
+  return IsStatusOk(status, /*report_error=*/true);
 }
 
 bool CudnnSupport::DoPoolBackward(
@@ -3979,17 +3649,15 @@ bool CudnnSupport::DoPoolBackward(
   ScopedPoolingDescriptor pooling_desc(pooling_dimensions);
 
   auto cudnn = cudnn_->GetHandle(parent_, stream);
-  auto status = cudnnPoolingBackward(
-      cudnn.handle(), pooling_desc.handle(), &alpha, dest_desc.handle(),
-      output_data.opaque(), dest_desc.handle(), input_diff_data.opaque(),
-      src_desc.handle(), input_data.opaque(), &beta, src_desc.handle(),
-      output_diff_data->opaque());
-  if (status != CUDNN_STATUS_SUCCESS) {
-    LOG(ERROR) << "failed to enqueue backward pooling on stream: "
-               << ToString(status);
-    return false;
-  }
-  return true;
+  auto status = [&] {
+    RETURN_IF_CUDNN_ERROR(cudnnPoolingBackward(
+        cudnn.handle(), pooling_desc.handle(), &alpha, dest_desc.handle(),
+        output_data.opaque(), dest_desc.handle(), input_diff_data.opaque(),
+        src_desc.handle(), input_data.opaque(), &beta, src_desc.handle(),
+        output_diff_data->opaque()));
+    return port::Status::OK();
+  }();
+  return IsStatusOk(status, /*report_error=*/true);
 }
 
 bool CudnnSupport::DoPoolBackward(
@@ -4010,17 +3678,15 @@ bool CudnnSupport::DoPoolBackward(
   ScopedPoolingDescriptor pooling_desc(pooling_dimensions);
 
   auto cudnn = cudnn_->GetHandle(parent_, stream);
-  auto status = cudnnPoolingBackward(
-      cudnn.handle(), pooling_desc.handle(), &alpha, dest_desc.handle(),
-      output_data.opaque(), dest_desc.handle(), input_diff_data.opaque(),
-      src_desc.handle(), input_data.opaque(), &beta, src_desc.handle(),
-      output_diff_data->opaque());
-  if (status != CUDNN_STATUS_SUCCESS) {
-    LOG(ERROR) << "failed to enqueue backward pooling on stream: "
-               << ToString(status);
-    return false;
-  }
-  return true;
+  auto status = [&] {
+    RETURN_IF_CUDNN_ERROR(cudnnPoolingBackward(
+        cudnn.handle(), pooling_desc.handle(), &alpha, dest_desc.handle(),
+        output_data.opaque(), dest_desc.handle(), input_diff_data.opaque(),
+        src_desc.handle(), input_data.opaque(), &beta, src_desc.handle(),
+        output_diff_data->opaque()));
+    return port::Status::OK();
+  }();
+  return IsStatusOk(status, /*report_error=*/true);
 }
 
 bool CudnnSupport::DoNormalize(
@@ -4055,15 +3721,14 @@ bool CudnnSupport::DoNormalizeWithDimensions(
   auto cudnn = cudnn_->GetHandle(parent_, stream);
 
   // Launch the normalization.
-  auto status = cudnnLRNCrossChannelForward(
-      cudnn.handle(), normalize.handle(), CUDNN_LRN_CROSS_CHANNEL_DIM1, &alpha,
-      dims.handle(), input_data.opaque(), &beta, dims.handle(),
-      output_data->opaque());
-  if (status != CUDNN_STATUS_SUCCESS) {
-    LOG(ERROR) << "failed to run cudnnLRNCrossChannelForward";
-    return false;
-  }
-  return true;
+  auto status = [&] {
+    RETURN_IF_CUDNN_ERROR(cudnnLRNCrossChannelForward(
+        cudnn.handle(), normalize.handle(), CUDNN_LRN_CROSS_CHANNEL_DIM1,
+        &alpha, dims.handle(), input_data.opaque(), &beta, dims.handle(),
+        output_data->opaque()));
+    return port::Status::OK();
+  }();
+  return IsStatusOk(status, /*report_error=*/true);
 }
 
 bool CudnnSupport::DoNormalizeBackwardWithDimensions(
@@ -4089,16 +3754,15 @@ bool CudnnSupport::DoNormalizeBackwardWithDimensions(
   float beta = 0.0f;
 
   auto cudnn = cudnn_->GetHandle(parent_, stream);
-  auto status = cudnnLRNCrossChannelBackward(
-      cudnn.handle(), normalize.handle(), CUDNN_LRN_CROSS_CHANNEL_DIM1, &alpha,
-      dims.handle(), normalized_data.opaque(), dims.handle(),
-      normalized_variable_gradient.opaque(), dims.handle(), raw_data.opaque(),
-      &beta, dims.handle(), raw_variable_gradient->opaque());
-  if (status != CUDNN_STATUS_SUCCESS) {
-    LOG(ERROR) << "failed to run cudnnLRNCrossChannelBackward";
-    return false;
-  }
-  return true;
+  auto status = [&] {
+    RETURN_IF_CUDNN_ERROR(cudnnLRNCrossChannelBackward(
+        cudnn.handle(), normalize.handle(), CUDNN_LRN_CROSS_CHANNEL_DIM1,
+        &alpha, dims.handle(), normalized_data.opaque(), dims.handle(),
+        normalized_variable_gradient.opaque(), dims.handle(), raw_data.opaque(),
+        &beta, dims.handle(), raw_variable_gradient->opaque()));
+    return port::Status::OK();
+  }();
+  return IsStatusOk(status, /*report_error=*/true);
 }
 
 bool CudnnSupport::DoDepthConcatenate(
@@ -4213,24 +3877,20 @@ bool CudnnSupport::DeriveOutputBatchDescriptor(
 
   int dn = batch_descriptor.ndims() + 2;
   std::vector<int> dims(dn);  // in BDYX
-  auto status = cudnnGetConvolutionNdForwardOutputDim(
-      conv.handle(), input_nd.handle(), filter.handle(), dn, dims.data());
-  if (status != CUDNN_STATUS_SUCCESS) {
-    LOG(ERROR) << "could not get output tensor for convolution: "
-               << ToString(status);
-    return false;
-  }
-
-  output_batch_descriptor->set_count(dims[0])
-      .set_feature_map_count(dims[1])
-      .set_layout(batch_descriptor.layout());
-
-  for (int i = 0; i < batch_descriptor.ndims(); i++) {
-    output_batch_descriptor->set_spatial_dim(static_cast<dnn::DimIndex>(i),
-                                             dims.rbegin()[i]);
-  }
+  auto status = [&] {
+    RETURN_IF_CUDNN_ERROR(cudnnGetConvolutionNdForwardOutputDim(
+        conv.handle(), input_nd.handle(), filter.handle(), dn, dims.data()));
+    output_batch_descriptor->set_count(dims[0])
+        .set_feature_map_count(dims[1])
+        .set_layout(batch_descriptor.layout());
 
-  return true;
+    for (int i = 0; i < batch_descriptor.ndims(); i++) {
+      output_batch_descriptor->set_spatial_dim(static_cast<dnn::DimIndex>(i),
+                                               dims.rbegin()[i]);
+    }
+    return port::Status::OK();
+  }();
+  return IsStatusOk(status, /*report_error=*/true);
 }
 
 }  // namespace cuda
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.h b/tensorflow/stream_executor/cuda/cuda_dnn.h
index e2de3c62d8..c924d41cb5 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.h
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.h
@@ -631,7 +631,7 @@ class CudnnSupport : public dnn::DnnSupport {
   std::unique_ptr<class CudnnAccess> cudnn_;
 
   template <class T, class U>
-  bool DoBatchNormalizationForwardImpl(
+  port::Status DoBatchNormalizationForwardImpl(
       Stream* stream, dnn::DataType input_data_type,
       dnn::DataType scale_data_type, const DeviceMemory<T>& x,
       const DeviceMemory<U>& scale, const DeviceMemory<U>& offset,
@@ -646,7 +646,7 @@ class CudnnSupport : public dnn::DnnSupport {
       std::function<void()> inv_var_to_var);
 
   template <class T, class U>
-  bool DoBatchNormalizationBackwardImpl(
+  port::Status DoBatchNormalizationBackwardImpl(
       Stream* stream, int cudnn_input_type, int cudnn_scale_type,
       const DeviceMemory<T>& y_backprop, const DeviceMemory<T>& x,
       const DeviceMemory<U>& scale, const DeviceMemory<U>& mean,
@@ -656,21 +656,20 @@ class CudnnSupport : public dnn::DnnSupport {
       DeviceMemory<U>* offset_backprop);
 
   template <class T>
-  bool DoConvolveImpl(Stream* stream,
-                      const dnn::BatchDescriptor& input_descriptor,
-                      const DeviceMemory<T>& input_data,
-                      const dnn::FilterDescriptor& filter_descriptor,
-                      const DeviceMemory<T>& filter_data,
-                      const dnn::ConvolutionDescriptor& convolution_descriptor,
-                      const dnn::BatchDescriptor& output_descriptor,
-                      DeviceMemory<T>* output_data,
-                      ScratchAllocator* scratch_allocator,
-                      const dnn::AlgorithmConfig& algorithm_config,
-                      dnn::ProfileResult* output_profile_result);
+  port::Status DoConvolveImpl(
+      Stream* stream, const dnn::BatchDescriptor& input_descriptor,
+      const DeviceMemory<T>& input_data,
+      const dnn::FilterDescriptor& filter_descriptor,
+      const DeviceMemory<T>& filter_data,
+      const dnn::ConvolutionDescriptor& convolution_descriptor,
+      const dnn::BatchDescriptor& output_descriptor,
+      DeviceMemory<T>* output_data, ScratchAllocator* scratch_allocator,
+      const dnn::AlgorithmConfig& algorithm_config,
+      dnn::ProfileResult* output_profile_result);
 
   template <typename Type, typename BiasType, typename ScaleType,
             int cudnn_data_type, int cudnn_compute_type>
-  bool DoFusedConvolveImpl(
+  port::Status DoFusedConvolveImpl(
       Stream* stream, const dnn::BatchDescriptor& conv_input_descriptor,
       const DeviceMemory<Type>& conv_input_data, ScaleType conv_input_scale,
       const dnn::FilterDescriptor& filter_descriptor,
@@ -685,9 +684,8 @@ class CudnnSupport : public dnn::DnnSupport {
       dnn::ProfileResult* output_profile_result);
 
   template <class T>
-  bool DoConvolveBackwardDataImpl(
-      Stream* stream,
-      const dnn::FilterDescriptor& filter_descriptor,
+  port::Status DoConvolveBackwardDataImpl(
+      Stream* stream, const dnn::FilterDescriptor& filter_descriptor,
       const DeviceMemory<T>& filter_data,
       const dnn::BatchDescriptor& output_descriptor,
       DeviceMemory<T> backward_output_data,
@@ -698,10 +696,10 @@ class CudnnSupport : public dnn::DnnSupport {
       dnn::ProfileResult* output_profile_result);
 
   template <class T>
-  bool DoConvolveBackwardFilterImpl(
+  port::Status DoConvolveBackwardFilterImpl(
       Stream* stream, const dnn::BatchDescriptor& input_descriptor,
       const DeviceMemory<T>& input_data,
-      const dnn::BatchDescriptor& output_descriptor_in,
+      const dnn::BatchDescriptor& output_descriptor,
       DeviceMemory<T> backward_output_data,
       const dnn::ConvolutionDescriptor& convolution_descriptor,
       const dnn::FilterDescriptor& filter_descriptor,
@@ -711,56 +709,56 @@ class CudnnSupport : public dnn::DnnSupport {
       dnn::ProfileResult* output_profile_result);
 
   template <class T>
-  bool DoConvolveBackwardBiasImpl(Stream* stream,
-                                  const dnn::BatchDescriptor& input_descriptor,
-                                  const DeviceMemory<T>& input_data,
-                                  const dnn::BatchDescriptor& bias_descriptor,
-                                  DeviceMemory<T>* backward_bias_data);
+  port::Status DoConvolveBackwardBiasImpl(
+      Stream* stream, const dnn::BatchDescriptor& input_descriptor,
+      const DeviceMemory<T>& input_data,
+      const dnn::BatchDescriptor& bias_descriptor,
+      DeviceMemory<T>* backward_bias_data);
 
   template <class T>
-  bool DoRnnForwardImpl(Stream* stream, const CudnnRnnDescriptor& rnn_desc,
-                        const CudnnRnnSequenceTensorDescriptor& input_desc,
-                        const DeviceMemory<T>& input_data,
-                        const CudnnRnnStateTensorDescriptor& input_h_desc,
-                        const DeviceMemory<T>& input_h_data,
-                        const CudnnRnnStateTensorDescriptor& input_c_desc,
-                        const DeviceMemory<T>& input_c_data,
-                        const DeviceMemory<T>& params,
-                        const CudnnRnnSequenceTensorDescriptor& output_desc,
-                        DeviceMemory<T>* output_data,
-                        const CudnnRnnStateTensorDescriptor& output_h_desc,
-                        DeviceMemory<T>* output_h_data,
-                        const CudnnRnnStateTensorDescriptor& output_c_desc,
-                        DeviceMemory<T>* output_c_data, bool is_training,
-                        ScratchAllocator* reserve_space_allocator,
-                        ScratchAllocator* workspace_allocator,
-                        dnn::ProfileResult* output_profile_result);
+  port::Status DoRnnForwardImpl(
+      Stream* stream, const CudnnRnnDescriptor& rnn_desc,
+      const CudnnRnnSequenceTensorDescriptor& input_desc,
+      const DeviceMemory<T>& input_data,
+      const CudnnRnnStateTensorDescriptor& input_h_desc,
+      const DeviceMemory<T>& input_h_data,
+      const CudnnRnnStateTensorDescriptor& input_c_desc,
+      const DeviceMemory<T>& input_c_data, const DeviceMemory<T>& params,
+      const CudnnRnnSequenceTensorDescriptor& output_desc,
+      DeviceMemory<T>* output_data,
+      const CudnnRnnStateTensorDescriptor& output_h_desc,
+      DeviceMemory<T>* output_h_data,
+      const CudnnRnnStateTensorDescriptor& output_c_desc,
+      DeviceMemory<T>* output_c_data, bool is_training,
+      ScratchAllocator* reserve_space_allocator,
+      ScratchAllocator* workspace_allocator,
+      dnn::ProfileResult* output_profile_result);
 
   template <class T>
-  bool DoRnnBackwardImpl(Stream* stream, const CudnnRnnDescriptor& rnn_desc,
-                         const CudnnRnnSequenceTensorDescriptor& input_desc,
-                         const DeviceMemory<T>& input_data,
-                         const CudnnRnnStateTensorDescriptor& input_h_desc,
-                         const DeviceMemory<T>& input_h_data,
-                         const CudnnRnnStateTensorDescriptor& input_c_desc,
-                         const DeviceMemory<T>& input_c_data,
-                         const DeviceMemory<T>& params,
-                         const CudnnRnnSequenceTensorDescriptor& output_desc,
-                         const DeviceMemory<T>& output_data,
-                         const CudnnRnnStateTensorDescriptor& output_h_desc,
-                         const DeviceMemory<T>& output_h_data,
-                         const CudnnRnnStateTensorDescriptor& output_c_desc,
-                         const DeviceMemory<T>& output_c_data,
-                         const DeviceMemory<T>& output_backprop_data,
-                         const DeviceMemory<T>& output_h_backprop_data,
-                         const DeviceMemory<T>& output_c_backprop_data,
-                         DeviceMemory<T>* input_backprop_data,
-                         DeviceMemory<T>* input_h_backprop_data,
-                         DeviceMemory<T>* input_c_backprop_data,
-                         DeviceMemory<T>* params_backprop_data,
-                         DeviceMemory<uint8>* reserve_space_data,
-                         ScratchAllocator* workspace_allocator,
-                         dnn::ProfileResult* output_profile_result);
+  port::Status DoRnnBackwardImpl(
+      Stream* stream, const CudnnRnnDescriptor& rnn_desc,
+      const CudnnRnnSequenceTensorDescriptor& input_desc,
+      const DeviceMemory<T>& input_data,
+      const CudnnRnnStateTensorDescriptor& input_h_desc,
+      const DeviceMemory<T>& input_h_data,
+      const CudnnRnnStateTensorDescriptor& input_c_desc,
+      const DeviceMemory<T>& input_c_data, const DeviceMemory<T>& params,
+      const CudnnRnnSequenceTensorDescriptor& output_desc,
+      const DeviceMemory<T>& output_data,
+      const CudnnRnnStateTensorDescriptor& output_h_desc,
+      const DeviceMemory<T>& output_h_data,
+      const CudnnRnnStateTensorDescriptor& output_c_desc,
+      const DeviceMemory<T>& output_c_data,
+      const DeviceMemory<T>& output_backprop_data,
+      const DeviceMemory<T>& output_h_backprop_data,
+      const DeviceMemory<T>& output_c_backprop_data,
+      DeviceMemory<T>* input_backprop_data,
+      DeviceMemory<T>* input_h_backprop_data,
+      DeviceMemory<T>* input_c_backprop_data,
+      DeviceMemory<T>* params_backprop_data,
+      DeviceMemory<uint8>* reserve_space_data,
+      ScratchAllocator* workspace_allocator,
+      dnn::ProfileResult* output_profile_result);
 
   SE_DISALLOW_COPY_AND_ASSIGN(CudnnSupport);
 };
diff --git a/tensorflow/stream_executor/cuda/cuda_timer.h b/tensorflow/stream_executor/cuda/cuda_timer.h
index 70554ec931..e040cf86fa 100644
--- a/tensorflow/stream_executor/cuda/cuda_timer.h
+++ b/tensorflow/stream_executor/cuda/cuda_timer.h
@@ -37,8 +37,9 @@ class CUDATimer : public internal::TimerInterface {
   explicit CUDATimer(CUDAExecutor *parent)
       : parent_(parent), start_event_(nullptr), stop_event_(nullptr) {}
 
-  // Note: teardown is explicitly handled in this API by a call to
+  // Note: teardown needs to be explicitly handled in this API by a call to
   // StreamExecutor::DeallocateTimer(), which invokes Destroy().
+  // TODO(csigg): Change to RAII.
   ~CUDATimer() override {}
 
   // Allocates the platform-specific pieces of the timer, called as part of
diff --git a/tensorflow/stream_executor/dnn.cc b/tensorflow/stream_executor/dnn.cc
index 5315d1f3da..82aa8ceb32 100644
--- a/tensorflow/stream_executor/dnn.cc
+++ b/tensorflow/stream_executor/dnn.cc
@@ -141,6 +141,10 @@ string PadAlignmentString(PadAlignment alignment) {
   return "unknown pad alignment";
 }
 
+std::ostream& operator<<(std::ostream& str, dnn::PadAlignment alignment) {
+  return str << PadAlignmentString(alignment);
+}
+
 string ShortPoolingModeString(PoolingMode mode) {
   switch (mode) {
     case PoolingMode::kMaximum:
diff --git a/tensorflow/stream_executor/dnn.h b/tensorflow/stream_executor/dnn.h
index 3df5365c23..9eca5abe1a 100644
--- a/tensorflow/stream_executor/dnn.h
+++ b/tensorflow/stream_executor/dnn.h
@@ -469,6 +469,9 @@ enum class PadAlignment : int64 {
 // Returns a string representation of the given padding alignment.
 string PadAlignmentString(PadAlignment alignment);
 
+// Print alignment to str. Needed to use CHECK_EQ between two PadAlignments.
+std::ostream& operator<<(std::ostream& str, dnn::PadAlignment alignment);
+
 // Describes a convolution.
 //
 // Uses the named argument construction form:
@@ -710,7 +713,7 @@ class PoolingDescriptor {
 class AlgorithmDesc {
  public:
   typedef int64 Index;
-  AlgorithmDesc() : algo_(kDefaultAlgorithm), tensor_ops_enabled_(false) {}
+  AlgorithmDesc() : algo_(kDefaultAlgorithm), tensor_ops_enabled_(true) {}
   AlgorithmDesc(Index a, bool use_tensor_ops)
       : algo_(a), tensor_ops_enabled_(use_tensor_ops) {}
   bool is_default() const { return algo_ == kDefaultAlgorithm; }
-- 
GitLab


From 73e5438b725b46e745e6e910c6557b51a321c70f Mon Sep 17 00:00:00 2001
From: Xiaoqiang Zheng <zhengxq@google.com>
Date: Fri, 1 Jun 2018 00:30:10 -0700
Subject: [PATCH 157/610] Remove the constructor in shared memory.

PiperOrigin-RevId: 198837256
---
 tensorflow/core/kernels/conv_ops_gpu_3.cu.cc       |  8 +++++++-
 tensorflow/core/kernels/reduction_gpu_kernels.cu.h | 12 ++++++++++--
 2 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc b/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc
index a2e7342b04..a5fa48f85e 100644
--- a/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc
+++ b/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc
@@ -247,7 +247,13 @@ __global__ void SwapDimension1And2InTensor3UsingTiles(
   constexpr int ReadRowPerPass = NumThreads / TileSizeJ;
   constexpr int WriteRowPerPass = NumThreads / TileSizeI;
   // One extra line in the inner dimension to avoid share memory bank conflict.
-  __shared__ T shared_memory_tile[TileSizeI][TileSizeJ + 1];
+  // This is to mimic the following, but no constructor of T can be invoked.
+  //     __shared__ T shared_memory_tile[TileSizeI][TileSizeJ + 1];
+  __shared__ __align__(
+      alignof(T)) char shared_mem_raw[TileSizeI * (TileSizeJ + 1) * sizeof(T)];
+  typedef T(*SharedMemoryTile)[TileSizeJ + 1];
+  SharedMemoryTile shared_memory_tile =
+      reinterpret_cast<SharedMemoryTile>(shared_mem_raw);
 
   int x = threadIdx.x;
 
diff --git a/tensorflow/core/kernels/reduction_gpu_kernels.cu.h b/tensorflow/core/kernels/reduction_gpu_kernels.cu.h
index 0de2ebb590..6655084045 100644
--- a/tensorflow/core/kernels/reduction_gpu_kernels.cu.h
+++ b/tensorflow/core/kernels/reduction_gpu_kernels.cu.h
@@ -295,7 +295,11 @@ __global__ void ColumnReduceMax16ColumnsKernel(
 
   // 1D array necessary due to bug in CUDA 9 compiler.
   // TODO(nluehr) revert to 2D array when compiler is ready.
-  __shared__ storage_type<value_type> partial_sums[32 * 33];
+  // This is to mimic the following, but without any constructors:
+  //   __shared__ storage_type<value_type> partial_sums[32 * 33];
+  __shared__ __align__(
+      alignof(value_type)) char partial_sums_raw[32 * 33 * sizeof(value_type)];
+  value_type* partial_sums = reinterpret_cast<value_type*>(partial_sums_raw);
 
   row += rows_per_warp * gridDim.y * blockDim.y;
   for (; row < num_rows; row += rows_per_warp * gridDim.y * blockDim.y) {
@@ -344,7 +348,11 @@ __global__ void ColumnReduceKernel(
 
   // 1D array necessary due to bug in CUDA 9 compiler.
   // TODO(nluehr) revert to 2D array when compiler is ready.
-  __shared__ storage_type<value_type> partial_sums[32 * 33];
+  // This is to mimic the following, but without constructors:
+  //     __shared__ storage_type<value_type> partial_sums[32 * 33];
+  __shared__ __align__(
+      alignof(value_type)) char partial_sums_raw[32 * 33 * sizeof(value_type)];
+  value_type* partial_sums = reinterpret_cast<value_type*>(partial_sums_raw);
 
   row += gridDim.y * blockDim.y;
 
-- 
GitLab


From c9fb2a51307ca8597b7d2d436fcdd28a88e78ba5 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 1 Jun 2018 01:40:14 -0700
Subject: [PATCH 158/610] Use ConstantDataArray to lower arrays of constants.

For large constants, creating an llvm::Constant for each element can lead to prohibitively long compile times.

PiperOrigin-RevId: 198843141
---
 .../compiler/xla/service/cpu/ir_emitter.cc    | 19 +++++---
 .../compiler/xla/service/cpu/ir_emitter.h     |  5 +-
 .../cpu/tests/cpu_external_constants_test.cc  |  4 +-
 .../cpu/tests/cpu_literal_caching_test.cc     | 22 ++++-----
 .../xla/service/cpu/tests/cpu_outfeed_test.cc |  2 +-
 .../compiler/xla/service/gpu/ir_emitter.cc    |  5 +-
 .../xla/service/llvm_ir/fused_ir_emitter.cc   |  4 +-
 .../compiler/xla/service/llvm_ir/llvm_util.cc | 47 +++++++++++++++++--
 8 files changed, 78 insertions(+), 30 deletions(-)

diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
index f6c8593632..a4141dee01 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
@@ -160,39 +160,44 @@ Status IrEmitter::HandleBitcast(HloInstruction* bitcast) {
   return Status::OK();
 }
 
-llvm::GlobalVariable* IrEmitter::EmitGlobalForLiteral(const Literal& literal) {
-  llvm::GlobalVariable* result;
+llvm::Constant* IrEmitter::EmitGlobalForLiteral(const Literal& literal) {
+  llvm::Constant* result;
 
   // We avoid creating large constants in the LLVM IR since LLVM is not
   // efficient for large constant arrays.  We still emit "small enough" constant
   // arrays into the Ir, in the off chance the LLVM optimizer can do something
   // interesting with it.
+  //
+  // TODO(b/29904935): Remove the large constant pool.
   const int kMaxInternalConstantSizeInBytes = 128;
   if (external_constant_pool_ &&
       ByteSizeOf(literal.shape()) >= kMaxInternalConstantSizeInBytes) {
     string global_name = tensorflow::strings::StrCat(
         "constant_global_", external_global_constant_counter_++);
-    result = new llvm::GlobalVariable(
+    llvm::GlobalVariable* result_global = new llvm::GlobalVariable(
         /*Module=*/*module_,
         /*Type=*/IrShapeType(literal.shape()),
         /*isConstant=*/true,
         /*Linkage=*/llvm::GlobalValue::ExternalLinkage,
         /*Initializer=*/nullptr,
         /*Name=*/AsStringRef(global_name));
-    result->setAlignment(MinimumAlignmentForShape(literal.shape()));
+    result_global->setAlignment(MinimumAlignmentForShape(literal.shape()));
     external_constant_pool_->Insert(global_name, literal,
                                     MinimumAlignmentForShape(literal.shape()));
+    result = result_global;
   } else {
     llvm::Constant* initializer =
         llvm_ir::ConvertLiteralToIrConstant(literal, module_);
-    result = new llvm::GlobalVariable(
+    llvm::GlobalVariable* result_global = new llvm::GlobalVariable(
         /*Module=*/*module_,
         /*Type=*/initializer->getType(),
         /*isConstant=*/true,
         /*Linkage=*/llvm::GlobalValue::PrivateLinkage,
         /*Initializer=*/initializer,
         /*Name=*/"");
-    result->setAlignment(MinimumAlignmentForShape(literal.shape()));
+    result_global->setAlignment(MinimumAlignmentForShape(literal.shape()));
+    result = llvm::ConstantExpr::getBitCast(
+        result_global, IrShapeType(literal.shape())->getPointerTo());
   }
   return result;
 }
@@ -200,7 +205,7 @@ llvm::GlobalVariable* IrEmitter::EmitGlobalForLiteral(const Literal& literal) {
 Status IrEmitter::HandleConstant(HloInstruction* constant) {
   VLOG(2) << "HandleConstant: " << constant->ToString();
   const Literal& literal = constant->literal();
-  llvm::GlobalVariable* global_for_const;
+  llvm::Constant* global_for_const;
 
   auto it = emitted_literals_.find(&literal);
   if (it != emitted_literals_.end()) {
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.h b/tensorflow/compiler/xla/service/cpu/ir_emitter.h
index f49cfc1dc3..32c536e18f 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emitter.h
+++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.h
@@ -527,7 +527,8 @@ class IrEmitter : public DfsHloVisitorWithDefault {
   Status EmitXfeedTransfer(XfeedKind kind, const Shape& shape,
                            llvm::Value* program_buffer_address);
 
-  llvm::GlobalVariable* EmitGlobalForLiteral(const Literal& literal);
+  // Returns the global itself (external pool) or a ConstantExpr bitcast of it.
+  llvm::Constant* EmitGlobalForLiteral(const Literal& literal);
 
   const HloModuleConfig& hlo_module_config_;
 
@@ -548,7 +549,7 @@ class IrEmitter : public DfsHloVisitorWithDefault {
     }
   };
 
-  tensorflow::gtl::FlatMap<const Literal*, llvm::GlobalVariable*,
+  tensorflow::gtl::FlatMap<const Literal*, llvm::Constant*,
                            LiteralPtrHashFunctor, LiteralPtrEqualityFunctor>
       emitted_literals_;
 
diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_external_constants_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_external_constants_test.cc
index ed8f375bd6..faac927027 100644
--- a/tensorflow/compiler/xla/service/cpu/tests/cpu_external_constants_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_external_constants_test.cc
@@ -64,8 +64,8 @@ TEST_F(CpuExternalConstantsTest, BasicNegative) {
   // The constant array in this test case is small enough that there is no need
   // to externalize it.
   TestWithArray(/*rows=*/4, /*cols=*/4, R"(
-CHECK-NOT: @constant_global_0 = external constant [4 x [4 x float]], align 8
-CHECK: @0 = private constant [4 x [4 x float]] {{.*}}, align 8
+CHECK-NOT: @constant_global_0 = external constant [16 x float], align 8
+CHECK: @0 = private constant [16 x float] {{.*}}, align 8
 )");
 }
 }  // namespace
diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_literal_caching_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_literal_caching_test.cc
index d6e0425c55..3cb25c5c19 100644
--- a/tensorflow/compiler/xla/service/cpu/tests/cpu_literal_caching_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_literal_caching_test.cc
@@ -55,8 +55,8 @@ ENTRY main {
 )";
 
   string filecheck_pattern = R"(
-CHECK: private constant [2 x [3 x [2 x float]]]
-CHECK-NOT: private constant [2 x [3 x [2 x float]]]
+CHECK: private constant [12 x float]
+CHECK-NOT: private constant [12 x float]
 )";
 
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
@@ -78,30 +78,30 @@ TEST_F(CpuDuplicateConstantsTest, RepeatedTupleConstants) {
 HloModule RepeatedConstants
 
 while_body {
-  arg_body = (f32[2,1]{1,0}, f32[2]{0}) parameter(0)
-  ROOT const = (f32[2,1]{1,0}, f32[2]{0}) constant((f32[2,1], f32[2]) ( f32[2,1] { { 1 }, { 2 } }, {2, 42} ))
+  arg_body = (f32[2,1]{1,0}, f32[1]{0}) parameter(0)
+  ROOT const = (f32[2,1]{1,0}, f32[1]{0}) constant((f32[2,1], f32[1]) ( f32[2,1] { { 1 }, { 2 } }, {2} ))
 }
 
 while_cond {
-  arg_cond = (f32[2,1]{1,0}, f32[2]{0}) parameter(0)
+  arg_cond = (f32[2,1]{1,0}, f32[1]{0}) parameter(0)
   ROOT unknown = pred[] infeed()
 }
 
 ENTRY main {
   param = f32[2,3,2] parameter(0)
-  const_a = (f32[2,1]{1,0}, f32[2]{0}) constant((f32[2,1], f32[2]) ( f32[2,1] { { 1 }, { 2 } }, {2, 42} ))
-  const_b = (f32[2,1]{1,0}, f32[2]{0}) while((f32[2,1]{1,0}, f32[2]{0}) const_a), condition=while_cond, body=while_body
+  const_a = (f32[2,1]{1,0}, f32[1]{0}) constant((f32[2,1], f32[1]) ( f32[2,1] { { 1 }, { 2 } }, {2} ))
+  const_b = (f32[2,1]{1,0}, f32[1]{0}) while((f32[2,1]{1,0}, f32[1]{0}) const_a), condition=while_cond, body=while_body
 
-  out0 = () outfeed((f32[2,1]{1,0}, f32[2]{0}) const_a)
-  ROOT out1 = () outfeed((f32[2,1]{1,0}, f32[2]{0}) const_b)
+  out0 = () outfeed((f32[2,1]{1,0}, f32[1]{0}) const_a)
+  ROOT out1 = () outfeed((f32[2,1]{1,0}, f32[1]{0}) const_b)
 }
 )";
 
   string filecheck_pattern = R"(
+CHECK: private constant [1 x float]
 CHECK: private constant [2 x float]
-CHECK: private constant [2 x [1 x float]]
+CHECK-NOT: private constant [1 x float]
 CHECK-NOT: private constant [2 x float]
-CHECK-NOT: private constant [2 x [1 x float]]
 )";
 
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_outfeed_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_outfeed_test.cc
index 879372eb13..1a948fb4fe 100644
--- a/tensorflow/compiler/xla/service/cpu/tests/cpu_outfeed_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_outfeed_test.cc
@@ -37,7 +37,7 @@ ENTRY main {
 )";
 
   string filecheck_pattern = R"(
-CHECK: private constant [2 x [3 x [2 x float]]]
+CHECK: private constant [12 x float]
 )";
 
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc
index 1e0db2821a..547af33e9a 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc
@@ -94,7 +94,10 @@ Status IrEmitter::HandleConstant(HloInstruction* constant) {
           << std::endl
           << "  its type: "
           << llvm_ir::DumpToString(*global_for_const->getType());
-  bindings_.BindHloToIrValue(*constant, global_for_const);
+  llvm::Constant* shape_constant = llvm::ConstantExpr::getBitCast(
+      global_for_const,
+      llvm_ir::ShapeToIrType(literal.shape(), module_)->getPointerTo());
+  bindings_.BindHloToIrValue(*constant, shape_constant);
   return Status::OK();
 }
 
diff --git a/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.cc b/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.cc
index f172b1d87c..d909845a3a 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.cc
@@ -80,8 +80,10 @@ Status FusedIrEmitter::HandleConstant(HloInstruction* constant) {
       *ir_builder_->GetInsertBlock()->getModule(), initializer->getType(),
       /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, initializer,
       /*Name=*/"");
+  llvm::Constant* shape_constant = llvm::ConstantExpr::getBitCast(
+      global, llvm_ir::ShapeToIrType(literal.shape(), module_)->getPointerTo());
   generators_[constant] = [=](const IrArray::Index& index) {
-    return IrArray(global, constant->shape())
+    return IrArray(shape_constant, constant->shape())
         .EmitReadArrayElement(index, ir_builder_);
   };
 
diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc
index ec04239b4f..bd45f83fb1 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc
@@ -368,15 +368,52 @@ llvm::Constant* LiteralToConstant(const Literal& literal, int64 dimension_index,
   return llvm::ConstantArray::get(aggregate_type, elements);
 }
 
+template <typename T>
+llvm::Constant* GetConstantDataArray(const Literal& literal,
+                                     llvm::Module* module) {
+  const T* data = static_cast<const T*>(literal.untyped_data());
+  int64 num_elements = literal.size_bytes() / sizeof(T);
+  return llvm::ConstantDataArray::get(module->getContext(),
+                                      llvm::makeArrayRef(data, num_elements));
+}
+
 }  // namespace
 
 llvm::Constant* ConvertLiteralToIrConstant(const Literal& literal,
                                            llvm::Module* module) {
-  std::vector<int64> multi_index(ShapeUtil::Rank(literal.shape()), 0);
-  llvm::Constant* value = LiteralToConstant(
-      literal, /*dimension_index=*/ShapeUtil::Rank(literal.shape()) - 1,
-      &multi_index, module);
-  return value;
+  const Shape& shape = literal.shape();
+  // TODO(b/29904935): We can get rid of this switch by exposing a
+  // ConstantDataArray factory method that takes a llvm::Type and a StringRef.
+  switch (shape.element_type()) {
+    case U64:
+      return GetConstantDataArray<uint64>(literal, module);
+    case U32:
+      return GetConstantDataArray<uint32>(literal, module);
+    case U8:
+      return GetConstantDataArray<uint8>(literal, module);
+    case S64:
+      return GetConstantDataArray<int64>(literal, module);
+    case S32:
+      return GetConstantDataArray<int32>(literal, module);
+    case F64:
+      return GetConstantDataArray<double>(literal, module);
+    case F32:
+      return GetConstantDataArray<float>(literal, module);
+    case BF16:
+    case F16:
+      return GetConstantDataArray<uint16>(literal, module);
+    case PRED:
+      return GetConstantDataArray<bool>(literal, module);
+    // TODO(b/29904935): Also use ConstantDataArray for complex numbers.
+    case C64: {
+      int64 dimensions = ShapeUtil::Rank(shape);
+      std::vector<int64> multi_index(dimensions, 0);
+      return LiteralToConstant(literal, /*dimension_index=*/dimensions - 1,
+                               &multi_index, module);
+    }
+    default:
+      LOG(FATAL) << "unsupported type " << shape.element_type();
+  }
 }
 
 llvm::AllocaInst* EmitAllocaAtFunctionEntry(llvm::Type* type,
-- 
GitLab


From 246a056bce8bdef5ffe9221355dc90b1e08448e9 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 1 Jun 2018 03:17:57 -0700
Subject: [PATCH 159/610] Fix a bug where the unspecified dtype of shape_acc
 can cause a type mismatch.

PiperOrigin-RevId: 198850955
---
 tensorflow/python/ops/control_flow_ops.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/ops/control_flow_ops.py b/tensorflow/python/ops/control_flow_ops.py
index ee024ce64a..2e5a801f8e 100644
--- a/tensorflow/python/ops/control_flow_ops.py
+++ b/tensorflow/python/ops/control_flow_ops.py
@@ -2729,7 +2729,8 @@ class WhileContext(ControlFlowContext):
           self.outer_context.Exit()
       else:
         shape_acc = array_ops.zeros_like(
-            array_ops.shape_internal(op.inputs[0], optimize=False),
+            array_ops.shape_internal(op.inputs[0], optimize=False,
+                                     out_type=dense_shape.dtype),
             optimize=False)
 
     if self.outer_context:
-- 
GitLab


From 347e69fd71430437e1dba6b9ae58b32e4a2f3c83 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 1 Jun 2018 05:22:14 -0700
Subject: [PATCH 160/610] Support bfloat16 in LiteralBase::Slice

PiperOrigin-RevId: 198859282
---
 tensorflow/compiler/xla/literal_util.cc | 63 +++++++++----------------
 tensorflow/compiler/xla/literal_util.h  |  6 +++
 2 files changed, 29 insertions(+), 40 deletions(-)

diff --git a/tensorflow/compiler/xla/literal_util.cc b/tensorflow/compiler/xla/literal_util.cc
index 7563cc1e34..61afc311a7 100644
--- a/tensorflow/compiler/xla/literal_util.cc
+++ b/tensorflow/compiler/xla/literal_util.cc
@@ -987,6 +987,23 @@ std::unique_ptr<Literal> LiteralBase::Transpose(
   return new_literal;
 }
 
+template <typename NativeT>
+std::unique_ptr<Literal> LiteralBase::SliceInternal(
+    const Shape& result_shape,
+    tensorflow::gtl::ArraySlice<int64> start_indices) const {
+  auto result_literal = MakeUnique<Literal>(result_shape);
+  DimensionVector new_indices(ShapeUtil::Rank(result_shape));
+  result_literal->EachCell<NativeT>(
+      [&](tensorflow::gtl::ArraySlice<int64> indices, NativeT /*value*/) {
+        for (int64 i = 0; i < ShapeUtil::Rank(result_shape); ++i) {
+          new_indices[i] = indices[i] + start_indices[i];
+        }
+        NativeT value = Get<NativeT>(new_indices);
+        result_literal->Set<NativeT>(indices, value);
+      });
+  return result_literal;
+}
+
 std::unique_ptr<Literal> LiteralBase::Slice(
     tensorflow::gtl::ArraySlice<int64> start_indices,
     tensorflow::gtl::ArraySlice<int64> limit_indices) const {
@@ -1004,51 +1021,17 @@ std::unique_ptr<Literal> LiteralBase::Slice(
   const auto result_shape =
       ShapeUtil::MakeShapeWithLayout(shape().element_type(), result_dimensions,
                                      LayoutUtil::MinorToMajor(shape()));
-
-  auto result_literal = MakeUnique<Literal>(result_shape);
-
-  DimensionVector new_indices(ShapeUtil::Rank(result_shape));
   switch (result_shape.element_type()) {
     case F32:
-      result_literal->EachCell<float>(
-          [&](tensorflow::gtl::ArraySlice<int64> indices, float /*value*/) {
-            for (int64 i = 0; i < ShapeUtil::Rank(result_shape); ++i) {
-              new_indices[i] = indices[i] + start_indices[i];
-            }
-            float value = Get<float>(new_indices);
-            result_literal->Set<float>(indices, value);
-          });
-      return result_literal;
+      return SliceInternal<float>(result_shape, start_indices);
+    case BF16:
+      return SliceInternal<bfloat16>(result_shape, start_indices);
     case C64:
-      result_literal->EachCell<complex64>(
-          [&](tensorflow::gtl::ArraySlice<int64> indices, complex64 /*value*/) {
-            for (int64 i = 0; i < ShapeUtil::Rank(result_shape); ++i) {
-              new_indices[i] = indices[i] + start_indices[i];
-            }
-            complex64 value = Get<complex64>(new_indices);
-            result_literal->Set<complex64>(indices, value);
-          });
-      return result_literal;
+      return SliceInternal<complex64>(result_shape, start_indices);
     case S32:
-      result_literal->EachCell<int32>(
-          [&](tensorflow::gtl::ArraySlice<int64> indices, int32 /*value*/) {
-            for (int64 i = 0; i < ShapeUtil::Rank(result_shape); ++i) {
-              new_indices[i] = indices[i] + start_indices[i];
-            }
-            int32 value = Get<int32>(new_indices);
-            result_literal->Set<int32>(indices, value);
-          });
-      return result_literal;
+      return SliceInternal<int32>(result_shape, start_indices);
     case U32:
-      result_literal->EachCell<uint32>(
-          [&](tensorflow::gtl::ArraySlice<int64> indices, uint32 /*value*/) {
-            for (int64 i = 0; i < ShapeUtil::Rank(result_shape); ++i) {
-              new_indices[i] = indices[i] + start_indices[i];
-            }
-            uint32 value = Get<uint32>(new_indices);
-            result_literal->Set<uint32>(indices, value);
-          });
-      return result_literal;
+      return SliceInternal<uint32>(result_shape, start_indices);
     default:
       LOG(FATAL) << "not yet implemented: "
                  << PrimitiveType_Name(result_shape.element_type());
diff --git a/tensorflow/compiler/xla/literal_util.h b/tensorflow/compiler/xla/literal_util.h
index 2ca9060cc7..1e26eb7ad4 100644
--- a/tensorflow/compiler/xla/literal_util.h
+++ b/tensorflow/compiler/xla/literal_util.h
@@ -542,6 +542,12 @@ class LiteralBase {
   friend class Literal;
   friend class LiteralSlice;
   friend class BorrowingLiteral;
+
+ private:
+  template <typename NativeT>
+  std::unique_ptr<Literal> SliceInternal(
+      const Shape& result_shape,
+      tensorflow::gtl::ArraySlice<int64> start_indices) const;
 };
 
 // Class representing literal values in XLA.
-- 
GitLab


From 75a7b910904cc8993713cd6283beaeacc915a2a5 Mon Sep 17 00:00:00 2001
From: Peter Hawkins <phawkins@google.com>
Date: Fri, 1 Jun 2018 06:06:34 -0700
Subject: [PATCH 161/610] Mark
 tensorflow/python/kernel_tests/linalg:linear_operator_identity_test as
 optonly due to flakiness.

PiperOrigin-RevId: 198862313
---
 tensorflow/python/kernel_tests/linalg/BUILD | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/python/kernel_tests/linalg/BUILD b/tensorflow/python/kernel_tests/linalg/BUILD
index 91be80322c..0123adc2c3 100644
--- a/tensorflow/python/kernel_tests/linalg/BUILD
+++ b/tensorflow/python/kernel_tests/linalg/BUILD
@@ -124,6 +124,7 @@ cuda_py_test(
         "//tensorflow/python:random_ops",
     ],
     shard_count = 5,
+    tags = ["optonly"],  # Test is flaky without optimization.
 )
 
 cuda_py_test(
-- 
GitLab


From e6aca210f1082e4cb8cf3d0f775a79042b48f68a Mon Sep 17 00:00:00 2001
From: Dan Moldovan <mdan@google.com>
Date: Fri, 1 Jun 2018 06:17:33 -0700
Subject: [PATCH 162/610] Disable test on windows until we figure out what's
 wrong.

PiperOrigin-RevId: 198863091
---
 tensorflow/contrib/autograph/pyct/BUILD | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/contrib/autograph/pyct/BUILD b/tensorflow/contrib/autograph/pyct/BUILD
index 796ab445c7..989b821e53 100644
--- a/tensorflow/contrib/autograph/pyct/BUILD
+++ b/tensorflow/contrib/autograph/pyct/BUILD
@@ -130,6 +130,7 @@ py_test(
     name = "transformer_test",
     srcs = ["transformer_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["no_windows"],
     deps = [
         ":pyct",
         "//tensorflow/python:client_testlib",
-- 
GitLab


From 4349f663375ecbb7e678d1e86606380e42d431ae Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 1 Jun 2018 07:30:28 -0700
Subject: [PATCH 163/610] Resubmitting CL 196349902: Adding cuDNN header
 dependency to targets that include the cuDNN header file.

PiperOrigin-RevId: 198869605
---
 tensorflow/contrib/fused_conv/BUILD     | 2 ++
 tensorflow/core/grappler/clusters/BUILD | 3 +++
 tensorflow/core/grappler/costs/BUILD    | 3 +++
 tensorflow/core/kernels/BUILD           | 4 ++--
 third_party/gpus/cuda/BUILD.tpl         | 9 +++++++++
 5 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/fused_conv/BUILD b/tensorflow/contrib/fused_conv/BUILD
index 0eb6889db1..0f0813c07f 100644
--- a/tensorflow/contrib/fused_conv/BUILD
+++ b/tensorflow/contrib/fused_conv/BUILD
@@ -75,6 +75,7 @@ tf_kernel_library(
         "//tensorflow/core/kernels:gpu_util_hdrs",
         "//tensorflow/core/kernels:ops_util_hdrs",
         "//third_party/eigen3",
+        "@local_config_cuda//cuda:cudnn_header",
     ],
     alwayslink = 1,
 )
@@ -94,6 +95,7 @@ tf_custom_op_library(
         "//tensorflow/core/kernels:conv_ops_gpu_hdrs",
         "//tensorflow/core/kernels:gpu_util_hdrs",
         "//tensorflow/core/kernels:ops_util_hdrs",
+        "@local_config_cuda//cuda:cudnn_header",
     ],
 )
 
diff --git a/tensorflow/core/grappler/clusters/BUILD b/tensorflow/core/grappler/clusters/BUILD
index 30c6126fbb..d0b2cf01be 100644
--- a/tensorflow/core/grappler/clusters/BUILD
+++ b/tensorflow/core/grappler/clusters/BUILD
@@ -20,6 +20,9 @@ tf_cuda_library(
     name = "utils",
     srcs = ["utils.cc"],
     hdrs = ["utils.h"],
+    cuda_deps = [
+        "@local_config_cuda//cuda:cudnn_header",
+    ],
     visibility = ["//visibility:public"],
     deps = [
         "//third_party/eigen3",
diff --git a/tensorflow/core/grappler/costs/BUILD b/tensorflow/core/grappler/costs/BUILD
index 35f11eac29..b054068299 100644
--- a/tensorflow/core/grappler/costs/BUILD
+++ b/tensorflow/core/grappler/costs/BUILD
@@ -129,6 +129,9 @@ tf_cuda_library(
     name = "utils",
     srcs = ["utils.cc"],
     hdrs = ["utils.h"],
+    cuda_deps = [
+        "@local_config_cuda//cuda:cudnn_header",
+    ],
     visibility = ["//visibility:public"],
     deps = [
         "//third_party/eigen3",
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index 5948f8d39f..f9e1d37b08 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -3300,7 +3300,7 @@ tf_kernel_library(
         "//tensorflow/core:nn_ops_op_lib",
     ] + if_cuda([
         "@cub_archive//:cub",
-        "@local_config_cuda//cuda:cudnn",
+        "@local_config_cuda//cuda:cudnn_header",
     ]),
 )
 
@@ -3319,7 +3319,7 @@ tf_kernel_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:nn_ops_op_lib",
     ] + if_cuda([
-        "@local_config_cuda//cuda:cudnn",
+        "@local_config_cuda//cuda:cudnn_header",
     ]),
 )
 
diff --git a/third_party/gpus/cuda/BUILD.tpl b/third_party/gpus/cuda/BUILD.tpl
index 2a37c65bc7..f6b497f813 100644
--- a/third_party/gpus/cuda/BUILD.tpl
+++ b/third_party/gpus/cuda/BUILD.tpl
@@ -127,6 +127,15 @@ cc_library(
     visibility = ["//visibility:public"],
 )
 
+cc_library(
+    name = "cudnn_header",
+    includes = [
+        ".",
+        "cuda/include",
+    ],
+    visibility = ["//visibility:public"],
+)
+
 cc_library(
     name = "cufft",
     srcs = ["cuda/lib/%{cufft_lib}"],
-- 
GitLab


From ccbb84022008c5a789b3767c3b1abf0806b4e3b6 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 1 Jun 2018 08:18:39 -0700
Subject: [PATCH 164/610] implement a generic reduce method so that later we
 can easily implement reduce_{sum,prod,etc}

PiperOrigin-RevId: 198874465
---
 .../internal/reference/reference_ops.h        | 131 +++++++++++++-----
 .../contrib/lite/testing/generate_examples.py | 122 +++++++++-------
 2 files changed, 166 insertions(+), 87 deletions(-)

diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
index ef055929a9..ca5a20ad4f 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
@@ -3505,63 +3505,124 @@ inline void Exp(const T* input_data, const size_t num_elements,
   }
 }
 
+// A generic reduce method that can be used for reduce_sum, reduce_mean, etc.
+// It takes a reducer function as input and returns false when numeric overflow
+// is detected.
+// This method iterates through input data and reduces elements along the
+// dimensions given in axis.
+template <typename In, typename Out>
+inline bool Reduce(const In* input_data, const int* input_dims,
+                   const int* output_dims, const int input_num_dims,
+                   const int output_num_dims, const int* axis,
+                   const int num_axis, int* input_iter,
+                   Out reducer(Out current, const In in, bool* overflow),
+                   Out* output_data) {
+  // Reset input iterator.
+  TFLITE_DCHECK(input_num_dims > 0);
+  for (int idx = 0; idx < input_num_dims; ++idx) {
+    input_iter[idx] = 0;
+  }
+  // Iterate through input_data.
+  do {
+    size_t input_offset =
+        ReducedOutputOffset(input_num_dims, input_dims, input_iter, 0, nullptr);
+    size_t output_offset = ReducedOutputOffset(input_num_dims, input_dims,
+                                               input_iter, num_axis, axis);
+    bool overflow = false;
+    output_data[output_offset] = reducer(output_data[output_offset],
+                                         input_data[input_offset], &overflow);
+    if (overflow) return false;
+  } while (NextIndex(input_num_dims, input_dims, input_iter));
+  return true;
+}
+
+inline bool ResolveAxis(const int num_dims, const int* axis, const int num_axis,
+                        int* out_axis, int* out_num_axis) {
+  *out_num_axis = 0;  // Just in case.
+  // O(n^2) is fine since out_num_axis should be really small, mostly <= 4.
+  for (int idx = 0; idx < num_axis; ++idx) {
+    // Handle negative index.
+    int current = axis[idx] < 0 ? (axis[idx] + num_dims) : axis[idx];
+    TFLITE_DCHECK(current >= 0 && current < num_dims);
+    bool is_dup = false;
+    for (int j = 0; j < *out_num_axis; ++j) {
+      if (out_axis[j] == current) {
+        is_dup = true;
+        break;
+      }
+    }
+    if (!is_dup) {
+      out_axis[*out_num_axis] = current;
+      *out_num_axis += 1;
+    }
+  }
+  return true;
+}
+
+// This method expects that output_data has been initialized.
+template <typename In, typename Out>
+inline bool ReduceSumImpl(const In* input_data, const int* input_dims,
+                          const int* output_dims, const int input_num_dims,
+                          const int output_num_dims, const int* axis,
+                          const int num_axis, int* input_iter,
+                          Out* output_data) {
+  auto reducer = [](Out current, const In in, bool* overflow) -> Out {
+    const Out actual_in = static_cast<Out>(in);
+    return current + actual_in;
+  };
+  return Reduce<In, Out>(input_data, input_dims, output_dims, input_num_dims,
+                         output_num_dims, axis, num_axis, input_iter, reducer,
+                         output_data);
+}
+
+// Computes the mean of elements across dimensions given in axis.
+// It does so in two stages, first calculates the sum of elements along the axis
+// then divides it by the number of elements in axis.
 template <typename T, typename U>
 inline bool Mean(const T* input_data, const int* input_dims,
                  const int input_num_dims, T* output_data,
                  const int* output_dims, const int output_num_dims,
                  const int* axis, const int num_axis_dimensions, bool keep_dims,
                  int* temp_index, int* resolved_axis, U* temp_sum) {
-  // resets output data.
+  // Reset output data.
   size_t num_outputs = 1;
   for (int idx = 0; idx < output_num_dims; ++idx) {
-    num_outputs *= static_cast<size_t>(output_dims[idx]);
+    size_t current = static_cast<size_t>(output_dims[idx]);
+    // Overflow prevention.
+    if (num_outputs > std::numeric_limits<size_t>::max() / current) {
+      return false;
+    }
+    num_outputs *= current;
   }
   for (size_t idx = 0; idx < num_outputs; ++idx) {
     output_data[idx] = T();
     temp_sum[idx] = U();
   }
-  // resets temp index.
-  for (int idx = 0; idx < input_num_dims; ++idx) {
-    temp_index[idx] = 0;
-  }
-  // resolves axis.
+
+  // Resolve axis.
   int num_resolved_axis = 0;
-  for (int idx = 0; idx < num_axis_dimensions; ++idx) {
-    int current = axis[idx];
-    TFLITE_DCHECK(current < input_num_dims && current + input_num_dims >= 0);
-    if (current < 0) {
-      current += input_num_dims;
-    }
-    bool is_dup = false;
-    for (int j = 0; j < num_resolved_axis; ++j) {
-      if (resolved_axis[j] == current) {
-        is_dup = true;
-        break;
-      }
-    }
-    if (!is_dup) {
-      resolved_axis[num_resolved_axis++] = current;
-    }
+  if (!ResolveAxis(input_num_dims, axis, num_axis_dimensions, resolved_axis,
+                   &num_resolved_axis)) {
+    return false;
   }
-  // iterates through input_data.
-  for (bool has_next = true; has_next;
-       has_next = NextIndex(input_num_dims, input_dims, temp_index)) {
-    size_t input_offset =
-        ReducedOutputOffset(input_num_dims, input_dims, temp_index, 0, nullptr);
-    size_t output_offset =
-        ReducedOutputOffset(input_num_dims, input_dims, temp_index,
-                            num_resolved_axis, resolved_axis);
-    temp_sum[output_offset] += static_cast<U>(input_data[input_offset]);
-  }
-  // takes average by num of elements added to get mean.
-  size_t num_elements_in_axis = 1;
+
+  if (!ReduceSumImpl<T, U>(input_data, input_dims, output_dims, input_num_dims,
+                           output_num_dims, resolved_axis, num_resolved_axis,
+                           temp_index, temp_sum)) {
+    return false;
+  }
+
+  // Calculate mean by dividing output_data by num of aggregated elements.
+  U num_elements_in_axis = 1;
   for (int idx = 0; idx < num_resolved_axis; ++idx) {
     size_t current = static_cast<size_t>(input_dims[resolved_axis[idx]]);
+    // Overflow prevention.
     if (current > (std::numeric_limits<U>::max() / num_elements_in_axis)) {
       return false;
     }
     num_elements_in_axis *= current;
   }
+
   if (num_elements_in_axis > 0) {
     for (size_t idx = 0; idx < num_outputs; ++idx) {
       output_data[idx] =
diff --git a/tensorflow/contrib/lite/testing/generate_examples.py b/tensorflow/contrib/lite/testing/generate_examples.py
index ae66bd858b..6a6d12ed67 100644
--- a/tensorflow/contrib/lite/testing/generate_examples.py
+++ b/tensorflow/contrib/lite/testing/generate_examples.py
@@ -744,65 +744,83 @@ def make_binary_op_tests(zip_path, binary_operator):
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
 
-def make_mean_tests(zip_path):
-  """Make a set of tests to do mean."""
+def make_reduce_tests(reduce_op):
+  """Make a set of tests to do reduce operation.
 
-  test_parameters = [{
-      "input_dtype": [tf.float32, tf.int32, tf.int64],
-      "input_shape": [[3, 2, 4]],
-      "axis": [
-          None, 0, 1, 2, [0, 1], [0, 2], [1, 2], [0, 1, 2], [1, 0], [2, 0],
-          [2, 1], [2, 1, 0], [2, 0, 1], -1, -2, -3, [1, -1], [0, -1], [-1, 0],
-          [-1, -2, -3], [0, 0, 0], [2, 2, 0], [1, 0, -3, -3]
-      ],
-      "const_axis": [True, False],
-      "keepdims": [True, False],
-  }, {
-      "input_dtype": [tf.float32],
-      "input_shape": [[1, 8, 8, 3]],
-      "axis": [
-          None, 0, 1, 2, 3, [1, 2], [0, 3], [1, 2, 3], [0, 1, 2, 3],
-          [3, 2, 1, 0], [3, 1, 0, 2], [2, 0], [3, 0], [3, 1], [1, 0], -1, -2,
-          -3, -4, [0, -2], [2, 3, -1, 0], [3, 1, 2, -3], [3, -4], [2, 2, 2],
-          [2, 2, 3], [-3, -3, -4], [-3, 2, 1]
-      ],
-      "const_axis": [True, False],
-      "keepdims": [True, False],
-  }]
+  Args:
+    reduce_op: TensorFlow reduce operation to test, e.g. `tf.reduce_mean`.
 
-  def build_graph(parameters):
-    """Build the mean op testing graph."""
-    input_tensor = tf.placeholder(
-        dtype=parameters["input_dtype"],
-        name="input",
-        shape=parameters["input_shape"])
+  Returns:
+    a function representing the true generator with `reduce_op` curried.
+  """
 
-    # Get axis as either a placeholder or constants.
-    if parameters["const_axis"]:
-      axis = parameters["axis"]
-      input_tensors = [input_tensor]
-    else:
-      if isinstance(parameters["axis"], list):
-        shape = [len(parameters["axis"])]
+  def f(zip_path):
+    """Actual function that generates examples."""
+
+    test_parameters = [{
+        "input_dtype": [tf.float32, tf.int32, tf.int64],
+        "input_shape": [[3, 2, 4]],
+        "axis": [
+            None, 0, 1, 2, [0, 1], [0, 2], [1, 2], [0, 1, 2], [1, 0], [2, 0],
+            [2, 1], [2, 1, 0], [2, 0, 1], -1, -2, -3, [1, -1], [0, -1], [-1, 0],
+            [-1, -2, -3], [0, 0, 0], [2, 2, 0], [1, 0, -3, -3]
+        ],
+        "const_axis": [True, False],
+        "keepdims": [True, False],
+    }, {
+        "input_dtype": [tf.float32],
+        "input_shape": [[1, 8, 8, 3]],
+        "axis": [
+            None, 0, 1, 2, 3, [1, 2], [0, 3], [1, 2, 3], [0, 1, 2, 3],
+            [3, 2, 1, 0], [3, 1, 0, 2], [2, 0], [3, 0], [3, 1], [1, 0], -1, -2,
+            -3, -4, [0, -2], [2, 3, -1, 0], [3, 1, 2, -3], [3, -4], [2, 2, 2],
+            [2, 2, 3], [-3, -3, -4], [-3, 2, 1]
+        ],
+        "const_axis": [True, False],
+        "keepdims": [True, False],
+    }]
+
+    def build_graph(parameters):
+      """Build the reduce op testing graph."""
+      input_tensor = tf.placeholder(
+          dtype=parameters["input_dtype"],
+          name="input",
+          shape=parameters["input_shape"])
+
+      # Get axis as either a placeholder or constants.
+      if parameters["const_axis"]:
+        axis = parameters["axis"]
+        input_tensors = [input_tensor]
       else:
-        shape = [0]  # shape for None or integers.
-      axis = tf.placeholder(dtype=tf.int32, name="axis", shape=shape)
-      input_tensors = [input_tensor, axis]
+        if isinstance(parameters["axis"], list):
+          shape = [len(parameters["axis"])]
+        else:
+          shape = [0]  # shape for None or integers.
+        axis = tf.placeholder(dtype=tf.int32, name="axis", shape=shape)
+        input_tensors = [input_tensor, axis]
 
-    out = tf.reduce_mean(
-        input_tensor, axis=axis, keepdims=parameters["keepdims"])
-    return input_tensors, [out]
+      out = reduce_op(
+          input_tensor, axis=axis, keepdims=parameters["keepdims"])
+      return input_tensors, [out]
 
-  def build_inputs(parameters, sess, inputs, outputs):
-    values = [
-        create_tensor_data(parameters["input_dtype"], parameters["input_shape"])
-    ]
-    if not parameters["const_axis"]:
-      if parameters["axis"]:
-        values.append(np.array(parameters["axis"]))
-    return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
+    def build_inputs(parameters, sess, inputs, outputs):
+      values = [
+          create_tensor_data(parameters["input_dtype"],
+                             parameters["input_shape"])]
+      if not parameters["const_axis"]:
+        if parameters["axis"]:
+          values.append(np.array(parameters["axis"]))
+      return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+    make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+  return f
+
+
+def make_mean_tests(zip_path):
+  """Make a set of tests to do mean."""
+
+  return make_reduce_tests(tf.reduce_mean)(zip_path)
 
 
 def make_exp_tests(zip_path):
-- 
GitLab


From 46cd11058d049362b3ec813c7c07193449242eb3 Mon Sep 17 00:00:00 2001
From: Peter Hawkins <phawkins@google.com>
Date: Fri, 1 Jun 2018 08:34:24 -0700
Subject: [PATCH 165/610] Automated g4 rollback of changelist 198810875

PiperOrigin-RevId: 198876135
---
 tensorflow/compiler/jit/xla_device_ops.h      | 11 ++-
 tensorflow/contrib/tpu/python/tpu/tpu.py      | 87 +++++++++++++++++--
 tensorflow/contrib/tpu/python/tpu/tpu_test.py |  4 +-
 tensorflow/core/kernels/control_flow_ops.cc   | 22 ++---
 tensorflow/core/kernels/control_flow_ops.h    | 16 ++++
 5 files changed, 117 insertions(+), 23 deletions(-)

diff --git a/tensorflow/compiler/jit/xla_device_ops.h b/tensorflow/compiler/jit/xla_device_ops.h
index b27c32e9bc..0c49286acd 100644
--- a/tensorflow/compiler/jit/xla_device_ops.h
+++ b/tensorflow/compiler/jit/xla_device_ops.h
@@ -95,7 +95,16 @@ class XlaAssignVariableOp : public AsyncOpKernel {
   REGISTER_KERNEL_BUILDER(Name("Switch").Device(DEVICE).HostMemory("pred"),    \
                           SwitchOp);                                           \
   REGISTER_KERNEL_BUILDER(                                                     \
-      Name("Merge").Device(DEVICE).HostMemory("value_index"), MergeOp);
+      Name("Merge").Device(DEVICE).HostMemory("value_index"), MergeOp);        \
+  REGISTER_KERNEL_BUILDER(Name("Enter").Device(DEVICE), EnterOp);              \
+  REGISTER_KERNEL_BUILDER(Name("Exit").Device(DEVICE), ExitOp);                \
+  REGISTER_KERNEL_BUILDER(Name("NextIteration").Device(DEVICE),                \
+                          NextIterationOp);                                    \
+  REGISTER_KERNEL_BUILDER(Name("LoopCond")                                     \
+                              .Device(DEVICE)                                  \
+                              .HostMemory("input")                             \
+                              .HostMemory("output"),                           \
+                          LoopCondOp);
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu.py b/tensorflow/contrib/tpu/python/tpu/tpu.py
index 612cd0114b..71a5012691 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu.py
@@ -126,7 +126,19 @@ class TPUReplicateContext(control_flow_ops.XLAControlFlowContext):
   outside the replicated computation.
   """
 
-  def __init__(self, name, num_replicas):
+  def __init__(self, name, num_replicas, pivot):
+    """Builds a new TPUReplicateContext.
+
+    Args:
+      name: a unique name for the context, used to populate the `_tpu_replicate`
+        attribute.
+      num_replicas: an integer that gives the number of replicas for the
+        computation.
+      pivot: a pivot node. Nodes in the TPUReplicateContext that do not have any
+        inputs will have a control dependency on the pivot node. This ensures
+        that nodes are correctly included in any enclosing control flow
+        contexts.
+    """
     super(TPUReplicateContext, self).__init__()
     self._num_replicas = num_replicas
     self._outer_device_function_stack = None
@@ -138,6 +150,7 @@ class TPUReplicateContext(control_flow_ops.XLAControlFlowContext):
     self._host_compute_core = []
     self._name = name
     self._unsupported_ops = []
+    self._pivot = pivot
 
   def report_unsupported_operations(self):
     if self._unsupported_ops:
@@ -262,9 +275,6 @@ class TPUReplicateContext(control_flow_ops.XLAControlFlowContext):
       self._outer_device_function_stack = list(graph._device_function_stack)  # pylint: disable=protected-access
     super(TPUReplicateContext, self).Enter()
 
-  def Exit(self):
-    super(TPUReplicateContext, self).Exit()
-
   def HostComputeCore(self):
     return self._host_compute_core
 
@@ -300,10 +310,64 @@ class TPUReplicateContext(control_flow_ops.XLAControlFlowContext):
       op.graph.prevent_feeding(op)
       op.graph.prevent_fetching(op)
 
+    # Remove any control edges from outer control flow contexts. These may cause
+    # mismatched frame errors.
+    control_inputs, external_inputs = self._RemoveExternalControlEdges(op)
+
+    if not op.inputs:
+      # Add a control edge from the control pivot to this op.
+      if not control_inputs:
+        # pylint: disable=protected-access
+        op._add_control_input(self.GetControlPivot())
+        # pylint: enable=protected-access
+    else:
+      for index in xrange(len(op.inputs)):
+        x = op.inputs[index]
+        real_x = self.AddValue(x)
+        if real_x != x:
+          op._update_input(index, real_x)  # pylint: disable=protected-access
+
+    if external_inputs:
+      # Use an identity to pull control inputs as data inputs. Note that we
+      # ignore ops which don't have outputs. TODO(phawkins): fix that.
+      with ops.control_dependencies(None):
+        self.Enter()
+        external_inputs = [
+            array_ops.identity(x.outputs[0]).op
+            for x in external_inputs
+            if x.outputs
+        ]
+        self.Exit()
+      # pylint: disable=protected-access
+      op._add_control_inputs(external_inputs)
+      # pylint: enable=protected-access
+
+    # Mark op's outputs as seen by this context and any outer contexts.
+    output_names = [x.name for x in op.outputs]
+    context = self
+    while context is not None:
+      # pylint: disable=protected-access
+      context._values.update(output_names)
+      context = context._outer_context
+      # pylint: enable=protected-access
+
+    if self._outer_context:
+      self._outer_context.AddInnerOp(op)
+
   def AddValue(self, val):
+    if val.name in self._values:
+      # Use the real value if it comes from outer context.
+      result = self._external_values.get(val.name)
+      return val if result is None else result
+
     result = val
+    self._values.add(val.name)
     if self._outer_context:
       result = self._outer_context.AddValue(val)
+      self._values.add(result.name)
+
+    self._external_values[val.name] = result
+
     return result
 
   def AddInnerOp(self, op):
@@ -319,6 +383,16 @@ class TPUReplicateContext(control_flow_ops.XLAControlFlowContext):
     # grad_state should be as if this is the top-level gradient state.
     return None
 
+  @property
+  def back_prop(self):
+    """Forwards to the enclosing while context, if any."""
+    if self.GetWhileContext():
+      return self.GetWhileContext().back_prop
+    return False
+
+  def GetControlPivot(self):
+    return self._pivot
+
 
 def outside_compilation(computation, *args, **kwargs):
   """Builds part of a computation outside any current TPU replicate scope.
@@ -505,7 +579,9 @@ def split_compile_and_replicate(computation,
         tpu_ops.tpu_replicated_input(replicas, name="input{}".format(i)))
 
   cluster_name = graph.unique_name("cluster")
-  context = TPUReplicateContext(name=cluster_name, num_replicas=num_replicas)
+  pivot = control_flow_ops.no_op(name=cluster_name + "/pivot")
+  context = TPUReplicateContext(
+      name=cluster_name, num_replicas=num_replicas, pivot=pivot)
   try:
     context.Enter()
 
@@ -582,6 +658,7 @@ def split_compile_and_replicate(computation,
       with ops.device(t.device if t.device else core(0)):
         new_output_tensors.append(array_ops.identity(t))
     output_tensors = new_output_tensors
+    context.ExitResult(output_tensors)
   finally:
     context.report_unsupported_operations()
     context.Exit()
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_test.py b/tensorflow/contrib/tpu/python/tpu/tpu_test.py
index c3882b8a27..6bdaa528f9 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_test.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_test.py
@@ -26,6 +26,7 @@ from tensorflow.contrib.tpu.python.tpu import training_loop
 from tensorflow.python.framework import dtypes
 from tensorflow.python.layers import convolutional
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import control_flow_util
 from tensorflow.python.ops import math_ops
 
@@ -37,7 +38,8 @@ class TPUContextTest(test.TestCase):
   def testIsInContext(self):
     """Test that control_flow_util can check that we're in a TPU context."""
     z1 = array_ops.identity(1)
-    context = tpu.TPUReplicateContext(b"context", 1)
+    pivot = control_flow_ops.no_op()
+    context = tpu.TPUReplicateContext(b"context", 1, pivot=pivot)
     context.Enter()
     z2 = array_ops.identity(1)
     context.Exit()
diff --git a/tensorflow/core/kernels/control_flow_ops.cc b/tensorflow/core/kernels/control_flow_ops.cc
index 7d5d54e5be..ebf844d75f 100644
--- a/tensorflow/core/kernels/control_flow_ops.cc
+++ b/tensorflow/core/kernels/control_flow_ops.cc
@@ -587,24 +587,14 @@ REGISTER_SYCL_HOST_KERNEL(string);
 #undef REGISTER_SYCL_HOST_KERNEL
 #endif  // TENSORFLOW_USE_SYCL
 
-// A LoopCond op has one input and one output. The input is a boolean
-// scalar representing the taken branches of the "pivot" Switch that
-// determines loop termination. As a contract, any high-level front-end
-// should always use port '0' of the "pivot" switches for loop exit.
-class LoopCondOp : public OpKernel {
- public:
-  explicit LoopCondOp(OpKernelConstruction* context) : OpKernel(context) {}
-
-  void Compute(OpKernelContext* context) override {
-    context->set_output(0, context->input(0));
-  }
+LoopCondOp::LoopCondOp(OpKernelConstruction* context) : OpKernel(context) {}
+LoopCondOp::~LoopCondOp() = default;
 
-  bool IsExpensive() override { return false; }
-
-  ~LoopCondOp() override {}
+void LoopCondOp::Compute(OpKernelContext* context) {
+  context->set_output(0, context->input(0));
+}
 
-  TF_DISALLOW_COPY_AND_ASSIGN(LoopCondOp);
-};
+bool LoopCondOp::IsExpensive() { return false; }
 
 REGISTER_KERNEL_BUILDER(Name("LoopCond").Device(DEVICE_CPU), LoopCondOp);
 REGISTER_KERNEL_BUILDER(Name("LoopCond")
diff --git a/tensorflow/core/kernels/control_flow_ops.h b/tensorflow/core/kernels/control_flow_ops.h
index 4838f2e2bf..8edbcc9077 100644
--- a/tensorflow/core/kernels/control_flow_ops.h
+++ b/tensorflow/core/kernels/control_flow_ops.h
@@ -97,6 +97,22 @@ class NextIterationOp : public OpKernel {
   TF_DISALLOW_COPY_AND_ASSIGN(NextIterationOp);
 };
 
+// A LoopCond op has one input and one output. The input is a boolean
+// scalar representing the taken branches of the "pivot" Switch that
+// determines loop termination. As a contract, any high-level front-end
+// should always use port '0' of the "pivot" switches for loop exit.
+class LoopCondOp : public OpKernel {
+ public:
+  explicit LoopCondOp(OpKernelConstruction* context);
+  ~LoopCondOp() override;
+
+  void Compute(OpKernelContext* context) override;
+
+  bool IsExpensive() override;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(LoopCondOp);
+};
+
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_KERNELS_CONTROL_FLOW_OPS_H_
-- 
GitLab


From 6bb35f848a7164d3f5a696826b9659b1bd24fed0 Mon Sep 17 00:00:00 2001
From: Rachel Lim <rachelim@google.com>
Date: Fri, 1 Jun 2018 08:52:46 -0700
Subject: [PATCH 166/610] Automated g4 rollback of changelist 198815200

PiperOrigin-RevId: 198878259
---
 .../contrib/data/kernels/csv_dataset_op.cc    | 542 +++++++++++++-----
 .../contrib/data/python/kernel_tests/BUILD    |   1 +
 .../kernel_tests/csv_dataset_op_test.py       | 292 ++++++++--
 tensorflow/core/lib/strings/numbers.cc        |  26 +-
 tensorflow/core/lib/strings/numbers.h         |   4 +-
 5 files changed, 646 insertions(+), 219 deletions(-)

diff --git a/tensorflow/contrib/data/kernels/csv_dataset_op.cc b/tensorflow/contrib/data/kernels/csv_dataset_op.cc
index 97cc0bc6c9..e88ad3dc32 100644
--- a/tensorflow/contrib/data/kernels/csv_dataset_op.cc
+++ b/tensorflow/contrib/data/kernels/csv_dataset_op.cc
@@ -18,7 +18,6 @@ limitations under the License.
 #include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/shape_inference.h"
-#include "tensorflow/core/lib/io/buffered_inputstream.h"
 #include "tensorflow/core/lib/io/random_inputstream.h"
 
 namespace tensorflow {
@@ -103,12 +102,11 @@ class CSVDatasetOp : public DatasetOpKernel {
     OP_REQUIRES(
         ctx, select_cols.empty() || select_cols.front() >= 0,
         errors::InvalidArgument("select_cols should be non-negative indices"));
-    bool select_all_cols = select_cols.empty();
 
-    *output = new Dataset(
-        ctx, std::move(filenames), header, buffer_size, output_types_,
-        output_shapes_, std::move(record_defaults), std::move(select_cols),
-        select_all_cols, use_quote_delim, delim[0], std::move(na_value));
+    *output = new Dataset(ctx, std::move(filenames), header, buffer_size,
+                          output_types_, output_shapes_,
+                          std::move(record_defaults), std::move(select_cols),
+                          use_quote_delim, delim[0], std::move(na_value));
   }
 
  private:
@@ -118,8 +116,7 @@ class CSVDatasetOp : public DatasetOpKernel {
             int64 buffer_size, const DataTypeVector& output_types,
             const std::vector<PartialTensorShape>& output_shapes,
             std::vector<Tensor> record_defaults, std::vector<int64> select_cols,
-            bool select_all_cols, bool use_quote_delim, char delim,
-            string na_value)
+            bool use_quote_delim, char delim, string na_value)
         : GraphDatasetBase(ctx),
           filenames_(std::move(filenames)),
           header_(header),
@@ -128,7 +125,6 @@ class CSVDatasetOp : public DatasetOpKernel {
           output_shapes_(output_shapes),
           record_defaults_(std::move(record_defaults)),
           select_cols_(std::move(select_cols)),
-          select_all_cols_(select_all_cols),
           use_quote_delim_(use_quote_delim),
           delim_(delim),
           na_value_(std::move(na_value)) {}
@@ -166,11 +162,24 @@ class CSVDatasetOp : public DatasetOpKernel {
                              std::vector<Tensor>* out_tensors,
                              bool* end_of_sequence) override {
         mutex_lock l(mu_);
+        bool select_all = dataset()->select_cols_.empty();
         do {
           // We are currently processing a file, so try to read the next record
-          if (buffered_input_stream_) {
-            Status s = ReadRecord(ctx, out_tensors);
-            if (s.ok() || !errors::IsOutOfRange(s)) {
+          if (input_stream_) {
+            Status s = ReadRecord(ctx, out_tensors, select_all,
+                                  dataset()->select_cols_);
+            if (s.ok()) {
+              // Validate output
+              if (out_tensors->size() != dataset()->out_type_.size()) {
+                return errors::InvalidArgument(
+                    "Expect ", dataset()->out_type_.size(), " fields but have ",
+                    out_tensors->size(), " in record");
+              }
+
+              *end_of_sequence = false;
+              return s;
+            }
+            if (!errors::IsOutOfRange(s)) {
               // Not at the end of file, return OK or non-EOF errors to caller.
               *end_of_sequence = false;
               return s;
@@ -203,145 +212,341 @@ class CSVDatasetOp : public DatasetOpKernel {
       }
 
      private:
-      // Reads a record by parsing the input buffer, and converting extracted
+      // Reads an entire CSV row from the input stream, either from the
+      // existing buffer or by filling the buffer as needed. Converts extracted
       // fields to output tensors as we go.
-      Status ReadRecord(IteratorContext* ctx, std::vector<Tensor>* out_tensors)
+      //
+      // When this function is called, pos_ should be the index of the first
+      // character of the record in buffer_, or past the end of the buffer.
+      // Note: ctx and out_tensors are only used in this function
+      // when fields are included in the record.
+      Status ReadRecord(IteratorContext* ctx, std::vector<Tensor>* out_tensors,
+                        bool select_all, const std::vector<int64>& selected)
           EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-        // Extracts fields from line(s) from the buffered input stream.
-        out_tensors->reserve(dataset()->record_defaults_.size());
-
-        string input;
-        TF_RETURN_IF_ERROR(buffered_input_stream_->ReadLine(&input));
-
-        size_t current_idx = 0;
-        size_t num_fields_parsed = 0;
-        size_t selector_idx = 0;  // Keep track of index into select_cols
-
-        while (current_idx < input.size()) {
-          // In each iteration, parse one field
-          if (input[current_idx] == '\n' || input[current_idx] == '\r') {
-            // This should never happen, because buffered input reader splits
-            // input on newlines.
-            return errors::InvalidArgument("Parsing error.");
-          }
+        if (pos_ >= buffer_.size()) {
+          // At the end of the file, this will return errors::OutOfRange
+          TF_RETURN_IF_ERROR(FillBuffer(&buffer_));
+          pos_ = 0;
+        }
+
+        // If the previous record ended in \r\n, the trailing \n was already
+        // consumed by SkipNewLineIfNecessary, so pos_ starts on this record.
+
+        bool end_of_record = false;  // Keep track of when we find \n, \r or EOF
+        size_t num_parsed = 0;
+        size_t num_selected_parsed = 0;
 
-          bool quoted = false;
+        Status result = Status::OK();
+
+        while (!end_of_record) {  // Read till we reach \n, \r or EOF
           bool include =
-              (dataset()->select_all_cols_ ||
-               dataset()->select_cols_[selector_idx] == num_fields_parsed);
+              select_all || (num_selected_parsed < selected.size() &&
+                             selected[num_selected_parsed] == num_parsed);
+
+          // Don't fail fast, so that the next call to GetNext may still return
+          // a valid record
+          result.Update(
+              ParseOneField(ctx, out_tensors, &end_of_record, include));
 
-          if (dataset()->use_quote_delim_ && input[current_idx] == '"') {
-            quoted = true;
-            current_idx++;
+          num_parsed++;
+          if (include) num_selected_parsed++;
+        }
+
+        return result;
+      }
+
+      // Parses one field from position pos_ in the buffer. Fields are
+      // delimited by delim, CRLF, or EOF. Advances pos_ to the first char of
+      // the next field.
+      Status ParseOneField(IteratorContext* ctx,
+                           std::vector<Tensor>* out_tensors,
+                           bool* end_of_record, bool include)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        if (pos_ >= buffer_.size()) {
+          // If we get here, this means the previous field's end coincided
+          // with the end of the buffer, so no partial field needs saving.
+          Status s = FillBuffer(&buffer_);
+
+          if (errors::IsOutOfRange(s)) {
+            // Reached EOF, and last field is empty
+            *end_of_record = true;
+            if (include) {
+              return FieldToOutput(ctx, StringPiece(), out_tensors);
+            } else {
+              return Status::OK();
+            }
+          } else if (!s.ok()) {
+            return s;  // Surface other errors back to caller
           }
 
-          // Parse the body of the field
-          string field;
-          if (!quoted) {
-            while (current_idx < input.size() &&
-                   input[current_idx] != dataset()->delim_) {
-              if ((dataset()->use_quote_delim_ && input[current_idx] == '"') ||
-                  input[current_idx] == '\n' || input[current_idx] == '\r') {
-                return errors::InvalidArgument(
-                    "Unquoted fields cannot have quotes/CRLFs inside");
+          pos_ = 0;
+        }
+
+        if (dataset()->use_quote_delim_ && buffer_[pos_] == '"') {
+          return ParseQuotedField(ctx, out_tensors, end_of_record, include);
+        }
+
+        return ParseUnquotedField(ctx, out_tensors, end_of_record, include);
+      }
+
+      // For keeping track of relevant parts of a field from a previous buffer
+      struct Piece {
+        size_t start;
+        size_t len;
+        string buffer;
+
+        Piece(string buffer, size_t start, size_t len)
+            : start(start), len(len), buffer(std::move(buffer)) {}
+      };
+
+      // Given that pos_ exceeds the buffer, saves the relevant part of the
+      // current buffer (if necessary), fills the buffer, and resets indices to
+      // 0.
+      Status SaveAndFillBuffer(std::vector<Piece>* earlier_pieces,
+                               size_t* start, bool include)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        string temp_buffer;
+
+        buffer_.swap(temp_buffer);
+        if (include && pos_ > *start) {
+          earlier_pieces->push_back(
+              Piece(std::move(temp_buffer), *start, pos_ - *start));
+        }
+        pos_ = 0;
+        *start = 0;
+        return FillBuffer(&buffer_);
+      }
+
+      // Parses quoted field from position pos_ in the buffer. Continually
+      // reads from buffer until the closing quote, which must be followed by
+      // delim, CRLF, or EOF. Advances pos_ to keep track of our position in
+      // the buffer, stopping at the first character of the next field.
+      Status ParseQuotedField(IteratorContext* ctx,
+                              std::vector<Tensor>* out_tensors,
+                              bool* end_of_record, bool include)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        std::vector<Piece> earlier_pieces;
+        size_t start = pos_;
+        pos_++;  // Starting quotation mark
+
+        while (true) {  // Each iter reads 1 char, filling buffer if necessary
+          if (pos_ >= buffer_.size()) {
+            Status s = SaveAndFillBuffer(&earlier_pieces, &start, include);
+            if (errors::IsOutOfRange(s)) {
+              return errors::InvalidArgument(
+                  "Reached end of file without closing quoted field in "
+                  "record");
+            } else if (!s.ok()) {
+              return s;  // Surface all other errors to caller
+            }
+          }
+
+          char ch = buffer_[pos_];
+          if (ch == '"') {
+            // When we encounter a quote, we look ahead to the next character to
+            // decide what to do
+            pos_++;
+            if (pos_ >= buffer_.size()) {
+              Status s = SaveAndFillBuffer(&earlier_pieces, &start, include);
+              if (errors::IsOutOfRange(s)) {
+                // This was the last field. We are done
+                *end_of_record = true;
+                return QuotedFieldToOutput(ctx, StringPiece(), out_tensors,
+                                           earlier_pieces, include);
+              } else if (!s.ok()) {
+                return s;
               }
-              if (include) field += input[current_idx];
-              current_idx++;
-            }  // Exit condition: end of input, or current index at delim
+            }
+
+            char next = buffer_[pos_];
+            pos_++;
+            if (next == dataset()->delim_) {
+              return QuotedFieldToOutput(
+                  ctx, StringPiece(&buffer_[start], pos_ - 1 - start),
+                  out_tensors, earlier_pieces, include);
+
+            } else if (next == '\n' || next == '\r') {
+              *end_of_record = true;
+              Status s = QuotedFieldToOutput(
+                  ctx, StringPiece(&buffer_[start], pos_ - 1 - start),
+                  out_tensors, earlier_pieces, include);
+              if (next == '\r') SkipNewLineIfNecessary();
+              return s;
+            } else if (next != '"') {
+              return errors::InvalidArgument(
+                  "Quote inside a string has to be escaped by another quote");
+            }
 
-            // Go to next field or the end
-            current_idx++;
           } else {
-            // Quoted field needs to be ended with '"' and delim or end
-            while (true) {
-              if (current_idx >= input.size() - 1 || input.empty()) {
-                if (current_idx == input.size() - 1 &&
-                    input[current_idx] == '"') {
-                  // We're at the end of the input, and the quote terminates the
-                  // record. Go to end.
-                  current_idx++;
-                  break;
-                }
-                // If there's no terminating quote, it means our buffered record
-                // line reader split a record up. This can happen if there is a
-                // newline encased in quotes. The next line is also part of the
-                // record, so we read it and reset the index.
-                if (include && current_idx == input.size() - 1) {
-                  // TODO(rachelim): Instead of building up a string, keep track
-                  //  of terminal indices (or starting char* and length)
-                  // Also look into using /lib/strings/Scanner
-                  field += input[current_idx];
-                }
-                if (include) {
-                  field += '\n';
-                }
-                current_idx = 0;
-                Status s = buffered_input_stream_->ReadLine(&input);
-                if (!s.ok()) {
-                  return errors::InvalidArgument(
-                      "Quoted field has to end with quote followed by delim, "
-                      "CRLF, or EOF");
-                }
-              } else if (input[current_idx] == '"' &&
-                         input[current_idx + 1] == dataset()->delim_) {
-                // End of field, go to next field or end
-                current_idx += 2;
-                break;
-              } else if (input[current_idx] == '"') {
-                // Current char is a quote. Since we're not at end of field,
-                // the next character must also be a quote.
-                if (input[current_idx + 1] != '"') {
-                  return errors::InvalidArgument(
-                      "Quote inside a string has to be escaped by another "
-                      "quote");
-                }
-                if (include) field += '"';
-                current_idx += 2;
-              } else {
-                if (include) field += input[current_idx];
-                current_idx++;
-              }
+            pos_++;
+          }
+        }
+      }
+
+      // Converts quoted field to an output tensor, removing the starting
+      // and ending quotes from it and unescaping double quotations if
+      // necessary.
+      Status QuotedFieldToOutput(IteratorContext* ctx, StringPiece field,
+                                 std::vector<Tensor>* out_tensors,
+                                 const std::vector<Piece>& earlier_pieces,
+                                 bool include) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        if (!include) return Status::OK();
+
+        if (earlier_pieces.empty()) {
+          if (field.find('\"', 1) == field.size() - 1) {
+            // `field` contains no escaped quotation marks.
+            // Exclude framing quotation marks
+            field.remove_prefix(1);
+            field.remove_suffix(1);
+            return FieldToOutput(ctx, field, out_tensors);
+          }
+        }
+        string field_complete;
+        size_t str_len = field.size();
+        for (const Piece& p : earlier_pieces) {
+          str_len += p.len;
+        }
+        field_complete.reserve(str_len);
+
+        // This bool flips every time we see a quote, so that we skip the second
+        // quote of every pair of adjacent quotes in the field. We need to track
+        // this across iterations of the for loop because adjacent double quotes
+        // may be in different buffers. Initialize to true because we also skip
+        // the opening quotation mark of the quoted field.
+        bool skip_next_quote = true;
+        for (const Piece& p : earlier_pieces) {
+          AppendUnescapedPiece(StringPiece(&p.buffer[p.start], p.len),
+                               &field_complete, &skip_next_quote);
+        }
+        AppendUnescapedPiece(field, &field_complete, &skip_next_quote);
+        StringPiece result = StringPiece(field_complete);
+        result.remove_suffix(1);  // Skip final quote
+
+        return FieldToOutput(ctx, result, out_tensors);
+      }
+
+      void AppendUnescapedPiece(StringPiece piece, string* field_complete,
+                                bool* skip_next_quote) {
+        size_t from = 0;
+        size_t found = piece.find('\"', from);
+        while (found != string::npos) {
+          if (!*skip_next_quote) {
+            // This is the first quote in a pair of adjacent double quotes
+            field_complete->append(piece.data() + from, found + 1 - from);
+          }
+          *skip_next_quote = !*skip_next_quote;
+          from = found + 1;
+          found = piece.find('\"', from);
+        }
+        // Include the chunk after the last quotation mark in the string
+        if (from < piece.size()) {
+          field_complete->append(piece.data() + from, piece.size() - from);
+        }
+      }
+
+      // Parses unquoted field from position pos_ in the buffer. Continually
+      // reads from buffer until end of field is reached (delim, CRLF, or EOF).
+      // Advances pos_ to keep track of our position in the buffer as we go,
+      // stopping at the first character of the next field.
+      Status ParseUnquotedField(IteratorContext* ctx,
+                                std::vector<Tensor>* out_tensors,
+                                bool* end_of_record, bool include)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        std::vector<Piece> earlier_pieces;
+        size_t start = pos_;
+        while (true) {  // Each iter reads 1 char, filling buffer if necessary
+          if (pos_ >= buffer_.size()) {
+            Status s = SaveAndFillBuffer(&earlier_pieces, &start, include);
+            // Handle errors
+            if (errors::IsOutOfRange(s)) {
+              // Whatever we have is the last field of the last record
+              *end_of_record = true;
+              return UnquotedFieldToOutput(
+                  ctx, StringPiece(&buffer_[start], pos_ - start), out_tensors,
+                  earlier_pieces, include);
+            } else if (!s.ok()) {
+              return s;  // Surface all other errors to caller
             }
           }
 
-          num_fields_parsed++;
+          char ch = buffer_[pos_];
 
-          if (include) {
-            // Add the tensor to the result
-            TF_RETURN_IF_ERROR(FieldToOutput(ctx, std::move(field),
-                                             selector_idx, out_tensors));
-            selector_idx++;
-            // Terminate early if we have all the fields we want
-            if (selector_idx == dataset()->select_cols_.size())
-              return Status::OK();
+          if (ch == dataset()->delim_) {
+            Status s = UnquotedFieldToOutput(
+                ctx, StringPiece(&buffer_[start], pos_ - start), out_tensors,
+                earlier_pieces, include);
+            pos_++;
+            return s;
+          }
+          if (ch == '\n' || ch == '\r') {
+            // need special case to skip over first \n of record if the line
+            // breaks are \r\n
+            Status s = UnquotedFieldToOutput(
+                ctx, StringPiece(&buffer_[start], pos_ - start), out_tensors,
+                earlier_pieces, include);
+            *end_of_record = true;
+            pos_++;
+            if (ch == '\r') SkipNewLineIfNecessary();
+            return s;
           }
-        }  // Exit condition: current_idx has reached the end of record
-
-        // Check if the last field is empty, and include it if necessary
-        bool include =
-            (dataset()->select_all_cols_ ||
-             dataset()->select_cols_[selector_idx] == num_fields_parsed);
-        if (include && !input.empty() &&
-            input[input.size() - 1] == dataset()->delim_) {
-          TF_RETURN_IF_ERROR(
-              FieldToOutput(ctx, string(), selector_idx, out_tensors));
+          if (dataset()->use_quote_delim_ && ch == '"') {
+            // Advance pos_ to the next field anyway so that we can ignore
+            // errors gracefully if required. The caller of this will be able to
+            // call ParseOneField and continue with the rest of the record.
+            AdvanceToNextField(end_of_record);
+            return errors::InvalidArgument(
+                "Unquoted fields cannot have quotes inside");
+          }
+          // Otherwise, go to next character
+          pos_++;
         }
+      }
 
-        // Check that number of fields matches
-        if (out_tensors->size() != dataset()->out_type_.size()) {
-          return errors::InvalidArgument("Expect ", dataset()->out_type_.size(),
-                                         " fields but have ",
-                                         out_tensors->size(), " in record");
+      // Advances pos_ to the start of the next field, as delimited by delim,
+      // CRLF, or EOF, ignoring errors, and not keeping track of characters in
+      // the current field.
+      void AdvanceToNextField(bool* end_of_record)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        while (true) {
+          if (pos_ >= buffer_.size()) {
+            Status s = FillBuffer(&buffer_);
+            pos_ = 0;
+            if (!s.ok()) {
+              *end_of_record = true;
+              return;
+            }
+          }
+
+          char ch = buffer_[pos_];
+          pos_++;
+
+          if (ch == dataset()->delim_) {
+            return;
+          }
+
+          if (ch == '\n' || ch == '\r') {
+            *end_of_record = true;
+            if (ch == '\r') SkipNewLineIfNecessary();
+            return;
+          }
         }
-        return Status::OK();
       }
 
-      // Given a string field, and its index in the output,
-      // converts it to a Tensor of the right type and adds it to the
-      // out_tensors vector.
-      Status FieldToOutput(IteratorContext* ctx, string field,
-                           size_t output_idx,
+      Status FillBuffer(string* result) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        result->clear();
+        Status s = input_stream_->ReadNBytes(dataset()->buffer_size_, result);
+
+        if (errors::IsOutOfRange(s) && !result->empty()) {
+          // Ignore OutOfRange error when ReadNBytes read < N bytes.
+          return Status::OK();
+        }
+        return s;
+      }
+
+      // Given a field, converts it to the right output tensor type
+      Status FieldToOutput(IteratorContext* ctx, StringPiece field,
                            std::vector<Tensor>* out_tensors) {
+        size_t output_idx = out_tensors->size();
         if (output_idx >= dataset()->out_type_.size()) {
           // We can get here if we're selecting all columns, but the number of
           // fields exceeds the number of defaults provided
@@ -397,7 +602,7 @@ class CSVDatasetOp : public DatasetOpKernel {
                   dataset()->record_defaults_[output_idx].flat<float>()(0);
             } else {
               float value;
-              if (!strings::safe_strtof(field.c_str(), &value)) {
+              if (!strings::safe_strtof(field, &value)) {
                 return errors::InvalidArgument(
                     "Field ", output_idx,
                     " in record is not a valid float: ", field);
@@ -412,7 +617,7 @@ class CSVDatasetOp : public DatasetOpKernel {
                   dataset()->record_defaults_[output_idx].flat<double>()(0);
             } else {
               double value;
-              if (!strings::safe_strtod(field.c_str(), &value)) {
+              if (!strings::safe_strtod(field, &value)) {
                 return errors::InvalidArgument(
                     "Field ", output_idx,
                     " in record is not a valid double: ", field);
@@ -426,7 +631,7 @@ class CSVDatasetOp : public DatasetOpKernel {
               component.scalar<string>()() =
                   dataset()->record_defaults_[output_idx].flat<string>()(0);
             } else {
-              component.scalar<string>()() = std::move(field);
+              component.scalar<string>()() = field.ToString();
             }
             break;
           }
@@ -439,6 +644,50 @@ class CSVDatasetOp : public DatasetOpKernel {
         return Status::OK();
       }
 
+      // Records can be delimited by "\r\n" line breaks. When we encounter a
+      // '\r', we have to check the next character to see if it is part of the
+      // linebreak, and ignore it if so.
+      void SkipNewLineIfNecessary() EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        if (pos_ >= buffer_.size()) {
+          Status s = FillBuffer(&buffer_);
+          pos_ = 0;
+          // If we failed to fill buffer, it doesn't matter because we're done
+          // with the record
+          if (!s.ok()) return;
+        }
+        if (buffer_[pos_] == '\n') {
+          pos_++;
+        }
+      }
+
+      // Given the final piece of an unquoted field and any earlier pieces
+      // saved from previous buffers, joins them, converts the result to a
+      // Tensor of the right type and adds it to the out_tensors vector.
+      Status UnquotedFieldToOutput(IteratorContext* ctx, StringPiece field,
+                                   std::vector<Tensor>* out_tensors,
+                                   const std::vector<Piece>& earlier_pieces,
+                                   bool include) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        if (!include) return Status::OK();
+
+        if (earlier_pieces.empty()) {
+          return FieldToOutput(ctx, field, out_tensors);
+        }
+
+        size_t str_len = field.size();
+        for (const Piece& p : earlier_pieces) {
+          str_len += p.len;
+        }
+        string field_complete;
+        field_complete.reserve(str_len);
+
+        for (const Piece& p : earlier_pieces) {
+          field_complete.append(p.buffer, p.start, p.len);
+        }
+
+        field_complete.append(field.data(), field.size());
+        return FieldToOutput(ctx, field_complete, out_tensors);
+      }
+
       // Sets up reader streams to read from the file at `current_file_index_`.
       Status SetupStreamsLocked(Env* env) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
         if (current_file_index_ >= dataset()->filenames_.size()) {
@@ -452,16 +701,18 @@ class CSVDatasetOp : public DatasetOpKernel {
             dataset()->filenames_[current_file_index_], &file_));
         input_stream_.reset(
             new io::RandomAccessInputStream(file_.get(), false));
-        // TODO(rachelim): Maintain our own buffer so we don't read every record
-        //   twice
-        buffered_input_stream_.reset(new io::BufferedInputStream(
-            input_stream_.get(), dataset()->buffer_size_, false));
+        buffer_.clear();
+        pos_ = 0;
         if (dataset()->header_) {
-          // Ignore header line
-          string str;
-          Status s = buffered_input_stream_->ReadLine(&str);
-          if (errors::IsOutOfRange(s)) {
-            return errors::InvalidArgument("Can't read header of empty file");
+          // Read one line, but don't include it. Pass nullptrs as dummy
+          // pointers to objects that shouldn't be invoked anyway
+          // We need to process this as a record here instead of just finding
+          // the first newline because it might contain quoted fields with
+          // newlines in the header as well
+          std::vector<int64> empty;
+          Status s = ReadRecord(nullptr, nullptr, false, empty);
+          if (!s.ok()) {
+            return errors::InvalidArgument("Can't read header of file");
           }
         }
         return Status::OK();
@@ -470,15 +721,15 @@ class CSVDatasetOp : public DatasetOpKernel {
       // Resets all reader streams.
       void ResetStreamsLocked() EXCLUSIVE_LOCKS_REQUIRED(mu_) {
         input_stream_.reset();
-        buffered_input_stream_.reset();
         file_.reset();
       }
 
       mutex mu_;
+      string buffer_ GUARDED_BY(mu_);  // Maintain our own buffer
+      size_t pos_ GUARDED_BY(
+          mu_);  // Index into the buffer must be maintained between iters
       std::unique_ptr<io::RandomAccessInputStream> input_stream_
           GUARDED_BY(mu_);
-      std::unique_ptr<io::BufferedInputStream> buffered_input_stream_
-          GUARDED_BY(mu_);
       size_t current_file_index_ GUARDED_BY(mu_) = 0;
       std::unique_ptr<RandomAccessFile> file_
           GUARDED_BY(mu_);  // must outlive input_stream_
@@ -491,7 +742,6 @@ class CSVDatasetOp : public DatasetOpKernel {
     const std::vector<PartialTensorShape> output_shapes_;
     const std::vector<Tensor> record_defaults_;
     const std::vector<int64> select_cols_;
-    const bool select_all_cols_;
     const bool use_quote_delim_;
     const char delim_;
     const string na_value_;
diff --git a/tensorflow/contrib/data/python/kernel_tests/BUILD b/tensorflow/contrib/data/python/kernel_tests/BUILD
index c483a43769..523d1f2f71 100644
--- a/tensorflow/contrib/data/python/kernel_tests/BUILD
+++ b/tensorflow/contrib/data/python/kernel_tests/BUILD
@@ -128,6 +128,7 @@ py_test(
     tags = ["no_pip"],
     deps = [
         ":dataset_serialization_test",
+        "//tensorflow/contrib/data/python/ops:error_ops",
         "//tensorflow/contrib/data/python/ops:readers",
         "//third_party/py/numpy",
     ],
diff --git a/tensorflow/contrib/data/python/kernel_tests/csv_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/csv_dataset_op_test.py
index 8c138c7081..74b90ec7d1 100644
--- a/tensorflow/contrib/data/python/kernel_tests/csv_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/csv_dataset_op_test.py
@@ -25,6 +25,7 @@ import time
 
 import numpy as np
 
+from tensorflow.contrib.data.python.ops import error_ops
 from tensorflow.contrib.data.python.ops import readers
 from tensorflow.python.client import session
 from tensorflow.python.data.ops import readers as core_readers
@@ -61,12 +62,12 @@ class CsvDatasetOpTest(test.TestCase):
         op2 = sess.run(next2)
         self.assertAllEqual(op1, op2)
 
-  def setup_files(self, inputs):
+  def setup_files(self, inputs, linebreak='\n'):
     filenames = []
     for i, ip in enumerate(inputs):
-      fn = os.path.join(self.get_temp_dir(), 'temp_%d.txt' % i)
-      with open(fn, 'w') as f:
-        f.write('\n'.join(ip))
+      fn = os.path.join(self.get_temp_dir(), 'temp_%d.csv' % i)
+      with open(fn, 'wb') as f:
+        f.write(linebreak.join(ip).encode('utf-8'))
       filenames.append(fn)
     return filenames
 
@@ -86,38 +87,47 @@ class CsvDatasetOpTest(test.TestCase):
           inputs, **kwargs)
       self._assert_datasets_equal(g, dataset_actual, dataset_expected)
 
+  def _verify_output_or_err(self,
+                            sess,
+                            dataset,
+                            expected_output=None,
+                            expected_err_re=None):
+    nxt = dataset.make_one_shot_iterator().get_next()
+    if expected_err_re is None:
+      # Verify that output is expected, without errors
+      expected_output = [[
+          v.encode('utf-8') if isinstance(v, str) else v for v in op
+      ] for op in expected_output]
+      for value in expected_output:
+        op = sess.run(nxt)
+        self.assertAllEqual(op, value)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(nxt)
+    else:
+      # Verify that OpError is produced as expected
+      with self.assertRaisesOpError(expected_err_re):
+        while True:
+          try:
+            sess.run(nxt)
+          except errors.OutOfRangeError:
+            break
+
   def _test_dataset(self,
                     inputs,
                     expected_output=None,
                     expected_err_re=None,
+                    linebreak='\n',
                     **kwargs):
     """Checks that elements produced by CsvDataset match expected output."""
     # Convert str type because py3 tf strings are bytestrings
-    filenames = self.setup_files(inputs)
+    filenames = self.setup_files(inputs, linebreak)
     with ops.Graph().as_default() as g:
       with self.test_session(graph=g) as sess:
         dataset = readers.CsvDataset(filenames, **kwargs)
-        nxt = dataset.make_one_shot_iterator().get_next()
-        if expected_err_re is None:
-          # Verify that output is expected, without errors
-          expected_output = [[
-              v.encode('utf-8') if isinstance(v, str) else v for v in op
-          ] for op in expected_output]
-          for value in expected_output:
-            op = sess.run(nxt)
-            self.assertAllEqual(op, value)
-          with self.assertRaises(errors.OutOfRangeError):
-            sess.run(nxt)
-        else:
-          # Verify that OpError is produced as expected
-          with self.assertRaisesOpError(expected_err_re):
-            while True:
-              try:
-                sess.run(nxt)
-              except errors.OutOfRangeError:
-                break
-
-  def testCsvDataset_floatRequired(self):
+        self._verify_output_or_err(sess, dataset, expected_output,
+                                   expected_err_re)
+
+  def testCsvDataset_requiredFields(self):
     record_defaults = [[]] * 4
     inputs = [['1,2,3,4']]
     self._test_by_comparison(inputs, record_defaults=record_defaults)
@@ -137,10 +147,36 @@ class CsvDatasetOpTest(test.TestCase):
     inputs = [['1.0,2.1,hello,4.3', '5.4,6.5,goodbye,8.7']]
     self._test_by_comparison(inputs, record_defaults=record_defaults)
 
-  def testCsvDataset_withQuoted(self):
-    record_defaults = [['']] * 4
-    inputs = [['1.0,2.1,"hello, it is me",4.3', '5.4,6.5,goodbye,8.7']]
-    self._test_by_comparison(inputs, record_defaults=record_defaults)
+  def testCsvDataset_withEmptyFields(self):
+    record_defaults = [[0]] * 4
+    inputs = [[',,,', '1,1,1,', ',2,2,2']]
+    self._test_dataset(
+        inputs, [[0, 0, 0, 0], [1, 1, 1, 0], [0, 2, 2, 2]],
+        record_defaults=record_defaults)
+
+  def testCsvDataset_errWithUnquotedQuotes(self):
+    record_defaults = [['']] * 3
+    inputs = [['1,2"3,4']]
+    self._test_dataset(
+        inputs,
+        expected_err_re='Unquoted fields cannot have quotes inside',
+        record_defaults=record_defaults)
+
+  def testCsvDataset_ignoreErrWithUnquotedQuotes(self):
+    record_defaults = [['']] * 3
+    inputs = [['1,2"3,4', 'a,b,c"d', 'e,f,g']]
+    filenames = self.setup_files(inputs)
+    with ops.Graph().as_default() as g:
+      with self.test_session(graph=g) as sess:
+        dataset = readers.CsvDataset(filenames, record_defaults=record_defaults)
+        dataset = dataset.apply(error_ops.ignore_errors())
+        self._verify_output_or_err(sess, dataset, [['e', 'f', 'g']])
+
+  def testCsvDataset_withNoQuoteDelimAndUnquotedQuotes(self):
+    record_defaults = [['']] * 3
+    inputs = [['1,2"3,4']]
+    self._test_by_comparison(
+        inputs, record_defaults=record_defaults, use_quote_delim=False)
 
   def testCsvDataset_mixedTypes(self):
     record_defaults = [
@@ -164,11 +200,6 @@ class CsvDatasetOpTest(test.TestCase):
     self._test_by_comparison(
         inputs, record_defaults=record_defaults, field_delim=':')
 
-  def testCsvDataset_withEmptyValues(self):
-    record_defaults = [[0]] * 4
-    inputs = [['1,,3,4', ',6,7,8']]
-    self._test_by_comparison(inputs, record_defaults=record_defaults)
-
   def testCsvDataset_withNaValue(self):
     record_defaults = [[0]] * 4
     inputs = [['1,NA,3,4', 'NA,6,7,8']]
@@ -176,8 +207,8 @@ class CsvDatasetOpTest(test.TestCase):
         inputs, record_defaults=record_defaults, na_value='NA')
 
   def testCsvDataset_withSelectCols(self):
-    record_defaults = [[0]] * 2
-    inputs = [['1,2,3,4', '5,6,7,8']]
+    record_defaults = [['']] * 2
+    inputs = [['1,2,3,4', '"5","6","7","8"']]
     self._test_by_comparison(
         inputs, record_defaults=record_defaults, select_cols=[1, 2])
 
@@ -190,27 +221,17 @@ class CsvDatasetOpTest(test.TestCase):
         record_defaults=record_defaults,
         select_cols=[3, 4])
 
+  def testCsvDataset_withOneCol(self):
+    record_defaults = [['NA']]
+    inputs = [['0', '', '2']]
+    self._test_dataset(
+        inputs, [['0'], ['NA'], ['2']], record_defaults=record_defaults)
+
   def testCsvDataset_withMultipleFiles(self):
     record_defaults = [[0]] * 4
     inputs = [['1,2,3,4', '5,6,7,8'], ['5,6,7,8']]
     self._test_by_comparison(inputs, record_defaults=record_defaults)
 
-  def testCsvDataset_withNewLine(self):
-    # In this case, we expect it to behave differently from
-    # TextLineDataset->map(decode_csv) since that flow has bugs
-    record_defaults = [['']] * 4
-    inputs = [['a,b,"""c""\n0","d\ne"', 'f,g,h,i']]
-    expected = [['a', 'b', '"c"\n0', 'd\ne'], ['f', 'g', 'h', 'i']]
-    self._test_dataset(inputs, expected, record_defaults=record_defaults)
-
-  def testCsvDataset_withMultipleNewLines(self):
-    # In this case, we expect it to behave differently from
-    # TextLineDataset->map(decode_csv) since that flow has bugs
-    record_defaults = [['']] * 4
-    inputs = [['a,"b\n\nx","""c""\n \n0","d\ne"', 'f,g,h,i']]
-    expected = [['a', 'b\n\nx', '"c"\n \n0', 'd\ne'], ['f', 'g', 'h', 'i']]
-    self._test_dataset(inputs, expected, record_defaults=record_defaults)
-
   def testCsvDataset_withLeadingAndTrailingSpaces(self):
     record_defaults = [[0.0]] * 4
     inputs = [['0, 1, 2, 3']]
@@ -266,9 +287,10 @@ class CsvDatasetOpTest(test.TestCase):
   def testCsvDataset_errorWithHeaderEmptyFile(self):
     record_defaults = [[0]] * 2
     inputs = [[]]
+    expected_err_re = "Can't read header of file"
     self._test_dataset(
         inputs,
-        expected_err_re="Can't read header of empty file",
+        expected_err_re=expected_err_re,
         record_defaults=record_defaults,
         header=True,
     )
@@ -284,7 +306,7 @@ class CsvDatasetOpTest(test.TestCase):
     inputs = [['', '1,2']]  # First record is empty
     self._test_dataset(
         inputs,
-        expected_err_re='Expect 2 fields but have 0 in record',
+        expected_err_re='Expect 2 fields but have 1 in record',
         record_defaults=record_defaults)
 
   def testCsvDataset_withChainedOps(self):
@@ -301,7 +323,7 @@ class CsvDatasetOpTest(test.TestCase):
 
   def testCsvDataset_withTypeDefaults(self):
     # Testing using dtypes as record_defaults for required fields
-    record_defaults = [dtypes.float32, dtypes.float32]
+    record_defaults = [dtypes.float32, [0.0]]
     inputs = [['1.0,2.0', '3.0,4.0']]
     self._test_dataset(
         inputs,
@@ -326,6 +348,162 @@ class CsvDatasetOpTest(test.TestCase):
 
     self.assertEqual(result, sorted(result))
 
+## The following tests exercise parsing logic for quoted fields
+
+  def testCsvDataset_withQuoted(self):
+    record_defaults = [['']] * 4
+    inputs = [['"a","b","c :)","d"', '"e","f","g :(","h"']]
+    self._test_by_comparison(inputs, record_defaults=record_defaults)
+
+  def testCsvDataset_withOneColAndQuotes(self):
+    record_defaults = [['']]
+    inputs = [['"0"', '"1"', '"2"']]
+    self._test_dataset(
+        inputs, [['0'], ['1'], ['2']], record_defaults=record_defaults)
+
+  def testCsvDataset_withNewLine(self):
+    # In this case, we expect it to behave differently from
+    # TextLineDataset->map(decode_csv) since that flow has bugs
+    record_defaults = [['']] * 4
+    inputs = [['a,b,"""c""\n0","d\ne"', 'f,g,h,i']]
+    expected = [['a', 'b', '"c"\n0', 'd\ne'], ['f', 'g', 'h', 'i']]
+    self._test_dataset(inputs, expected, record_defaults=record_defaults)
+
+  def testCsvDataset_withNewLineInUnselectedCol(self):
+    record_defaults = [['']]
+    inputs = [['1,"2\n3",4', '5,6,7']]
+    self._test_dataset(
+        inputs,
+        expected_output=[['1'], ['5']],
+        record_defaults=record_defaults,
+        select_cols=[0])
+
+  def testCsvDataset_withMultipleNewLines(self):
+    # In this case, we expect it to behave differently from
+    # TextLineDataset->map(decode_csv) since that flow has bugs
+    record_defaults = [['']] * 4
+    inputs = [['a,"b\n\nx","""c""\n \n0","d\ne"', 'f,g,h,i']]
+    expected = [['a', 'b\n\nx', '"c"\n \n0', 'd\ne'], ['f', 'g', 'h', 'i']]
+    self._test_dataset(inputs, expected, record_defaults=record_defaults)
+
+  def testCsvDataset_errorWithTerminateMidRecord(self):
+    record_defaults = [['']] * 4
+    inputs = [['a,b,c,"a']]
+    self._test_dataset(
+        inputs,
+        expected_err_re=
+        'Reached end of file without closing quoted field in record',
+        record_defaults=record_defaults)
+
+  def testCsvDataset_withEscapedQuotes(self):
+    record_defaults = [['']] * 4
+    inputs = [['1.0,2.1,"she said: ""hello""",4.3', '5.4,6.5,goodbye,8.7']]
+    self._test_by_comparison(inputs, record_defaults=record_defaults)
+
+
+## Testing that parsing works with all buffer sizes, quoted/unquoted fields,
+## and different types of line breaks
+
+  def testCsvDataset_withInvalidBufferSize(self):
+    record_defaults = [['']] * 4
+    inputs = [['a,b,c,d']]
+    self._test_dataset(
+        inputs,
+        expected_err_re='buffer_size should be positive',
+        record_defaults=record_defaults,
+        buffer_size=0)
+
+  def testCsvDataset_withBufferSize(self):
+    record_defaults = [['NA']] * 3
+    inputs = [['abc,def,ghi', '0,1,2', ',,']]
+    expected = [['abc', 'def', 'ghi'], ['0', '1', '2'], ['NA', 'NA', 'NA']]
+    for i in range(20):
+      # Test a range of buffer sizes that should all work
+      self._test_dataset(
+          inputs, expected, record_defaults=record_defaults, buffer_size=i + 1)
+
+  def testCsvDataset_withCR(self):
+    # Test that when the line separator is '\r', parsing works with all buffer
+    # sizes
+    record_defaults = [['NA']] * 3
+    inputs = [['abc,def,ghi', '0,1,2', ',,']]
+    expected = [['abc', 'def', 'ghi'], ['0', '1', '2'], ['NA', 'NA', 'NA']]
+    for i in range(20):
+      # Test a range of buffer sizes that should all work
+      self._test_dataset(
+          inputs,
+          expected,
+          linebreak='\r',
+          record_defaults=record_defaults,
+          buffer_size=i + 1)
+
+  def testCsvDataset_withCRLF(self):
+    # Test that when the line separator is '\r\n', parsing works with all buffer
+    # sizes
+    record_defaults = [['NA']] * 3
+    inputs = [['abc,def,ghi', '0,1,2', ',,']]
+    expected = [['abc', 'def', 'ghi'], ['0', '1', '2'], ['NA', 'NA', 'NA']]
+    for i in range(20):
+      # Test a range of buffer sizes that should all work
+      self._test_dataset(
+          inputs,
+          expected,
+          linebreak='\r\n',
+          record_defaults=record_defaults,
+          buffer_size=i + 1)
+
+  def testCsvDataset_withBufferSizeAndQuoted(self):
+    record_defaults = [['NA']] * 3
+    inputs = [['"\n\n\n","\r\r\r","abc"', '"0","1","2"', '"","",""']]
+    expected = [['\n\n\n', '\r\r\r', 'abc'], ['0', '1', '2'],
+                ['NA', 'NA', 'NA']]
+    for i in range(20):
+      # Test a range of buffer sizes that should all work
+      self._test_dataset(
+          inputs,
+          expected,
+          linebreak='\n',
+          record_defaults=record_defaults,
+          buffer_size=i + 1)
+    self._test_dataset(
+        inputs, expected, linebreak='\n', record_defaults=record_defaults)
+
+  def testCsvDataset_withCRAndQuoted(self):
+    # Test that when the line separator is '\r', parsing works with all buffer
+    # sizes
+    record_defaults = [['NA']] * 3
+    inputs = [['"\n\n\n","\r\r\r","abc"', '"0","1","2"', '"","",""']]
+    expected = [['\n\n\n', '\r\r\r', 'abc'], ['0', '1', '2'],
+                ['NA', 'NA', 'NA']]
+    for i in range(20):
+      # Test a range of buffer sizes that should all work
+      self._test_dataset(
+          inputs,
+          expected,
+          linebreak='\r',
+          record_defaults=record_defaults,
+          buffer_size=i + 1)
+    self._test_dataset(
+        inputs, expected, linebreak='\r', record_defaults=record_defaults)
+
+  def testCsvDataset_withCRLFAndQuoted(self):
+    # Test that when the line separator is '\r\n', parsing works with all buffer
+    # sizes
+    record_defaults = [['NA']] * 3
+    inputs = [['"\n\n\n","\r\r\r","abc"', '"0","1","2"', '"","",""']]
+    expected = [['\n\n\n', '\r\r\r', 'abc'], ['0', '1', '2'],
+                ['NA', 'NA', 'NA']]
+    for i in range(20):
+      # Test a range of buffer sizes that should all work
+      self._test_dataset(
+          inputs,
+          expected,
+          linebreak='\r\n',
+          record_defaults=record_defaults,
+          buffer_size=i + 1)
+    self._test_dataset(
+        inputs, expected, linebreak='\r\n', record_defaults=record_defaults)
+
 
 class CsvDatasetBenchmark(test.Benchmark):
   """Benchmarks for the various ways of creating a dataset from CSV files.
@@ -343,7 +521,7 @@ class CsvDatasetBenchmark(test.Benchmark):
     self._filenames = []
     for n in self._num_cols:
       fn = os.path.join(self._temp_dir, 'file%d.csv' % n)
-      with open(fn, 'w') as f:
+      with open(fn, 'wb') as f:
         # Just write 100 rows and use `repeat`... Assumes the cost
         # of creating an iterator is not significant
         row = ','.join([str_val for _ in range(n)])
diff --git a/tensorflow/core/lib/strings/numbers.cc b/tensorflow/core/lib/strings/numbers.cc
index 987e4fe733..87aa5915ff 100644
--- a/tensorflow/core/lib/strings/numbers.cc
+++ b/tensorflow/core/lib/strings/numbers.cc
@@ -331,31 +331,29 @@ bool safe_strtou32(StringPiece str, uint32* value) {
   return true;
 }
 
-bool safe_strtof(const char* str, float* value) {
+bool safe_strtof(StringPiece str, float* value) {
   int processed_characters_count = -1;
-  auto len = str_util::Strnlen(str, kFastToBufferSize);
+  auto len = str.size();
 
-  // If there is no zero-termination in str, fail.
-  if (len == kFastToBufferSize) return false;
-  // If string length exceeds int max, fail.
+  // Fail if the length is >= kFastToBufferSize or exceeds int max.
+  if (len >= kFastToBufferSize) return false;
   if (len > std::numeric_limits<int>::max()) return false;
 
-  *value = StringToFloatConverter().StringToFloat(str, static_cast<int>(len),
-                                                  &processed_characters_count);
+  *value = StringToFloatConverter().StringToFloat(
+      str.data(), static_cast<int>(len), &processed_characters_count);
   return processed_characters_count > 0;
 }
 
-bool safe_strtod(const char* str, double* value) {
+bool safe_strtod(StringPiece str, double* value) {
   int processed_characters_count = -1;
-  auto len = str_util::Strnlen(str, kFastToBufferSize);
+  auto len = str.size();
 
-  // If there is no zero-termination in str, fail.
-  if (len == kFastToBufferSize) return false;
-  // If string length exceeds int max, fail.
+  // Fail if the length is >= kFastToBufferSize or exceeds int max.
+  if (len >= kFastToBufferSize) return false;
   if (len > std::numeric_limits<int>::max()) return false;
 
-  *value = StringToFloatConverter().StringToDouble(str, static_cast<int>(len),
-                                                   &processed_characters_count);
+  *value = StringToFloatConverter().StringToDouble(
+      str.data(), static_cast<int>(len), &processed_characters_count);
   return processed_characters_count > 0;
 }
 
diff --git a/tensorflow/core/lib/strings/numbers.h b/tensorflow/core/lib/strings/numbers.h
index 9cb56415cb..1d5bacac93 100644
--- a/tensorflow/core/lib/strings/numbers.h
+++ b/tensorflow/core/lib/strings/numbers.h
@@ -115,13 +115,13 @@ bool safe_strtou64(StringPiece str, uint64* value);
 // Leading and trailing spaces are allowed.
 // Values may be rounded on over- and underflow.
 // Returns false on invalid input or if `strlen(value) >= kFastToBufferSize`.
-bool safe_strtof(const char* str, float* value);
+bool safe_strtof(StringPiece str, float* value);
 
 // Convert strings to double precision floating point values.
 // Leading and trailing spaces are allowed.
 // Values may be rounded on over- and underflow.
 // Returns false on invalid input or if `strlen(value) >= kFastToBufferSize`.
-bool safe_strtod(const char* str, double* value);
+bool safe_strtod(StringPiece str, double* value);
 
 inline bool ProtoParseNumeric(StringPiece s, int32* value) {
   return safe_strto32(s, value);
-- 
GitLab


From 662c5dd7734363766a499d2c7a2013b4e4787974 Mon Sep 17 00:00:00 2001
From: Gunhan Gulsoy <gunan@google.com>
Date: Fri, 1 Jun 2018 09:07:05 -0700
Subject: [PATCH 167/610] remove typo

PiperOrigin-RevId: 198880096
---
 tensorflow/tools/ci_build/windows/gpu/cmake/run_build.bat | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tensorflow/tools/ci_build/windows/gpu/cmake/run_build.bat b/tensorflow/tools/ci_build/windows/gpu/cmake/run_build.bat
index 4656afe025..cec5b717f8 100644
--- a/tensorflow/tools/ci_build/windows/gpu/cmake/run_build.bat
+++ b/tensorflow/tools/ci_build/windows/gpu/cmake/run_build.bat
@@ -30,7 +30,6 @@ IF DEFINED SWIG_EXE (ECHO SWIG_EXE is set to %SWIG_EXE%) ELSE (SET SWIG_EXE="C:\
 IF DEFINED PY_EXE (ECHO PY_EXE is set to %PY_EXE%) ELSE (SET PY_EXE="C:\Program Files\Anaconda3\python.exe")
 IF DEFINED PY_LIB (ECHO PY_LIB is set to %PY_LIB%) ELSE (SET PY_LIB="C:\Program Files\Anaconda3\libs\python35.lib")
 IF DEFINED CUDNN_HOME (ECHO CUDNN_HOME is set to %CUDNN_HOME%) ELSE (SET CUDNN_HOME="c:\tools\cuda")
-verbosity:quiet
 IF DEFINED DISABLE_FORCEINLINE (ECHO DISABLE_FORCEINLINE is set to %DISABLE_FORCEINLINE%) ELSE (SET DISABLE_FORCEINLINE="OFF")
 
 SET CMAKE_DIR=%REPO_ROOT%\tensorflow\contrib\cmake
-- 
GitLab


From 6a7cd2e871d60c675c30b9f0bbe1af8e78b89373 Mon Sep 17 00:00:00 2001
From: Benoit Steiner <bsteiner@google.com>
Date: Fri, 1 Jun 2018 09:59:49 -0700
Subject: [PATCH 168/610] Fixed a bug introduced by cl/197941474.

PiperOrigin-RevId: 198886485
---
 tensorflow/core/grappler/optimizers/constant_folding.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/grappler/optimizers/constant_folding.cc b/tensorflow/core/grappler/optimizers/constant_folding.cc
index 7f0c2a2116..f4b384ec1e 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding.cc
@@ -2185,8 +2185,8 @@ bool ConstantFolding::SimplifyPack(GraphDef* optimized_graph, NodeDef* node) {
     node->add_input(axis_node->name());
     if (node->input_size() > 2) {
       node->mutable_input()->SwapElements(1, node->input_size() - 1);
-      return true;
     }
+    return true;
   }
   return false;
 }
-- 
GitLab


From dae529b6cb2a9e0dc9f1f14bed1561d98adf37ca Mon Sep 17 00:00:00 2001
From: Shashi Shekhar <shashishekhar@google.com>
Date: Fri, 1 Jun 2018 10:08:35 -0700
Subject: [PATCH 169/610] Fix ProfileSummarizer build, use properly qualified
 string references.

PiperOrigin-RevId: 198887868
---
 .../lite/profiling/profile_summarizer.cc        | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/tensorflow/contrib/lite/profiling/profile_summarizer.cc b/tensorflow/contrib/lite/profiling/profile_summarizer.cc
index 788f6922d2..6f2c9cd2b3 100644
--- a/tensorflow/contrib/lite/profiling/profile_summarizer.cc
+++ b/tensorflow/contrib/lite/profiling/profile_summarizer.cc
@@ -26,21 +26,22 @@ namespace {
 using Detail = tensorflow::StatsCalculator::Detail;
 
 struct OperatorDetails {
-  string name;
-  std::vector<string> inputs;
-  std::vector<string> outputs;
+  std::string name;
+  std::vector<std::string> inputs;
+  std::vector<std::string> outputs;
 };
 
-string GetTensorName(const tflite::Interpreter& interpreter, int tensor_index) {
+std::string GetTensorName(const tflite::Interpreter& interpreter,
+                          int tensor_index) {
   const auto tensor = interpreter.tensor(tensor_index);
   if (tensor == nullptr || tensor->name == nullptr) {
     return "Unknown";
   }
   return tensor->name;
 }
-std::vector<string> GetTensorNames(const tflite::Interpreter& interpreter,
-                                   const TfLiteIntArray* tensor_indices) {
-  std::vector<string> tensors;
+std::vector<std::string> GetTensorNames(const tflite::Interpreter& interpreter,
+                                        const TfLiteIntArray* tensor_indices) {
+  std::vector<std::string> tensors;
   tensors.reserve(tensor_indices->size);
   for (int i = 0; i < tensor_indices->size; i++) {
     tensors.push_back(GetTensorName(interpreter, tensor_indices->data[i]));
@@ -48,7 +49,7 @@ std::vector<string> GetTensorNames(const tflite::Interpreter& interpreter,
   return tensors;
 }
 
-string ToString(const std::vector<string>& str_vector) {
+std::string ToString(const std::vector<std::string>& str_vector) {
   std::stringstream stream;
   stream << "[";
   bool first = true;
-- 
GitLab


From 72314bff0ca2131a87b349abe214c4e5d3d6e334 Mon Sep 17 00:00:00 2001
From: Peter Hawkins <phawkins@google.com>
Date: Fri, 1 Jun 2018 10:33:02 -0700
Subject: [PATCH 170/610] Add a dependency optimization that eliminates
 multiple cross-device control edges to a single node from the same source
 device. Instead, build an intermediate NoOp node on the source device and use
 a single cross-device control edge.

PiperOrigin-RevId: 198891614
---
 tensorflow/core/grappler/optimizers/BUILD     |  2 +
 .../optimizers/dependency_optimizer.cc        | 84 +++++++++++++++++++
 .../optimizers/dependency_optimizer.h         |  7 +-
 .../optimizers/dependency_optimizer_test.cc   | 66 ++++++++++++++-
 4 files changed, 157 insertions(+), 2 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/BUILD b/tensorflow/core/grappler/optimizers/BUILD
index c90667abad..0e22d4add8 100644
--- a/tensorflow/core/grappler/optimizers/BUILD
+++ b/tensorflow/core/grappler/optimizers/BUILD
@@ -328,11 +328,13 @@ tf_cuda_cc_test(
         ":model_pruner",
         "//tensorflow/cc:cc_ops",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:tensor_testutil",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/inputs:trivial_test_graph_input_yielder",
+        "//tensorflow/core/grappler/utils:grappler_test",
         "//tensorflow/core/grappler/utils:topological_sort",
     ],
 )
diff --git a/tensorflow/core/grappler/optimizers/dependency_optimizer.cc b/tensorflow/core/grappler/optimizers/dependency_optimizer.cc
index 200454b522..fb2aea3b3d 100644
--- a/tensorflow/core/grappler/optimizers/dependency_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/dependency_optimizer.cc
@@ -557,6 +557,86 @@ void DependencyOptimizer::BuildNodeToIdx() {
   }
 }
 
+// Suppose there are cross-device control inputs to node C from multiple nodes
+// that are located on another device, e.g., we have control edges:
+// A->C, B->C
+// where A and B are on device X and C is on device Y.
+// We can reduce cross-device communication by introducing an intermediate
+// NoOp node C' on device X and rewriting the control edges to:
+// A->C', B->C', C' -> C
+void DependencyOptimizer::GroupCrossDeviceControlEdges() {
+  const int num_nodes = optimized_graph_->node_size();
+  for (int i = 0; i < num_nodes; ++i) {
+    NodeDef* node = optimized_graph_->mutable_node(i);
+    if (node->device().empty()) continue;
+
+    // Creates new noop nodes for devices on which multiple control inputs are
+    // located.
+
+    // Map keyed by device name to the newly introduced Noop node for that
+    // device. A nullptr value means that we have only seen a single node on
+    // that device.
+    std::map<string, NodeDef*> noops;
+    int num_noops = 0;
+    for (int j = 0; j < node->input_size(); ++j) {
+      if (IsControlInput(node->input(j))) {
+        const NodeDef* input = node_map_->GetNode(node->input(j));
+        if (!input->device().empty() && input->device() != node->device()) {
+          auto emplace_result = noops.emplace(input->device(), nullptr);
+          if (!emplace_result.second &&
+              emplace_result.first->second == nullptr) {
+            // This is the second cross-device control input from the same
+            // device. Creates an intermediate noop node on that device.
+            string group_name;
+            NodeDef* noop;
+            // Creates a fresh node name; there may be conflicting names from
+            // a previous iteration of the optimizer.
+            do {
+              group_name = AddPrefixToNodeName(
+                  node->name(),
+                  strings::StrCat("GroupCrossDeviceControlEdges_", num_noops));
+              noop = node_map_->GetNode(group_name);
+              ++num_noops;
+            } while (noop != nullptr);
+            noop = optimized_graph_->add_node();
+            noop->set_name(group_name);
+            noop->set_device(input->device());
+            noop->set_op("NoOp");
+            node_map_->AddNode(noop->name(), noop);
+            emplace_result.first->second = noop;
+          }
+        }
+      }
+    }
+
+    // Reroute existing control edges to go via the newly introduced NoOp nodes.
+    int pos = 0;
+    while (pos < node->input_size()) {
+      const string& input_name = node->input(pos);
+      if (IsControlInput(input_name)) {
+        NodeDef* input = node_map_->GetNode(input_name);
+        auto it = noops.find(input->device());
+        if (it == noops.end() || it->second == nullptr) {
+          ++pos;
+        } else {
+          node->mutable_input()->SwapElements(pos, node->input_size() - 1);
+          node->mutable_input()->RemoveLast();
+          it->second->add_input(AsControlDependency(*input));
+          node_map_->UpdateOutput(input_name, node->name(), it->second->name());
+        }
+      } else {
+        ++pos;
+      }
+    }
+    for (const auto& entry : noops) {
+      if (entry.second) {
+        node->add_input(AsControlDependency(*entry.second));
+        node_map_->AddOutput(entry.second->name(), node->name());
+      }
+    }
+  }
+}
+
 Status DependencyOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
                                      GraphDef* optimized_graph) {
   optimized_graph_ = optimized_graph;
@@ -588,6 +668,10 @@ Status DependencyOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
 
     // Dedup control inputs.
     CleanControlInputs();
+
+    if (opt_level_ == RewriterConfig::AGGRESSIVE) {
+      GroupCrossDeviceControlEdges();
+    }
   }
 
   return Status::OK();
diff --git a/tensorflow/core/grappler/optimizers/dependency_optimizer.h b/tensorflow/core/grappler/optimizers/dependency_optimizer.h
index b4db98125a..c97ff23e88 100644
--- a/tensorflow/core/grappler/optimizers/dependency_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/dependency_optimizer.h
@@ -30,7 +30,8 @@ namespace grappler {
 class DependencyOptimizer : public GraphOptimizer {
  public:
   DependencyOptimizer() {}
-  explicit DependencyOptimizer(RewriterConfig::Toggle opt_level) {}
+  explicit DependencyOptimizer(RewriterConfig::Toggle opt_level)
+      : opt_level_(opt_level) {}
   ~DependencyOptimizer() override {}
 
   string name() const override { return "dependency_optimizer"; };
@@ -61,7 +62,11 @@ class DependencyOptimizer : public GraphOptimizer {
   Status TransitiveReduction();
   // Main driver of dependency optimizations.
   Status OptimizeDependencies();
+  // Replaces multiple cross-device control edges from the same device with a
+  // single control edge.
+  void GroupCrossDeviceControlEdges();
 
+  RewriterConfig::Toggle opt_level_;
   bool fetch_nodes_known_;
   std::unordered_set<string> nodes_to_preserve_;
   std::unique_ptr<NodeMap> node_map_;
diff --git a/tensorflow/core/grappler/optimizers/dependency_optimizer_test.cc b/tensorflow/core/grappler/optimizers/dependency_optimizer_test.cc
index 6a297da52d..931d073cd3 100644
--- a/tensorflow/core/grappler/optimizers/dependency_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/dependency_optimizer_test.cc
@@ -16,11 +16,13 @@ limitations under the License.
 #include "tensorflow/core/grappler/optimizers/dependency_optimizer.h"
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h"
 #include "tensorflow/core/grappler/optimizers/constant_folding.h"
 #include "tensorflow/core/grappler/optimizers/model_pruner.h"
 #include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/grappler/utils/grappler_test.h"
 #include "tensorflow/core/grappler/utils/topological_sort.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
@@ -29,7 +31,7 @@ namespace tensorflow {
 namespace grappler {
 namespace {
 
-class DependencyOptimizerTest : public ::testing::Test {};
+class DependencyOptimizerTest : public GrapplerTest {};
 
 void VerifyGraphsEqual(const GraphDef& original_graph,
                        const GraphDef& optimized_graph, const string& func) {
@@ -722,6 +724,68 @@ TEST_F(DependencyOptimizerTest, RemoveGreaterEqualWithNoOp) {
   EXPECT_EQ(3, count);
 }
 
+TEST_F(DependencyOptimizerTest, GroupCrossDeviceControlDeps) {
+  GrapplerItem item;
+  {
+    tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+    Output a = ops::RandomUniform(s.WithOpName("a").WithDevice("/CPU:1"),
+                                  {1, 2}, DT_FLOAT);
+    Output b = ops::RandomUniform(s.WithOpName("b").WithDevice("/CPU:2"),
+                                  {1, 2}, DT_FLOAT);
+    Output c = ops::RandomUniform(s.WithOpName("c").WithDevice("/CPU:1"),
+                                  {1, 2}, DT_FLOAT);
+    Output d = ops::RandomUniform(s.WithOpName("d").WithDevice("/CPU:3"),
+                                  {1, 2}, DT_FLOAT);
+    Output e = ops::RandomUniform(s.WithOpName("e").WithDevice("/CPU:0"),
+                                  {1, 2}, DT_FLOAT);
+    // Node with cross-device dependencies.
+    auto fetch = ops::Identity(
+        s.WithOpName("f")
+            .WithControlDependencies({a.op(), b.op(), c.op(), d.op()})
+            .WithDevice("/GPU:0"),
+        {e});
+
+    TF_CHECK_OK(s.ToGraphDef(&item.graph));
+    item.fetch.push_back("f");
+  }
+
+  GraphDef expected;
+  {
+    tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+    Output a = ops::RandomUniform(s.WithOpName("a").WithDevice("/CPU:1"),
+                                  {1, 2}, DT_FLOAT);
+    Output b = ops::RandomUniform(s.WithOpName("b").WithDevice("/CPU:2"),
+                                  {1, 2}, DT_FLOAT);
+    Output c = ops::RandomUniform(s.WithOpName("c").WithDevice("/CPU:1"),
+                                  {1, 2}, DT_FLOAT);
+    Output d = ops::RandomUniform(s.WithOpName("d").WithDevice("/CPU:3"),
+                                  {1, 2}, DT_FLOAT);
+    Output e = ops::RandomUniform(s.WithOpName("e").WithDevice("/CPU:0"),
+                                  {1, 2}, DT_FLOAT);
+    auto noop = ops::NoOp(s.WithOpName("GroupCrossDeviceControlEdges_0/f")
+                              .WithDevice("/CPU:1")
+                              .WithControlDependencies({a.op(), c.op()}));
+    auto fetch =
+        ops::Identity(s.WithOpName("f")
+                          .WithControlDependencies({b.op(), d.op(), noop})
+                          .WithDevice("/GPU:0"),
+                      {e});
+
+    TF_CHECK_OK(s.ToGraphDef(&expected));
+  }
+
+  DependencyOptimizer optimizer(RewriterConfig::AGGRESSIVE);
+  GraphDef output;
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
+  CompareGraphs(expected, output);
+
+  // Run the optimizer again to verify idempotence.
+  item.graph.Swap(&output);
+  output.Clear();
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
+  CompareGraphs(expected, output);
+}
+
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
-- 
GitLab


From bb94c57a7fe63063e70f7e9984b7ec9507396d5e Mon Sep 17 00:00:00 2001
From: Akshay Agrawal <akshayka@google.com>
Date: Fri, 1 Jun 2018 10:38:19 -0700
Subject: [PATCH 171/610] Fix bug in eager documentation.

When implementing a custom layer, it's necessary to call the Layer constructor
from the custom layer's constructor.

PiperOrigin-RevId: 198892503
---
 tensorflow/docs_src/programmers_guide/eager.md | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/tensorflow/docs_src/programmers_guide/eager.md b/tensorflow/docs_src/programmers_guide/eager.md
index 00d02b4455..b2bc3273b4 100644
--- a/tensorflow/docs_src/programmers_guide/eager.md
+++ b/tensorflow/docs_src/programmers_guide/eager.md
@@ -149,16 +149,17 @@ it to implement your own layer:
 ```py
 class MySimpleLayer(tf.keras.layers.Layer):
   def __init__(self, output_units):
+    super(MySimpleLayer, self).__init__()
     self.output_units = output_units
 
-  def build(self, input):
+  def build(self, input_shape):
     # The build method gets called the first time your layer is used.
     # Creating variables on build() allows you to make their shape depend
-    # on the input shape and hence remove the need for the user to specify
+    # on the input shape and hence removes the need for the user to specify
     # full shapes. It is possible to create variables during __init__() if
     # you already know their full shapes.
     self.kernel = self.add_variable(
-      "kernel", [input.shape[-1], self.output_units])
+      "kernel", [input_shape[-1], self.output_units])
 
   def call(self, input):
     # Override call() instead of __call__ so we can perform some bookkeeping.
-- 
GitLab


From 6b76b6453a268f874c189eb4843fbe1deee3ae5b Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 1 Jun 2018 10:41:35 -0700
Subject: [PATCH 172/610] Updates Interpreter to be initialized with a
 MappedByteBuffer for backward compatibility.

PiperOrigin-RevId: 198893078
---
 .../java/org/tensorflow/lite/Interpreter.java | 31 +++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java
index 644ce4cb3e..fd1f0ffa68 100644
--- a/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java
+++ b/tensorflow/contrib/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java
@@ -17,6 +17,7 @@ package org.tensorflow.lite;
 
 import java.io.File;
 import java.nio.ByteBuffer;
+import java.nio.MappedByteBuffer;
 import java.util.HashMap;
 import java.util.Map;
 import org.checkerframework.checker.nullness.qual.NonNull;
@@ -103,6 +104,27 @@ public final class Interpreter implements AutoCloseable {
     wrapper = new NativeInterpreterWrapper(byteBuffer, numThreads);
   }
 
+  /**
+   * Initializes a {@code Interpreter} with a {@code MappedByteBuffer} to the model file.
+   *
+   * <p>The {@code MappedByteBuffer} should remain unchanged after the construction of a {@code
+   * Interpreter}.
+   */
+  public Interpreter(@NonNull MappedByteBuffer mappedByteBuffer) {
+    wrapper = new NativeInterpreterWrapper(mappedByteBuffer);
+  }
+
+  /**
+   * Initializes a {@code Interpreter} with a {@code MappedByteBuffer} to the model file and
+   * specifies the number of threads used for inference.
+   *
+   * <p>The {@code MappedByteBuffer} should remain unchanged after the construction of a {@code
+   * Interpreter}.
+   */
+  public Interpreter(@NonNull MappedByteBuffer mappedByteBuffer, int numThreads) {
+    wrapper = new NativeInterpreterWrapper(mappedByteBuffer, numThreads);
+  }
+
   /**
    * Runs model inference if the model takes only one input, and provides only one output.
    *
@@ -231,5 +253,14 @@ public final class Interpreter implements AutoCloseable {
     wrapper = null;
   }
 
+  @Override
+  protected void finalize() throws Throwable {
+    try {
+      close();
+    } finally {
+      super.finalize();
+    }
+  }
+
   NativeInterpreterWrapper wrapper;
 }
-- 
GitLab


From 46afa1f0e8a8b269054025aefe9a7d42290f8e8d Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 1 Jun 2018 10:49:48 -0700
Subject: [PATCH 173/610] Amend cluster resolver error to suggest oauth2client
 as a possible issue.

PiperOrigin-RevId: 198894470
---
 .../python/training/tpu_cluster_resolver.py              | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py
index 880fca4ea6..d44e23aadc 100644
--- a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py
+++ b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py
@@ -170,10 +170,11 @@ class TPUClusterResolver(ClusterResolver):
 
     if service is None and should_resolve:
       if not _GOOGLE_API_CLIENT_INSTALLED:
-        raise ImportError('googleapiclient must be installed before using the '
-                          'TPU cluster resolver. Execute: `pip install '
-                          '--upgrade google-api-python-client` to install with '
-                          'pip.')
+        raise ImportError('googleapiclient and oauth2client must be installed '
+                          'before using the TPU cluster resolver. Execute: '
+                          '`pip install --upgrade google-api-python-client` '
+                          'and `pip install --upgrade oauth2client` to '
+                          'install with pip.')
 
       final_discovery_url = self._discoveryUrl() or discovery_url
       if final_discovery_url:
-- 
GitLab


From 229a6fbb72a9c2a19113b7bdd85c3662603b4218 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 1 Jun 2018 11:06:22 -0700
Subject: [PATCH 174/610] Printing bools in graphviz.

PiperOrigin-RevId: 198897530
---
 tensorflow/contrib/lite/toco/dump_graphviz.cc | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/tensorflow/contrib/lite/toco/dump_graphviz.cc b/tensorflow/contrib/lite/toco/dump_graphviz.cc
index 3aeebb14f1..8913b5c3ea 100644
--- a/tensorflow/contrib/lite/toco/dump_graphviz.cc
+++ b/tensorflow/contrib/lite/toco/dump_graphviz.cc
@@ -132,6 +132,12 @@ void AppendArrayVal(string* string, Array const& array, int index) {
       return;
     }
     AppendF(string, "%d", data[index]);
+  } else if (array.buffer->type == ArrayDataType::kBool) {
+    const auto& data = array.GetBuffer<ArrayDataType::kBool>().data;
+    if (index >= data.size()) {
+      return;
+    }
+    AppendF(string, "%d", data[index]);
   }
 }
 
-- 
GitLab


From 508860fa5b28827e9425db0b3462c0fa8ed34ae5 Mon Sep 17 00:00:00 2001
From: Blake Hechtman <blakehechtman@google.com>
Date: Fri, 1 Jun 2018 11:34:57 -0700
Subject: [PATCH 175/610] [TF2XLA] Decompose resize bilinear with large filters
 to work on dimensions independently.

PiperOrigin-RevId: 198902279
---
 tensorflow/compiler/tests/image_ops_test.py   |  39 +++-
 .../tf2xla/kernels/image_resize_ops.cc        | 183 +++++++++++++-----
 2 files changed, 168 insertions(+), 54 deletions(-)

diff --git a/tensorflow/compiler/tests/image_ops_test.py b/tensorflow/compiler/tests/image_ops_test.py
index 42e637734c..7cf953ef25 100644
--- a/tensorflow/compiler/tests/image_ops_test.py
+++ b/tensorflow/compiler/tests/image_ops_test.py
@@ -65,9 +65,7 @@ class RGBToHSVTest(XLATestCase):
         join1 = array_ops.stack(split1)
         join2 = array_ops.stack(split2)
         batch1, batch2, join1, join2 = sess.run([batch1, batch2, join1, join2],
-                                                {
-                                                    batch0: inp
-                                                })
+                                                {batch0: inp})
 
       # Verify that processing batch elements together is the same as separate
       self.assertAllClose(batch1, join1)
@@ -401,9 +399,7 @@ class AdjustSaturationTest(XLATestCase):
           x = array_ops.placeholder(dtypes.float32, shape=x_shape)
           with self.test_scope():
             y_fused = self._adjust_saturation(x,
-                                              scale).eval(feed_dict={
-                                                  x: x_np
-                                              })
+                                              scale).eval(feed_dict={x: x_np})
           self.assertAllClose(y_fused, y_baseline, rtol=2e-5, atol=1e-5)
 
 
@@ -412,7 +408,8 @@ class ResizeBilinearTest(XLATestCase):
   def _assertForwardOpMatchesExpected(self,
                                       image_np,
                                       target_shape,
-                                      expected=None):
+                                      expected=None,
+                                      large_tolerance=False):
     if expected is None:
       self.fail("expected must be specified")
     with self.test_session() as sess, self.test_scope():
@@ -420,7 +417,11 @@ class ResizeBilinearTest(XLATestCase):
       resized = gen_image_ops.resize_bilinear(
           image, target_shape, align_corners=True)
       out = sess.run(resized, {image: image_np[np.newaxis, :, :, np.newaxis]})
-      self.assertAllClose(expected[np.newaxis, :, :, np.newaxis], out)
+      if large_tolerance:
+        self.assertAllClose(
+            expected[np.newaxis, :, :, np.newaxis], out, rtol=0.03, atol=0.1)
+      else:
+        self.assertAllClose(expected[np.newaxis, :, :, np.newaxis], out)
 
   def _assertBackwardOpMatchesExpected(self,
                                        grads_np,
@@ -555,6 +556,28 @@ class ResizeBilinearTest(XLATestCase):
               [[12.5, 27.5, 21.875], [42.5, 80.0, 57.5], [40.625, 72.5, 50]],
               dtype=np.float32))
 
+  def testAlignCorners4x4To8x8(self):
+    self._assertForwardOpMatchesExpected(
+        (np.array([[0, 1, 2, 3]], dtype=np.float32) + np.array(
+            [[0], [1], [2], [3]], dtype=np.float32)) * 7.0, [8, 8],
+        expected=3 *
+        (np.array([[0, 1, 2, 3, 4, 5, 6, 7]], dtype=np.float32) + np.array(
+            [[0], [1], [2], [3], [4], [5], [6], [7]], dtype=np.float32)),
+        large_tolerance=True)
+
+  def testAlignCorners8x8To16x16(self):
+    self._assertForwardOpMatchesExpected(
+        (np.array([[0, 1, 2, 3, 4, 5, 6, 7]], dtype=np.float32) + np.array(
+            [[0], [1], [2], [3], [4], [5], [6], [7]], dtype=np.float32)) * 15.0,
+        [16, 16],
+        expected=7 * (np.array(
+            [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]],
+            dtype=np.float32) + np.array(
+                [[0], [1], [2], [3], [4], [5], [6], [7], [8], [9], [10], [11],
+                 [12], [13], [14], [15]],
+                dtype=np.float32)),
+        large_tolerance=True)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc b/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc
index 9058cbc747..91bff995a1 100644
--- a/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc
@@ -99,27 +99,34 @@ ResizeConvolutionDims ComputeResizeConvolutionParameters(
   return dims;
 }
 
+// Form a 2D convolution kernel like:
+//       1 2 3 2 1
+//       2 4 6 4 2
+// 1/9 * 3 6 9 6 3
+//       2 4 6 4 2
+//       1 2 3 2 1
+// by multiplying two 1D kernels of the form:
+// 1/3 * [1 2 3 2 1]
+// If the 2D kernel would be very large, the 1D kernel can be applied once in
+// each dimension due to the symmetry of the kernel along all axis to reduce the
+// computational intensity.
+std::vector<float> Make1DKernel(int64 n) {
+  std::vector<float> kernel(n * 2 - 1);
+  for (int64 i = 0; i < n; ++i) {
+    float v = (i + 1.0f) / n;
+    kernel[i] = v;
+    kernel[n * 2 - 2 - i] = v;
+  }
+  return kernel;
+}
+
+// Kernels with more than 16 spatial elements are considered intense and the
+// kernel should be applied to each dimension independently.
+const int64 kMax2DKernelSize = 16;
+
 xla::XlaOp MakeBilinearResizeKernel(xla::XlaBuilder* builder,
                                     gtl::ArraySlice<int64> kernel_size,
                                     int64 channels) {
-  // Form a 2D convolution kernel like:
-  //       1 2 3 2 1
-  //       2 4 6 4 2
-  // 1/9 * 3 6 9 6 3
-  //       2 4 6 4 2
-  //       1 2 3 2 1
-  // by multiplying two 1D kernels of the form:
-  // 1/3 * [1 2 3 2 1]
-  auto make_1d_kernel = [](int64 n) {
-    std::vector<float> kernel(n * 2 - 1);
-    for (int64 i = 0; i < n; ++i) {
-      float v = (i + 1.0f) / n;
-      kernel[i] = v;
-      kernel[n * 2 - 2 - i] = v;
-    }
-    return kernel;
-  };
-
   xla::XlaOp channels_iota;
   // DT_INT32 Iota will always return status::OK().
   TF_CHECK_OK(
@@ -133,12 +140,37 @@ xla::XlaOp MakeBilinearResizeKernel(xla::XlaBuilder* builder,
       xla::PrimitiveType::F32);
   return builder->Mul(
       builder->Mul(diag,
-                   builder->ConstantR1<float>(make_1d_kernel(kernel_size[1])),
+                   builder->ConstantR1<float>(Make1DKernel(kernel_size[1])),
                    /*broadcast_dimensions=*/{1}),
-      builder->ConstantR1<float>(make_1d_kernel(kernel_size[0])),
+      builder->ConstantR1<float>(Make1DKernel(kernel_size[0])),
       /*broadcast_dimensions=*/{0});
 }
 
+xla::XlaOp MakeBilinearResizeKernelInDim(xla::XlaBuilder* builder,
+                                         gtl::ArraySlice<int64> kernel_size,
+                                         int64 channels, int64 dim) {
+  xla::XlaOp channels_iota;
+  // DT_INT32 Iota will always return status::OK().
+  TF_CHECK_OK(
+      XlaHelpers::Iota(builder, DataType::DT_INT32, channels, &channels_iota));
+
+  auto diag = builder->ConvertElementType(
+      builder->Eq(builder->Broadcast(
+                      channels_iota,
+                      {dim == 0 ? (2 * kernel_size[0] - 1) : 1,
+                       dim == 1 ? (2 * kernel_size[1] - 1) : 1, channels}),
+                  channels_iota, /*broadcast_dimensions=*/{2}),
+      xla::PrimitiveType::F32);
+  if (dim == 1) {
+    return builder->Mul(
+        diag, builder->ConstantR1<float>(Make1DKernel(kernel_size[1])),
+        /*broadcast_dimensions=*/{1});
+  }
+  return builder->Mul(diag,
+                      builder->ConstantR1<float>(Make1DKernel(kernel_size[0])),
+                      /*broadcast_dimensions=*/{0});
+}
+
 xla::XlaOp ResizeUsingDilationAndConvolution(xla::XlaBuilder* builder,
                                              const xla::XlaOp& input,
                                              const int num_spatial_dims,
@@ -170,15 +202,37 @@ xla::XlaOp ResizeUsingDilationAndConvolution(xla::XlaBuilder* builder,
 
   ResizeConvolutionDims dims =
       ComputeResizeConvolutionParameters(in_size, out_size);
-  xla::XlaOp kernel =
-      MakeBilinearResizeKernel(builder, dims.kernel_size, channels);
-  xla::XlaOp output = builder->ConvGeneralDilated(
-      input, kernel, dims.stride,
-      /*padding=*/
-      {{dims.kernel_size[0] - 1, dims.kernel_size[0] - 1},
-       {dims.kernel_size[1] - 1, dims.kernel_size[1] - 1}},
-      /*lhs_dilation=*/dims.kernel_size,
-      /*rhs_dilation=*/{1, 1}, dimension_numbers);
+  xla::XlaOp output;
+  // Split convolutions into independent dimensions if they would be a very
+  // large kernel.
+  if (dims.kernel_size[0] * dims.kernel_size[1] < kMax2DKernelSize) {
+    xla::XlaOp kernel =
+        MakeBilinearResizeKernel(builder, dims.kernel_size, channels);
+    output = builder->ConvGeneralDilated(
+        input, kernel, dims.stride,
+        /*padding=*/
+        {{dims.kernel_size[0] - 1, dims.kernel_size[0] - 1},
+         {dims.kernel_size[1] - 1, dims.kernel_size[1] - 1}},
+        /*lhs_dilation=*/dims.kernel_size,
+        /*rhs_dilation=*/{1, 1}, dimension_numbers);
+  } else {
+    xla::XlaOp kernel0 =
+        MakeBilinearResizeKernelInDim(builder, dims.kernel_size, channels, 0);
+    output = builder->ConvGeneralDilated(
+        input, kernel0, {dims.stride[0], 1},
+        /*padding=*/
+        {{dims.kernel_size[0] - 1, dims.kernel_size[0] - 1}, {0, 0}},
+        /*lhs_dilation=*/{dims.kernel_size[0], 1},
+        /*rhs_dilation=*/{1, 1}, dimension_numbers);
+    xla::XlaOp kernel1 =
+        MakeBilinearResizeKernelInDim(builder, dims.kernel_size, channels, 1);
+    output = builder->ConvGeneralDilated(
+        output, kernel1, {1, dims.stride[1]},
+        /*padding=*/
+        {{0, 0}, {dims.kernel_size[1] - 1, dims.kernel_size[1] - 1}},
+        /*lhs_dilation=*/{1, dims.kernel_size[1]},
+        /*rhs_dilation=*/{1, 1}, dimension_numbers);
+  }
 
   // Add broadcasts to handle expanding from a size == 1 dimension to a
   // size > 1 dimension.
@@ -214,26 +268,63 @@ xla::XlaOp ResizeUsingDilationAndConvolutionGradOp(xla::XlaBuilder* builder,
   }
   dimension_numbers.set_kernel_input_feature_dimension(num_spatial_dims);
   dimension_numbers.set_kernel_output_feature_dimension(num_spatial_dims + 1);
-  xla::XlaOp kernel =
-      MakeBilinearResizeKernel(builder, dims.kernel_size, channels);
+  xla::XlaOp output;
+  if (dims.kernel_size[0] * dims.kernel_size[1] < kMax2DKernelSize) {
+    xla::XlaOp kernel =
+        MakeBilinearResizeKernel(builder, dims.kernel_size, channels);
+
+    // Broadcast the input kernel where the forward op expanded from a size == 1
+    // dimension to a size > 1 dimension. This has the effect of summing the
+    // gradient contributions in that dimension.
+    for (int i = 0; i < num_spatial_dims; ++i) {
+      if (in_size[i] == 1 && grad_size[i] > 1) {
+        kernel =
+            builder->Add(kernel, builder->ConstantR1<float>(grad_size[i], 0),
+                         /*broadcast_dimensions=*/{i});
+      }
+    }
 
-  // Broadcast the input kernel where the forward op expanded from a size == 1
-  // dimension to a size > 1 dimension. This has the effect of summing the
-  // gradient contributions in that dimension.
-  for (int i = 0; i < num_spatial_dims; ++i) {
-    if (in_size[i] == 1 && grad_size[i] > 1) {
-      kernel = builder->Add(kernel, builder->ConstantR1<float>(grad_size[i], 0),
-                            /*broadcast_dimensions=*/{i});
+    output = builder->ConvGeneralDilated(
+        grad, kernel, /*window_strides=*/dims.kernel_size,
+        /*padding=*/
+        {{dims.kernel_size[0] - 1, dims.kernel_size[0] - 1},
+         {dims.kernel_size[1] - 1, dims.kernel_size[1] - 1}},
+        /*lhs_dilation=*/dims.stride,
+        /*rhs_dilation=*/{1, 1}, dimension_numbers);
+  } else {
+    xla::XlaOp kernel0 =
+        MakeBilinearResizeKernelInDim(builder, dims.kernel_size, channels, 0);
+    xla::XlaOp kernel1 =
+        MakeBilinearResizeKernelInDim(builder, dims.kernel_size, channels, 1);
+
+    // Broadcast the input kernel where the forward op expanded from a size == 1
+    // dimension to a size > 1 dimension. This has the effect of summing the
+    // gradient contributions in that dimension.
+    if (in_size[0] == 1 && grad_size[0] > 1) {
+      kernel0 =
+          builder->Add(kernel0, builder->ConstantR1<float>(grad_size[0], 0),
+                       /*broadcast_dimensions=*/{0});
+    }
+    if (in_size[1] == 1 && grad_size[1] > 1) {
+      kernel1 =
+          builder->Add(kernel0, builder->ConstantR1<float>(grad_size[1], 0),
+                       /*broadcast_dimensions=*/{1});
     }
-  }
 
-  xla::XlaOp output = builder->ConvGeneralDilated(
-      grad, kernel, /*window_strides=*/dims.kernel_size,
-      /*padding=*/
-      {{dims.kernel_size[0] - 1, dims.kernel_size[0] - 1},
-       {dims.kernel_size[1] - 1, dims.kernel_size[1] - 1}},
-      /*lhs_dilation=*/dims.stride,
-      /*rhs_dilation=*/{1, 1}, dimension_numbers);
+    output = builder->ConvGeneralDilated(
+        grad, kernel0, /*window_strides=*/{dims.kernel_size[0], 1},
+        /*padding=*/
+        {{dims.kernel_size[0] - 1, dims.kernel_size[0] - 1}, {0, 0}},
+        /*lhs_dilation=*/{dims.stride[0], 1},
+        /*rhs_dilation=*/{1, 1}, dimension_numbers);
+
+    output = builder->ConvGeneralDilated(
+        output, kernel1, /*window_strides=*/{1, dims.kernel_size[1]},
+        /*padding=*/
+        {{0, 0}, {dims.kernel_size[1] - 1, dims.kernel_size[1] - 1}},
+        /*lhs_dilation=*/{1, dims.stride[1]},
+        /*rhs_dilation=*/{1, 1}, dimension_numbers);
+  }
 
   // If in_size[i] > 1 and grad_size[i] == 1, pad the output in dimension i.
   // Opposite of the slice performed by the forward op.
-- 
GitLab


From 5fa6409cbb7476697acc07bbd35f1a6c1597c845 Mon Sep 17 00:00:00 2001
From: Sanjoy Das <sanjoy@google.com>
Date: Fri, 1 Jun 2018 12:02:05 -0700
Subject: [PATCH 176/610] [TF:XLA] Bump open source llvm revision to r333578

PiperOrigin-RevId: 198906281
---
 tensorflow/workspace.bzl | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 16c1846e17..0672615d5e 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -453,11 +453,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   tf_http_archive(
       name = "llvm",
       urls = [
-          "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/bf13d093f13a295d71080614c3036ada591201d5.tar.gz",
-          "https://github.com/llvm-mirror/llvm/archive/bf13d093f13a295d71080614c3036ada591201d5.tar.gz",
+          "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/80f62ff390cc9440ef48ccac94ea6f7f51da3b93.tar.gz",
+          "https://github.com/llvm-mirror/llvm/archive/80f62ff390cc9440ef48ccac94ea6f7f51da3b93.tar.gz",
       ],
-      sha256 = "3c5b4538a4df95090693bf6b758e861afc5b8c599592368f9dc57901f7560bd0",
-      strip_prefix = "llvm-bf13d093f13a295d71080614c3036ada591201d5",
+      sha256 = "119e7d9687a20103088677d5157cf70352392a423943de3cb549f6e4638edc59",
+      strip_prefix = "llvm-80f62ff390cc9440ef48ccac94ea6f7f51da3b93",
       build_file = clean_dep("//third_party/llvm:llvm.BUILD"),
   )
 
-- 
GitLab


From 10b2b3b44a6f93f4fd414e8ac450587ece2207ae Mon Sep 17 00:00:00 2001
From: Peter Hawkins <phawkins@google.com>
Date: Fri, 1 Jun 2018 12:20:08 -0700
Subject: [PATCH 177/610] [TF:XLA] Refactor implementation of TruncatedNormal
 to avoid redundant computations. Add an additional test.

PiperOrigin-RevId: 198908904
---
 tensorflow/compiler/tests/random_ops_test.py  |  7 +++
 .../compiler/tf2xla/kernels/random_ops.cc     | 62 +++++++++----------
 2 files changed, 38 insertions(+), 31 deletions(-)

diff --git a/tensorflow/compiler/tests/random_ops_test.py b/tensorflow/compiler/tests/random_ops_test.py
index d6c93088d4..70be22936a 100644
--- a/tensorflow/compiler/tests/random_ops_test.py
+++ b/tensorflow/compiler/tests/random_ops_test.py
@@ -76,6 +76,13 @@ class RandomOpsTest(XLATestCase):
         self.assertTrue((y >= -2).sum() == 1000)
         self.assertTrue((y < 33).sum() == 1000)
 
+  def testTruncatedNormalIsNotConstant(self):
+    def rng(dtype):
+      return random_ops.truncated_normal(shape=[2], dtype=dtype)
+
+    # TODO(b/34339814): implement inverse erf support for non-F32 types.
+    self._testRngIsNotConstant(rng, dtypes.float32)
+
   def testTruncatedNormalIsInRange(self):
     count = 10000
     # TODO(b/34339814): implement inverse erf support for non-F32 types.
diff --git a/tensorflow/compiler/tf2xla/kernels/random_ops.cc b/tensorflow/compiler/tf2xla/kernels/random_ops.cc
index 5f5bd58637..39149d56ad 100644
--- a/tensorflow/compiler/tf2xla/kernels/random_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/random_ops.cc
@@ -17,6 +17,7 @@ limitations under the License.
 // TODO(misard,phawkins): handle random number generator seeds/states correctly.
 // TODO(misard,phawkins): add tests.
 
+#include "tensorflow/compiler/tf2xla/lib/while_loop.h"
 #include "tensorflow/compiler/tf2xla/shape_util.h"
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
@@ -127,13 +128,8 @@ class TruncatedNormalOp : public XlaOpKernel {
     OP_REQUIRES_OK(ctx, ctx->ConstantInputAsShape(0, &shape));
     xla::Shape xla_shape;
     OP_REQUIRES_OK(ctx, TensorShapeToXLAShape(dtype, shape, &xla_shape));
-    xla::Shape xla_element_shape =
-        xla::ShapeUtil::MakeShape(xla_shape.element_type(), {});
 
     xla::XlaBuilder* b = ctx->builder();
-    xla::XlaOp mean = XlaHelpers::Zero(b, dtype);
-    xla::XlaOp stddev = XlaHelpers::One(b, dtype);
-    xla::XlaOp candidate = b->RngNormal(mean, stddev, xla_shape);
 
     auto two_sd = [dtype](bool negate, xla::XlaBuilder* b) {
       return XlaHelpers::FloatLiteral(b, dtype, negate ? -2.0 : 2.0);
@@ -151,34 +147,38 @@ class TruncatedNormalOp : public XlaOpKernel {
     //   out_of_range_mask := candidate < mean-2*sd || candidate > mean+2*sd
     //   candidate = select(out_of_range_mask, rng_normal(), candidate)
     // }
-    std::unique_ptr<xla::XlaBuilder> test_builder =
-        b->CreateSubBuilder("truncated_normal_test");
-    {
-      auto* b = test_builder.get();
-      xla::XlaOp candidate = b->Parameter(0, xla_shape, "candidate");
-      out_of_range_mask(candidate, b);
-      OP_REQUIRES_OK(ctx, Any(out_of_range_mask(candidate, b), b).status());
-    }
-
-    std::unique_ptr<xla::XlaBuilder> body_builder =
-        b->CreateSubBuilder("truncated_normal_body");
-    {
-      auto* b = body_builder.get();
-      xla::XlaOp candidate = b->Parameter(0, xla_shape, "candidate");
-      xla::XlaOp to_resample = out_of_range_mask(candidate, b);
+    std::vector<xla::XlaOp> initial_values = {
+        // The current candidate.
+        b->Broadcast(XlaHelpers::Zero(b, dtype), shape.dim_sizes()),
+        // The to_resample mask, where 'true' identifies a location in the
+        // current candidate that is out of range and must be regenerated.
+        b->Broadcast(b->ConstantR0<bool>(true), shape.dim_sizes()),
+        // Is any element in the mask true?
+        b->ConstantR0<bool>(true)};
+    auto condition = [&](gtl::ArraySlice<xla::XlaOp> values,
+                         xla::XlaBuilder* b) -> xla::StatusOr<xla::XlaOp> {
+      // Continue while any element in the mask is true.
+      return values[2];
+    };
+    auto body =
+        [&](gtl::ArraySlice<xla::XlaOp> values,
+            xla::XlaBuilder* b) -> xla::StatusOr<std::vector<xla::XlaOp>> {
+      xla::XlaOp candidate = values[0];
+      xla::XlaOp to_resample = values[1];
       xla::XlaOp mean = XlaHelpers::Zero(b, dtype);
       xla::XlaOp stddev = XlaHelpers::One(b, dtype);
-      b->Select(to_resample, b->RngNormal(mean, stddev, xla_shape), candidate);
-    }
-
-    xla::StatusOr<xla::XlaComputation> test_computation = test_builder->Build();
-    OP_REQUIRES_OK(ctx, test_computation.status());
-    xla::StatusOr<xla::XlaComputation> body_computation = body_builder->Build();
-    OP_REQUIRES_OK(ctx, body_computation.status());
-    xla::XlaOp result = b->While(test_computation.ValueOrDie(),
-                                 body_computation.ValueOrDie(), candidate);
-
-    ctx->SetOutput(0, result);
+      candidate = b->Select(to_resample, b->RngNormal(mean, stddev, xla_shape),
+                            candidate);
+      // Compute a new to_resample mask, and determine whether any value is
+      // still out of range.
+      to_resample = out_of_range_mask(candidate, b);
+      TF_ASSIGN_OR_RETURN(xla::XlaOp done, Any(to_resample, b));
+      return std::vector<xla::XlaOp>{candidate, to_resample, done};
+    };
+    auto result =
+        XlaWhileLoop(condition, body, initial_values, "truncated_normal", b);
+    OP_REQUIRES_OK(ctx, result.status());
+    ctx->SetOutput(0, result.ValueOrDie()[0]);
   }
 };
 
-- 
GitLab


From eebb9e0449b38703869ae7ccd0aa2c649f9f5aaf Mon Sep 17 00:00:00 2001
From: Clayne Robison <clayne.b.robison@intel.com>
Date: Fri, 1 Jun 2018 12:29:39 -0700
Subject: [PATCH 178/610] Finished incomplete support for bad usernames in the
 CI build scripts. ci_build.sh now passes the environment variable to the
 container, and the with_the_same_user script adds the --force-badname param
 to addgroup as well. (#19699)

---
 tensorflow/tools/ci_build/builds/with_the_same_user | 2 +-
 tensorflow/tools/ci_build/ci_build.sh               | 7 +++++++
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/tensorflow/tools/ci_build/builds/with_the_same_user b/tensorflow/tools/ci_build/builds/with_the_same_user
index d4bf546d40..b216e3549f 100755
--- a/tensorflow/tools/ci_build/builds/with_the_same_user
+++ b/tensorflow/tools/ci_build/builds/with_the_same_user
@@ -40,7 +40,7 @@ if [ -n "${CI_BUILD_USER_FORCE_BADNAME}" ]; then
   ADDUSER_OPTS="--force-badname"
 fi
 
-getent group "${CI_BUILD_GID}" || addgroup --gid "${CI_BUILD_GID}" "${CI_BUILD_GROUP}"
+getent group "${CI_BUILD_GID}" || addgroup ${ADDUSER_OPTS} --gid "${CI_BUILD_GID}" "${CI_BUILD_GROUP}"
 getent passwd "${CI_BUILD_UID}" || adduser ${ADDUSER_OPTS} \
     --gid "${CI_BUILD_GID}" --uid "${CI_BUILD_UID}" \
     --gecos "${CI_BUILD_USER} (generated by with_the_same_user script)" \
diff --git a/tensorflow/tools/ci_build/ci_build.sh b/tensorflow/tools/ci_build/ci_build.sh
index 072dd6ab99..1f0fd0387a 100755
--- a/tensorflow/tools/ci_build/ci_build.sh
+++ b/tensorflow/tools/ci_build/ci_build.sh
@@ -134,6 +134,12 @@ if [[ $? != "0" ]]; then
   die "ERROR: docker build failed. Dockerfile is at ${DOCKERFILE_PATH}"
 fi
 
+# If caller wants the with_the_same_user script to allow bad usernames, 
+# pass the var to the docker environment
+if [ -n "${CI_BUILD_USER_FORCE_BADNAME}" ]; then
+        CI_BUILD_USER_FORCE_BADNAME_ENV="-e CI_BUILD_USER_FORCE_BADNAME=yes"
+fi
+
 # Run the command inside the container.
 echo "Running '${COMMAND[*]}' inside ${DOCKER_IMG_NAME}..."
 mkdir -p ${WORKSPACE}/bazel-ci_build-cache
@@ -148,6 +154,7 @@ ${DOCKER_BINARY} run --rm --pid=host \
     -e "CI_BUILD_GROUP=$(id -g -n)" \
     -e "CI_BUILD_GID=$(id -g)" \
     -e "CI_TENSORFLOW_SUBMODULE_PATH=${CI_TENSORFLOW_SUBMODULE_PATH}" \
+    ${CI_BUILD_USER_FORCE_BADNAME_ENV} \
     -v ${WORKSPACE}:/workspace \
     -w /workspace \
     ${GPU_EXTRA_PARAMS} \
-- 
GitLab


From b812f37e26889bb168fa0279a536b907c3fb5fdd Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 1 Jun 2018 12:53:54 -0700
Subject: [PATCH 179/610] TFLite: adding tile and expand_dims ops.

PiperOrigin-RevId: 198913026
---
 tensorflow/contrib/lite/build_def.bzl         |   2 +
 tensorflow/contrib/lite/builtin_ops.h         |   2 +
 tensorflow/contrib/lite/kernels/BUILD         |  31 +++
 .../contrib/lite/kernels/expand_dims.cc       | 113 ++++++++
 .../contrib/lite/kernels/expand_dims_test.cc  |  83 ++++++
 tensorflow/contrib/lite/kernels/register.cc   |   4 +
 tensorflow/contrib/lite/kernels/tile.cc       | 194 +++++++++++++
 tensorflow/contrib/lite/kernels/tile_test.cc  | 256 ++++++++++++++++++
 tensorflow/contrib/lite/model.cc              |   4 +
 tensorflow/contrib/lite/nnapi_delegate.cc     |   2 +
 tensorflow/contrib/lite/schema/schema.fbs     |  10 +
 .../contrib/lite/schema/schema_generated.h    | 236 +++++++++++++++-
 .../contrib/lite/testing/generate_examples.py |  67 +++++
 .../contrib/lite/toco/tflite/operator.cc      |  38 +++
 14 files changed, 1036 insertions(+), 6 deletions(-)
 create mode 100644 tensorflow/contrib/lite/kernels/expand_dims.cc
 create mode 100644 tensorflow/contrib/lite/kernels/expand_dims_test.cc
 create mode 100644 tensorflow/contrib/lite/kernels/tile.cc
 create mode 100644 tensorflow/contrib/lite/kernels/tile_test.cc

diff --git a/tensorflow/contrib/lite/build_def.bzl b/tensorflow/contrib/lite/build_def.bzl
index b9e40cc50c..aa6a60dc9e 100644
--- a/tensorflow/contrib/lite/build_def.bzl
+++ b/tensorflow/contrib/lite/build_def.bzl
@@ -205,6 +205,7 @@ def generated_test_models():
         "depthwiseconv",
         "div",
         "exp",
+        "expand_dims",
         "floor",
         "fully_connected",
         "fused_batch_norm",
@@ -245,6 +246,7 @@ def generated_test_models():
         "strided_slice",
         "strided_slice_1d_exhaustive",
         "sub",
+        "tile",
         "topk",
         "transpose",
         "transpose_conv",
diff --git a/tensorflow/contrib/lite/builtin_ops.h b/tensorflow/contrib/lite/builtin_ops.h
index c797e3589a..fc6fdd6eef 100644
--- a/tensorflow/contrib/lite/builtin_ops.h
+++ b/tensorflow/contrib/lite/builtin_ops.h
@@ -94,6 +94,8 @@ typedef enum {
   kTfLiteBuiltinSin = 66,
   kTfLiteBuiltinTransposeConv = 67,
   kTfLiteBuiltinSparseToDense = 68,
+  kTfLiteBuiltinTile = 69,
+  kTfLiteBuiltinExpandDims = 70,
 } TfLiteBuiltinOperator;
 
 #ifdef __cplusplus
diff --git a/tensorflow/contrib/lite/kernels/BUILD b/tensorflow/contrib/lite/kernels/BUILD
index 0af659b5ca..cf5d0b4ce9 100644
--- a/tensorflow/contrib/lite/kernels/BUILD
+++ b/tensorflow/contrib/lite/kernels/BUILD
@@ -147,6 +147,7 @@ cc_library(
         "embedding_lookup.cc",
         "embedding_lookup_sparse.cc",
         "exp.cc",
+        "expand_dims.cc",
         "floor.cc",
         "fully_connected.cc",
         "gather.cc",
@@ -176,6 +177,7 @@ cc_library(
         "strided_slice.cc",
         "sub.cc",
         "svdf.cc",
+        "tile.cc",
         "topk_v2.cc",
         "transpose.cc",
         "transpose_conv.cc",
@@ -858,6 +860,20 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "tile_test",
+    size = "small",
+    srcs = ["tile_test.cc"],
+    tags = ["tflite_not_portable_ios"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/contrib/lite:builtin_op_data",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
 tf_cc_test(
     name = "comparisons_test",
     size = "small",
@@ -935,6 +951,20 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "expand_dims_test",
+    size = "small",
+    srcs = ["expand_dims_test.cc"],
+    tags = ["tflite_not_portable_ios"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/contrib/lite:builtin_op_data",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
 tf_cc_test(
     name = "sparse_to_dense_test",
     size = "small",
@@ -942,6 +972,7 @@ tf_cc_test(
     tags = ["tflite_not_portable_ios"],
     deps = [
         ":builtin_ops",
+        "//tensorflow/contrib/lite:builtin_op_data",
         "//tensorflow/contrib/lite:framework",
         "//tensorflow/contrib/lite/kernels:test_util",
         "@com_google_googletest//:gtest",
diff --git a/tensorflow/contrib/lite/kernels/expand_dims.cc b/tensorflow/contrib/lite/kernels/expand_dims.cc
new file mode 100644
index 0000000000..ed33012864
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/expand_dims.cc
@@ -0,0 +1,113 @@
+
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <string.h>
+#include <vector>
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace expand_dims {
+constexpr int kInput = 0;
+constexpr int kAxis = 1;
+constexpr int kOutput = 0;
+
+namespace {
+// Resizes `output` to the shape of `input` with a size-1 dimension inserted at
+// `axis`. A negative `axis` is offset by the output rank (input rank + 1).
+TfLiteStatus ExpandTensorDim(TfLiteContext* context, const TfLiteTensor& input,
+                             int axis, TfLiteTensor* output) {
+  const TfLiteIntArray& input_dims = *input.dims;
+  if (axis < 0) axis += input_dims.size + 1;
+  // An axis that is still negative after wrapping must be rejected too;
+  // otherwise the loop below would read input_dims.data[-1].
+  TF_LITE_ENSURE(context, axis >= 0 && axis <= input_dims.size);
+  TfLiteIntArray* output_dims = TfLiteIntArrayCreate(input_dims.size + 1);
+  for (int i = 0; i < output_dims->size; ++i) {
+    if (i < axis) {
+      output_dims->data[i] = input_dims.data[i];
+    } else if (i == axis) {
+      output_dims->data[i] = 1;
+    } else {
+      output_dims->data[i] = input_dims.data[i - 1];
+    }
+  }
+  return context->ResizeTensor(context, output, output_dims);
+}
+
+// Reads the single int32/int64 element of `axis` into `*axis_value`.
+TfLiteStatus GetAxisValueFromTensor(TfLiteContext* context,
+                                    const TfLiteTensor& axis, int* axis_value) {
+  TF_LITE_ENSURE_EQ(context, NumElements(&axis), 1);
+  if (axis.type == kTfLiteInt32) {
+    *axis_value = *GetTensorData<int32_t>(&axis);
+  } else if (axis.type == kTfLiteInt64) {
+    *axis_value = *GetTensorData<int64_t>(&axis);  // Narrowed to int.
+  } else {
+    context->ReportError(context, "ExpandDims axis must be int32 or int64.");
+    return kTfLiteError;
+  }
+  return kTfLiteOk;
+}
+
+}  // namespace
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);  // Data tensor and axis.
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+  const TfLiteTensor* input = GetInput(context, node, kInput);
+  const TfLiteTensor* axis = GetInput(context, node, kAxis);
+  TfLiteTensor* output = GetOutput(context, node, kOutput);
+  output->type = input->type;  // Output always takes the input's type.
+  if (!IsConstantTensor(axis)) {
+    SetTensorToDynamic(output);  // Shape is resolved later, in Eval.
+    return kTfLiteOk;
+  }
+  int axis_value;  // Constant axis: the output shape can be fixed right now.
+  TF_LITE_ENSURE_OK(context,
+                    GetAxisValueFromTensor(context, *axis, &axis_value));
+  return ExpandTensorDim(context, *input, axis_value, output);
+}
+
+// Copies the input data to the output, first resizing a dynamic output.
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  const TfLiteTensor* input = GetInput(context, node, kInput);
+  const TfLiteTensor* axis = GetInput(context, node, kAxis);
+  TfLiteTensor* output = GetOutput(context, node, kOutput);
+  if (IsDynamicTensor(output)) {
+    int axis_value;
+    TF_LITE_ENSURE_OK(context,
+                      GetAxisValueFromTensor(context, *axis, &axis_value));
+    TF_LITE_ENSURE_OK(context,
+                      ExpandTensorDim(context, *input, axis_value, output));
+  }
+  memcpy(output->data.raw, input->data.raw, input->bytes);  // Data unchanged.
+  return kTfLiteOk;
+}
+
+}  // namespace expand_dims
+TfLiteRegistration* Register_EXPAND_DIMS() {
+  static TfLiteRegistration r = {nullptr, nullptr, expand_dims::Prepare,
+                                 expand_dims::Eval};
+  return &r;  // Same function-local static instance on every call.
+}
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/expand_dims_test.cc b/tensorflow/contrib/lite/kernels/expand_dims_test.cc
new file mode 100644
index 0000000000..b755e8ce29
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/expand_dims_test.cc
@@ -0,0 +1,83 @@
+
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class ExpandDimsOpModel : public SingleOpModel {
+ public:
+  ExpandDimsOpModel(std::initializer_list<int> input_shape,
+                    TensorType input_type) {
+    input_ = AddInput(input_type);
+    axis_ = AddInput(TensorType_INT32);  // The axis is always an int32 tensor.
+    output_ = AddOutput(input_type);
+    SetBuiltinOp(BuiltinOperator_EXPAND_DIMS, BuiltinOptions_ExpandDimsOptions,
+                 0);
+    BuildInterpreter({input_shape, {1}});  // Presumably one shape per input.
+  }
+  void SetInputFloat(std::initializer_list<float> data) {
+    PopulateTensor<float>(input_, data);
+  }
+  void SetAxis(int axis) { PopulateTensor<int32>(axis_, {axis}); }
+  std::vector<float> GetValuesFloat() { return ExtractVector<float>(output_); }
+  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+
+ protected:
+  int input_;   // Value returned by AddInput for the data tensor.
+  int axis_;    // Value returned by AddInput for the axis tensor.
+  int output_;  // Value returned by AddOutput.
+};
+
+TEST(ExpandDimsOpTest, DifferentAxis) {
+  ExpandDimsOpModel m({2, 2}, TensorType_FLOAT32);
+  const auto values = {-1.f, 1.f, -2.f, 2.f};
+  m.SetInputFloat(values);
+  m.SetAxis(0);  // New dimension in front.
+  m.Invoke();
+  EXPECT_THAT(m.GetValuesFloat(), ElementsAreArray(values));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({1, 2, 2}));
+
+  m.SetAxis(1);  // New dimension in the middle; same model, re-invoked.
+  m.Invoke();
+  EXPECT_THAT(m.GetValuesFloat(), ElementsAreArray(values));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 1, 2}));
+
+  m.SetAxis(2);  // New dimension at the end.
+  m.Invoke();
+  EXPECT_THAT(m.GetValuesFloat(), ElementsAreArray(values));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 2, 1}));
+
+  m.SetAxis(-1);  // Negative axis: expected to match the axis == 2 case.
+  m.Invoke();
+  EXPECT_THAT(m.GetValuesFloat(), ElementsAreArray(values));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 2, 1}));
+}
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/register.cc b/tensorflow/contrib/lite/kernels/register.cc
index 4eea9921b2..c7d72738d6 100644
--- a/tensorflow/contrib/lite/kernels/register.cc
+++ b/tensorflow/contrib/lite/kernels/register.cc
@@ -85,11 +85,13 @@ TfLiteRegistration* Register_GREATER_EQUAL();
 TfLiteRegistration* Register_LESS();
 TfLiteRegistration* Register_LESS_EQUAL();
 TfLiteRegistration* Register_FLOOR();
+TfLiteRegistration* Register_TILE();
 TfLiteRegistration* Register_NEG();
 TfLiteRegistration* Register_SELECT();
 TfLiteRegistration* Register_SLICE();
 TfLiteRegistration* Register_SIN();
 TfLiteRegistration* Register_TRANSPOSE_CONV();
+TfLiteRegistration* Register_EXPAND_DIMS();
 TfLiteRegistration* Register_SPARSE_TO_DENSE();
 
 BuiltinOpResolver::BuiltinOpResolver() {
@@ -162,6 +164,8 @@ BuiltinOpResolver::BuiltinOpResolver() {
   AddBuiltin(BuiltinOperator_SLICE, Register_SLICE());
   AddBuiltin(BuiltinOperator_SIN, Register_SIN());
   AddBuiltin(BuiltinOperator_TRANSPOSE_CONV, Register_TRANSPOSE_CONV());
+  AddBuiltin(BuiltinOperator_TILE, Register_TILE());
+  AddBuiltin(BuiltinOperator_EXPAND_DIMS, Register_EXPAND_DIMS());
   AddBuiltin(BuiltinOperator_SPARSE_TO_DENSE, Register_SPARSE_TO_DENSE());
 
   // TODO(andrewharp, ahentz): Move these somewhere more appropriate so that
diff --git a/tensorflow/contrib/lite/kernels/tile.cc b/tensorflow/contrib/lite/kernels/tile.cc
new file mode 100644
index 0000000000..af77f07474
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/tile.cc
@@ -0,0 +1,194 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <string.h>
+#include <vector>
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/kernels/op_macros.h"
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace tile {
+
+constexpr int kInputTensor = 0;
+constexpr int kInputMultipliers = 1;
+constexpr int kOutputTensor = 0;
+
+namespace {
+// Builds a new shape whose i-th entry is shape[i] * multipliers[i].
+template <typename T>
+TfLiteIntArray* MultiplyShapeDims(const TfLiteIntArray& shape,
+                                  const TfLiteTensor* multipliers,
+                                  int num_dimensions) {
+  const T* factors = GetTensorData<T>(multipliers);
+  TfLiteIntArray* tiled_shape = TfLiteIntArrayCreate(num_dimensions);
+  for (int dim = 0; dim < num_dimensions; ++dim) {
+    tiled_shape->data[dim] = shape.data[dim] * factors[dim];
+  }
+  return tiled_shape;
+}
+
+// Resizes the output to the input shape scaled per dimension by `multipliers`.
+TfLiteStatus ResizeOutput(TfLiteContext* context, TfLiteNode* node) {
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+  const TfLiteTensor* multipliers = GetInput(context, node, kInputMultipliers);
+
+  // Exactly one multiplier is required per input dimension.
+  const int num_dimensions = NumDimensions(input);
+  const int num_multipliers = NumElements(multipliers);
+  TF_LITE_ENSURE_EQ(context, num_dimensions, num_multipliers);
+
+  TfLiteIntArray* tiled_shape = nullptr;
+  if (multipliers->type == kTfLiteInt32) {
+    tiled_shape =
+        MultiplyShapeDims<int32_t>(*input->dims, multipliers, num_dimensions);
+  } else if (multipliers->type == kTfLiteInt64) {
+    tiled_shape =
+        MultiplyShapeDims<int64_t>(*input->dims, multipliers, num_dimensions);
+  } else {
+    context->ReportError(context, "Tile not supported multiply tensor type.");
+    return kTfLiteError;
+  }
+  return context->ResizeTensor(context, output, tiled_shape);
+}
+
+// Writes `multiplier` back-to-back copies of in_data[0, in_size) to out_data.
+template <typename T>
+void CopyMultipleTimes(const T* in_data, int32_t in_size, int32_t multiplier,
+                       T* out_data) {
+  for (int rep = 0; rep < multiplier; ++rep) {
+    T* next_out = std::copy(in_data, in_data + in_size, out_data);
+    in_data = out_data;  // Later copies read the block just written.
+    out_data = next_out;
+  }
+}
+
+// Recursively tiles `in_data` into `out_data` starting at axis `dimension`.
+// Returns {elements consumed from in_data, elements written to out_data}.
+template <typename T, typename M>
+std::pair<int, int> TileOneDimension(const TfLiteIntArray& in_dimensions,
+                                     const T* in_data, const M* multipliers,
+                                     T* out_data, int dimension) {
+  const int dimension_size = in_dimensions.data[dimension];
+  if (dimension == in_dimensions.size - 1) {
+    // Last axis: repeat its dimension_size elements directly.
+    CopyMultipleTimes(in_data, dimension_size, multipliers[dimension],
+                      out_data);
+    return std::make_pair(dimension_size,
+                          dimension_size * multipliers[dimension]);
+  }
+
+  // Tile each slice along this axis, writing the results back to back.
+  int consumed = 0, produced = 0;
+  for (int i = 0; i < dimension_size; ++i) {
+    const std::pair<int, int> sub =
+        TileOneDimension(in_dimensions, in_data + consumed, multipliers,
+                         out_data + produced, dimension + 1);
+    consumed += sub.first;
+    produced += sub.second;
+  }
+
+  // Repeat the already tiled block for the remaining multiples of this axis.
+  CopyMultipleTimes(out_data, produced, multipliers[dimension] - 1,
+                    out_data + produced);
+  return std::make_pair(consumed, produced * multipliers[dimension]);
+}
+
+// Tiles `in_data` into `out_data`, dispatching on the multipliers' type.
+// `in_dimensions` is the input shape; `T` is the element type of the data.
+template <typename T>
+void Tile(const TfLiteIntArray& in_dimensions, const TfLiteTensor* in_data,
+          const TfLiteTensor* multipliers, TfLiteTensor* out_data) {
+  // Tiling recurses from the outermost dimension (index 0) inwards.
+  if (multipliers->type == kTfLiteInt32) {
+    TileOneDimension(in_dimensions, GetTensorData<T>(in_data),
+                     GetTensorData<int32_t>(multipliers),
+                     GetTensorData<T>(out_data), 0);
+  } else if (multipliers->type == kTfLiteInt64) {
+    TileOneDimension(in_dimensions, GetTensorData<T>(in_data),
+                     GetTensorData<int64_t>(multipliers),
+                     GetTensorData<T>(out_data), 0);
+  } else {
+    // Any other multiplier type is a no-op here: nothing is written to
+    // out_data and no error is raised from this function.
+  }
+}
+}  // namespace
+
+// Validates the node and sizes the output: immediately when the multipliers
+// are constant, otherwise deferred to Eval via a dynamic output tensor.
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+  TF_LITE_ENSURE_EQ(context, input->type, output->type);
+
+  const TfLiteTensor* multipliers = GetInput(context, node, kInputMultipliers);
+  // Only int32 and int64 multiplier types are supported.
+  TF_LITE_ENSURE_MSG(context,
+                     (multipliers->type == kTfLiteInt32) ||
+                         (multipliers->type == kTfLiteInt64),
+                     "Tile only supports int32 and int64 multipliers.");
+
+  if (IsConstantTensor(multipliers)) {
+    TF_LITE_ENSURE_OK(context, ResizeOutput(context, node));
+  } else {
+    SetTensorToDynamic(output);
+  }
+  return kTfLiteOk;
+}
+
+// Tiles the input into the output. An output that is still dynamic at this
+// point is resized first.
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+  const TfLiteTensor* multipliers = GetInput(context, node, kInputMultipliers);
+  if (IsDynamicTensor(output)) {
+    TF_LITE_ENSURE_OK(context, ResizeOutput(context, node));
+  }
+  // Dispatch on the element type of the data being tiled.
+  switch (output->type) {
+    case kTfLiteFloat32:
+      Tile<float>(*(input->dims), input, multipliers, output);
+      return kTfLiteOk;
+    case kTfLiteUInt8:
+      Tile<uint8_t>(*(input->dims), input, multipliers, output);
+      return kTfLiteOk;
+    case kTfLiteInt32:
+      Tile<int32_t>(*(input->dims), input, multipliers, output);
+      return kTfLiteOk;
+    case kTfLiteInt64:
+      Tile<int64_t>(*(input->dims), input, multipliers, output);
+      return kTfLiteOk;
+    default:
+      context->ReportError(context, "Type is currently not supported by Tile.");
+      return kTfLiteError;
+  }
+}
+
+}  // namespace tile
+TfLiteRegistration* Register_TILE() {
+  static TfLiteRegistration r = {nullptr, nullptr, tile::Prepare, tile::Eval};
+  return &r;  // Same function-local static instance on every call.
+}
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/tile_test.cc b/tensorflow/contrib/lite/kernels/tile_test.cc
new file mode 100644
index 0000000000..a134a75d56
--- /dev/null
+++ b/tensorflow/contrib/lite/kernels/tile_test.cc
@@ -0,0 +1,256 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+class TileOpModel : public SingleOpModel {
+ public:
+  TileOpModel(std::initializer_list<int> input_shape, TensorType input_type,
+              TensorType multiply_type) {  // NOTE(review): never used.
+    input_ = AddInput(input_type);
+    multipliers_ = AddInput(TensorType_INT32);  // Always int32.
+    output_ = AddOutput(input_type);
+    SetBuiltinOp(BuiltinOperator_TILE, BuiltinOptions_TileOptions, 0);
+    BuildInterpreter({input_shape, {static_cast<int>(input_shape.size())}});
+  }
+
+  void SetInputFloat(std::initializer_list<float> data) {
+    PopulateTensor<float>(input_, data);
+  }
+
+  void SetInputUInt8(std::initializer_list<uint8> data) {
+    PopulateTensor<uint8>(input_, data);
+  }
+
+  void SetInputInt32(std::initializer_list<int32> data) {
+    PopulateTensor<int32>(input_, data);
+  }
+
+  void SetInputInt64(std::initializer_list<int64_t> data) {
+    PopulateTensor<int64_t>(input_, data);
+  }
+
+  void SetMultipliers(std::initializer_list<int32> data) {
+    PopulateTensor<int32>(multipliers_, data);  // Written as int32 only.
+  }
+
+  std::vector<float> GetOutputFloat() { return ExtractVector<float>(output_); }
+
+  std::vector<uint8> GetOutputUInt8() { return ExtractVector<uint8>(output_); }
+
+  std::vector<int32> GetOutputInt32() { return ExtractVector<int32>(output_); }
+
+  std::vector<int64_t> GetOutputInt64() {
+    return ExtractVector<int64_t>(output_);
+  }
+
+  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+
+ protected:
+  int input_;        // Value returned by AddInput for the data tensor.
+  int multipliers_;  // Value returned by AddInput for the multipliers.
+  int output_;       // Value returned by AddOutput.
+};
+
+TEST(TileTest, Float32Vector) {
+  TileOpModel m({3}, TensorType_FLOAT32, TensorType_INT32);
+  m.SetInputFloat({1.f, 2.f, 3.f});
+  m.SetMultipliers({2});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputFloat(),
+              ElementsAreArray({1.f, 2.f, 3.f, 1.f, 2.f, 3.f}));
+}
+
+TEST(TileTest, Float32Matrix) {
+  TileOpModel m({2, 3}, TensorType_FLOAT32, TensorType_INT32);
+  m.SetInputFloat({
+      11.f,
+      12.f,
+      13.f,
+      21.f,
+      22.f,
+      23.f,
+  });
+  m.SetMultipliers({2, 1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputFloat(), ElementsAreArray({
+                                      11.f,
+                                      12.f,
+                                      13.f,
+                                      21.f,
+                                      22.f,
+                                      23.f,
+                                      11.f,
+                                      12.f,
+                                      13.f,
+                                      21.f,
+                                      22.f,
+                                      23.f,
+                                  }));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({4, 3}));
+}
+
+TEST(TileTest, Float32HighDimension) {
+  TileOpModel m({1, 2, 3}, TensorType_FLOAT32, TensorType_INT32);
+  m.SetInputFloat({
+      11.f,
+      12.f,
+      13.f,
+      21.f,
+      22.f,
+      23.f,
+  });
+  m.SetMultipliers({2, 3, 1});
+  m.Invoke();
+  EXPECT_THAT(
+      m.GetOutputFloat(),
+      ElementsAreArray({11.f, 12.f, 13.f, 21.f, 22.f, 23.f, 11.f, 12.f, 13.f,
+                        21.f, 22.f, 23.f, 11.f, 12.f, 13.f, 21.f, 22.f, 23.f,
+                        11.f, 12.f, 13.f, 21.f, 22.f, 23.f, 11.f, 12.f, 13.f,
+                        21.f, 22.f, 23.f, 11.f, 12.f, 13.f, 21.f, 22.f, 23.f}));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({2, 6, 3}));
+}
+
+TEST(TileTest, Uint8Matrix) {
+  TileOpModel m({2, 3}, TensorType_UINT8, TensorType_INT32);
+  m.SetInputUInt8({
+      11,
+      12,
+      13,
+      21,
+      22,
+      23,
+  });
+  m.SetMultipliers({2, 1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputUInt8(), ElementsAreArray({
+                                      11,
+                                      12,
+                                      13,
+                                      21,
+                                      22,
+                                      23,
+                                      11,
+                                      12,
+                                      13,
+                                      21,
+                                      22,
+                                      23,
+                                  }));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({4, 3}));
+}
+
+TEST(TileTest, Int32Matrix) {
+  TileOpModel m({2, 3}, TensorType_INT32, TensorType_INT32);
+  m.SetInputInt32({
+      11,
+      12,
+      13,
+      21,
+      22,
+      23,
+  });
+  m.SetMultipliers({2, 1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputInt32(), ElementsAreArray({
+                                      11,
+                                      12,
+                                      13,
+                                      21,
+                                      22,
+                                      23,
+                                      11,
+                                      12,
+                                      13,
+                                      21,
+                                      22,
+                                      23,
+                                  }));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({4, 3}));
+}
+
+TEST(TileTest, Int64Matrix) {
+  TileOpModel m({2, 3}, TensorType_INT64, TensorType_INT32);
+  m.SetInputInt64({
+      11,
+      12,
+      13,
+      21,
+      22,
+      23,
+  });
+  m.SetMultipliers({2, 1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputInt64(), ElementsAreArray({
+                                      11,
+                                      12,
+                                      13,
+                                      21,
+                                      22,
+                                      23,
+                                      11,
+                                      12,
+                                      13,
+                                      21,
+                                      22,
+                                      23,
+                                  }));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({4, 3}));
+}
+
+TEST(TileTest, Int64Matrix64Multipliers) {
+  TileOpModel m({2, 3}, TensorType_INT64, TensorType_INT64);
+  m.SetInputInt64({
+      11,
+      12,
+      13,
+      21,
+      22,
+      23,
+  });
+  m.SetMultipliers({2, 1});  // NOTE(review): stored as int32, not int64.
+  m.Invoke();
+  EXPECT_THAT(m.GetOutputInt64(), ElementsAreArray({
+                                      11,
+                                      12,
+                                      13,
+                                      21,
+                                      22,
+                                      23,
+                                      11,
+                                      12,
+                                      13,
+                                      21,
+                                      22,
+                                      23,
+                                  }));
+  EXPECT_THAT(m.GetOutputShape(), ElementsAreArray({4, 3}));
+}
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/model.cc b/tensorflow/contrib/lite/model.cc
index 6ac41a94bd..ca115a1c59 100644
--- a/tensorflow/contrib/lite/model.cc
+++ b/tensorflow/contrib/lite/model.cc
@@ -714,6 +714,10 @@ TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type,
       error_reporter->Report("DELEGATE op shouldn't exist in model.");
       return kTfLiteError;
     }
+    case BuiltinOperator_EXPAND_DIMS:
+    case BuiltinOperator_TILE: {
+      break;
+    }
   }
   return kTfLiteOk;
 }
diff --git a/tensorflow/contrib/lite/nnapi_delegate.cc b/tensorflow/contrib/lite/nnapi_delegate.cc
index fad08bbfe6..d27ab0c033 100644
--- a/tensorflow/contrib/lite/nnapi_delegate.cc
+++ b/tensorflow/contrib/lite/nnapi_delegate.cc
@@ -491,6 +491,8 @@ void AddOpsAndParams(tflite::Interpreter* interpreter,
       case tflite::BuiltinOperator_SLICE:
       case tflite::BuiltinOperator_SIN:
       case tflite::BuiltinOperator_TRANSPOSE_CONV:
+      case tflite::BuiltinOperator_TILE:
+      case tflite::BuiltinOperator_EXPAND_DIMS:
       case tflite::BuiltinOperator_SPARSE_TO_DENSE:
         FATAL("Op code %d is currently not delegated to NNAPI", builtin);
         nn_op_type = -1;  // set to invalid
diff --git a/tensorflow/contrib/lite/schema/schema.fbs b/tensorflow/contrib/lite/schema/schema.fbs
index 522eac25b3..7d76134e3d 100644
--- a/tensorflow/contrib/lite/schema/schema.fbs
+++ b/tensorflow/contrib/lite/schema/schema.fbs
@@ -146,6 +146,8 @@ enum BuiltinOperator : byte {
   SIN = 66,
   TRANSPOSE_CONV = 67,
   SPARSE_TO_DENSE = 68,
+  TILE = 69,
+  EXPAND_DIMS = 70,
 }
 
 // Options for the builtin operators.
@@ -200,6 +202,8 @@ union BuiltinOptions {
   SliceOptions,
   TransposeConvOptions,
   SparseToDenseOptions,
+  TileOptions,
+  ExpandDimsOptions,
 }
 
 enum Padding : byte { SAME, VALID }
@@ -421,6 +425,9 @@ table DequantizeOptions {
 table MaximumMinimumOptions {
 }
 
+table TileOptions {
+}
+
 table ArgMaxOptions {
   output_type : TensorType;
 }
@@ -452,6 +459,9 @@ table TransposeConvOptions {
   stride_h:int;
 }
 
+table ExpandDimsOptions {
+}
+
 table SparseToDenseOptions {
   validate_indices:bool;
 }
diff --git a/tensorflow/contrib/lite/schema/schema_generated.h b/tensorflow/contrib/lite/schema/schema_generated.h
index 746dd26796..0a60fcd3d0 100755
--- a/tensorflow/contrib/lite/schema/schema_generated.h
+++ b/tensorflow/contrib/lite/schema/schema_generated.h
@@ -151,6 +151,9 @@ struct DequantizeOptionsT;
 struct MaximumMinimumOptions;
 struct MaximumMinimumOptionsT;
 
+struct TileOptions;
+struct TileOptionsT;
+
 struct ArgMaxOptions;
 struct ArgMaxOptionsT;
 
@@ -178,6 +181,9 @@ struct SliceOptionsT;
 struct TransposeConvOptions;
 struct TransposeConvOptionsT;
 
+struct ExpandDimsOptions;
+struct ExpandDimsOptionsT;
+
 struct SparseToDenseOptions;
 struct SparseToDenseOptionsT;
 
@@ -309,11 +315,13 @@ enum BuiltinOperator {
   BuiltinOperator_SIN = 66,
   BuiltinOperator_TRANSPOSE_CONV = 67,
   BuiltinOperator_SPARSE_TO_DENSE = 68,
+  BuiltinOperator_TILE = 69,
+  BuiltinOperator_EXPAND_DIMS = 70,
   BuiltinOperator_MIN = BuiltinOperator_ADD,
-  BuiltinOperator_MAX = BuiltinOperator_SPARSE_TO_DENSE
+  BuiltinOperator_MAX = BuiltinOperator_EXPAND_DIMS
 };
 
-inline BuiltinOperator (&EnumValuesBuiltinOperator())[68] {
+inline BuiltinOperator (&EnumValuesBuiltinOperator())[70] {
   static BuiltinOperator values[] = {
     BuiltinOperator_ADD,
     BuiltinOperator_AVERAGE_POOL_2D,
@@ -382,7 +390,9 @@ inline BuiltinOperator (&EnumValuesBuiltinOperator())[68] {
     BuiltinOperator_SLICE,
     BuiltinOperator_SIN,
     BuiltinOperator_TRANSPOSE_CONV,
-    BuiltinOperator_SPARSE_TO_DENSE
+    BuiltinOperator_SPARSE_TO_DENSE,
+    BuiltinOperator_TILE,
+    BuiltinOperator_EXPAND_DIMS
   };
   return values;
 }
@@ -458,6 +468,8 @@ inline const char **EnumNamesBuiltinOperator() {
     "SIN",
     "TRANSPOSE_CONV",
     "SPARSE_TO_DENSE",
+    "TILE",
+    "EXPAND_DIMS",
     nullptr
   };
   return names;
@@ -520,11 +532,13 @@ enum BuiltinOptions {
   BuiltinOptions_SliceOptions = 48,
   BuiltinOptions_TransposeConvOptions = 49,
   BuiltinOptions_SparseToDenseOptions = 50,
+  BuiltinOptions_TileOptions = 51,
+  BuiltinOptions_ExpandDimsOptions = 52,
   BuiltinOptions_MIN = BuiltinOptions_NONE,
-  BuiltinOptions_MAX = BuiltinOptions_SparseToDenseOptions
+  BuiltinOptions_MAX = BuiltinOptions_ExpandDimsOptions
 };
 
-inline BuiltinOptions (&EnumValuesBuiltinOptions())[51] {
+inline BuiltinOptions (&EnumValuesBuiltinOptions())[53] {
   static BuiltinOptions values[] = {
     BuiltinOptions_NONE,
     BuiltinOptions_Conv2DOptions,
@@ -576,7 +590,9 @@ inline BuiltinOptions (&EnumValuesBuiltinOptions())[51] {
     BuiltinOptions_SelectOptions,
     BuiltinOptions_SliceOptions,
     BuiltinOptions_TransposeConvOptions,
-    BuiltinOptions_SparseToDenseOptions
+    BuiltinOptions_SparseToDenseOptions,
+    BuiltinOptions_TileOptions,
+    BuiltinOptions_ExpandDimsOptions
   };
   return values;
 }
@@ -634,6 +650,8 @@ inline const char **EnumNamesBuiltinOptions() {
     "SliceOptions",
     "TransposeConvOptions",
     "SparseToDenseOptions",
+    "TileOptions",
+    "ExpandDimsOptions",
     nullptr
   };
   return names;
@@ -848,6 +866,14 @@ template<> struct BuiltinOptionsTraits<SparseToDenseOptions> {
   static const BuiltinOptions enum_value = BuiltinOptions_SparseToDenseOptions;
 };
 
+template<> struct BuiltinOptionsTraits<TileOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_TileOptions;
+};
+
+template<> struct BuiltinOptionsTraits<ExpandDimsOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_ExpandDimsOptions;
+};
+
 struct BuiltinOptionsUnion {
   BuiltinOptions type;
   void *value;
@@ -1279,6 +1305,22 @@ struct BuiltinOptionsUnion {
     return type == BuiltinOptions_SparseToDenseOptions ?
       reinterpret_cast<const SparseToDenseOptionsT *>(value) : nullptr;
   }
+  TileOptionsT *AsTileOptions() {
+    return type == BuiltinOptions_TileOptions ?
+      reinterpret_cast<TileOptionsT *>(value) : nullptr;
+  }
+  const TileOptionsT *AsTileOptions() const {
+    return type == BuiltinOptions_TileOptions ?
+      reinterpret_cast<const TileOptionsT *>(value) : nullptr;
+  }
+  ExpandDimsOptionsT *AsExpandDimsOptions() {
+    return type == BuiltinOptions_ExpandDimsOptions ?
+      reinterpret_cast<ExpandDimsOptionsT *>(value) : nullptr;
+  }
+  const ExpandDimsOptionsT *AsExpandDimsOptions() const {
+    return type == BuiltinOptions_ExpandDimsOptions ?
+      reinterpret_cast<const ExpandDimsOptionsT *>(value) : nullptr;
+  }
 };
 
 bool VerifyBuiltinOptions(flatbuffers::Verifier &verifier, const void *obj, BuiltinOptions type);
@@ -4152,6 +4194,46 @@ inline flatbuffers::Offset<MaximumMinimumOptions> CreateMaximumMinimumOptions(
 
 flatbuffers::Offset<MaximumMinimumOptions> CreateMaximumMinimumOptions(flatbuffers::FlatBufferBuilder &_fbb, const MaximumMinimumOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
+struct TileOptionsT : public flatbuffers::NativeTable {
+  typedef TileOptions TableType;
+  TileOptionsT() {
+  }
+};
+
+struct TileOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef TileOptionsT NativeTableType;
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           verifier.EndTable();
+  }
+  TileOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(TileOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<TileOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const TileOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct TileOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  explicit TileOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+        : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  TileOptionsBuilder &operator=(const TileOptionsBuilder &);
+  flatbuffers::Offset<TileOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<TileOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<TileOptions> CreateTileOptions(
+    flatbuffers::FlatBufferBuilder &_fbb) {
+  TileOptionsBuilder builder_(_fbb);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<TileOptions> CreateTileOptions(flatbuffers::FlatBufferBuilder &_fbb, const TileOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
 struct ArgMaxOptionsT : public flatbuffers::NativeTable {
   typedef ArgMaxOptions TableType;
   TensorType output_type;
@@ -4564,6 +4646,46 @@ inline flatbuffers::Offset<TransposeConvOptions> CreateTransposeConvOptions(
 
 flatbuffers::Offset<TransposeConvOptions> CreateTransposeConvOptions(flatbuffers::FlatBufferBuilder &_fbb, const TransposeConvOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
+struct ExpandDimsOptionsT : public flatbuffers::NativeTable {
+  typedef ExpandDimsOptions TableType;
+  ExpandDimsOptionsT() {
+  }
+};
+
+struct ExpandDimsOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef ExpandDimsOptionsT NativeTableType;
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           verifier.EndTable();
+  }
+  ExpandDimsOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(ExpandDimsOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<ExpandDimsOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const ExpandDimsOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct ExpandDimsOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  explicit ExpandDimsOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+        : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  ExpandDimsOptionsBuilder &operator=(const ExpandDimsOptionsBuilder &);
+  flatbuffers::Offset<ExpandDimsOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<ExpandDimsOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<ExpandDimsOptions> CreateExpandDimsOptions(
+    flatbuffers::FlatBufferBuilder &_fbb) {
+  ExpandDimsOptionsBuilder builder_(_fbb);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<ExpandDimsOptions> CreateExpandDimsOptions(flatbuffers::FlatBufferBuilder &_fbb, const ExpandDimsOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
 struct SparseToDenseOptionsT : public flatbuffers::NativeTable {
   typedef SparseToDenseOptions TableType;
   bool validate_indices;
@@ -4899,6 +5021,12 @@ struct Operator FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
   const SparseToDenseOptions *builtin_options_as_SparseToDenseOptions() const {
     return builtin_options_type() == BuiltinOptions_SparseToDenseOptions ? static_cast<const SparseToDenseOptions *>(builtin_options()) : nullptr;
   }
+  const TileOptions *builtin_options_as_TileOptions() const {
+    return builtin_options_type() == BuiltinOptions_TileOptions ? static_cast<const TileOptions *>(builtin_options()) : nullptr;
+  }
+  const ExpandDimsOptions *builtin_options_as_ExpandDimsOptions() const {
+    return builtin_options_type() == BuiltinOptions_ExpandDimsOptions ? static_cast<const ExpandDimsOptions *>(builtin_options()) : nullptr;
+  }
   const flatbuffers::Vector<uint8_t> *custom_options() const {
     return GetPointer<const flatbuffers::Vector<uint8_t> *>(VT_CUSTOM_OPTIONS);
   }
@@ -5125,6 +5253,14 @@ template<> inline const SparseToDenseOptions *Operator::builtin_options_as<Spars
   return builtin_options_as_SparseToDenseOptions();
 }
 
+template<> inline const TileOptions *Operator::builtin_options_as<TileOptions>() const {
+  return builtin_options_as_TileOptions();
+}
+
+template<> inline const ExpandDimsOptions *Operator::builtin_options_as<ExpandDimsOptions>() const {
+  return builtin_options_as_ExpandDimsOptions();
+}
+
 struct OperatorBuilder {
   flatbuffers::FlatBufferBuilder &fbb_;
   flatbuffers::uoffset_t start_;
@@ -6725,6 +6861,29 @@ inline flatbuffers::Offset<MaximumMinimumOptions> CreateMaximumMinimumOptions(fl
       _fbb);
 }
 
+inline TileOptionsT *TileOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new TileOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void TileOptions::UnPackTo(TileOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+}
+
+inline flatbuffers::Offset<TileOptions> TileOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const TileOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateTileOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<TileOptions> CreateTileOptions(flatbuffers::FlatBufferBuilder &_fbb, const TileOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const TileOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  return tflite::CreateTileOptions(
+      _fbb);
+}
+
 inline ArgMaxOptionsT *ArgMaxOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
   auto _o = new ArgMaxOptionsT();
   UnPackTo(_o, _resolver);
@@ -6944,6 +7103,29 @@ inline flatbuffers::Offset<TransposeConvOptions> CreateTransposeConvOptions(flat
       _stride_h);
 }
 
+inline ExpandDimsOptionsT *ExpandDimsOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new ExpandDimsOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void ExpandDimsOptions::UnPackTo(ExpandDimsOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+}
+
+inline flatbuffers::Offset<ExpandDimsOptions> ExpandDimsOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const ExpandDimsOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateExpandDimsOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<ExpandDimsOptions> CreateExpandDimsOptions(flatbuffers::FlatBufferBuilder &_fbb, const ExpandDimsOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const ExpandDimsOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  return tflite::CreateExpandDimsOptions(
+      _fbb);
+}
+
 inline SparseToDenseOptionsT *SparseToDenseOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
   auto _o = new SparseToDenseOptionsT();
   UnPackTo(_o, _resolver);
@@ -7356,6 +7538,14 @@ inline bool VerifyBuiltinOptions(flatbuffers::Verifier &verifier, const void *ob
       auto ptr = reinterpret_cast<const SparseToDenseOptions *>(obj);
       return verifier.VerifyTable(ptr);
     }
+    case BuiltinOptions_TileOptions: {
+      auto ptr = reinterpret_cast<const TileOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case BuiltinOptions_ExpandDimsOptions: {
+      auto ptr = reinterpret_cast<const ExpandDimsOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
     default: return false;
   }
 }
@@ -7574,6 +7764,14 @@ inline void *BuiltinOptionsUnion::UnPack(const void *obj, BuiltinOptions type, c
       auto ptr = reinterpret_cast<const SparseToDenseOptions *>(obj);
       return ptr->UnPack(resolver);
     }
+    case BuiltinOptions_TileOptions: {
+      auto ptr = reinterpret_cast<const TileOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    case BuiltinOptions_ExpandDimsOptions: {
+      auto ptr = reinterpret_cast<const ExpandDimsOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
     default: return nullptr;
   }
 }
@@ -7780,6 +7978,14 @@ inline flatbuffers::Offset<void> BuiltinOptionsUnion::Pack(flatbuffers::FlatBuff
       auto ptr = reinterpret_cast<const SparseToDenseOptionsT *>(value);
       return CreateSparseToDenseOptions(_fbb, ptr, _rehasher).Union();
     }
+    case BuiltinOptions_TileOptions: {
+      auto ptr = reinterpret_cast<const TileOptionsT *>(value);
+      return CreateTileOptions(_fbb, ptr, _rehasher).Union();
+    }
+    case BuiltinOptions_ExpandDimsOptions: {
+      auto ptr = reinterpret_cast<const ExpandDimsOptionsT *>(value);
+      return CreateExpandDimsOptions(_fbb, ptr, _rehasher).Union();
+    }
     default: return 0;
   }
 }
@@ -7986,6 +8192,14 @@ inline BuiltinOptionsUnion::BuiltinOptionsUnion(const BuiltinOptionsUnion &u) FL
       value = new SparseToDenseOptionsT(*reinterpret_cast<SparseToDenseOptionsT *>(u.value));
       break;
     }
+    case BuiltinOptions_TileOptions: {
+      value = new TileOptionsT(*reinterpret_cast<TileOptionsT *>(u.value));
+      break;
+    }
+    case BuiltinOptions_ExpandDimsOptions: {
+      value = new ExpandDimsOptionsT(*reinterpret_cast<ExpandDimsOptionsT *>(u.value));
+      break;
+    }
     default:
       break;
   }
@@ -8243,6 +8457,16 @@ inline void BuiltinOptionsUnion::Reset() {
       delete ptr;
       break;
     }
+    case BuiltinOptions_TileOptions: {
+      auto ptr = reinterpret_cast<TileOptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    case BuiltinOptions_ExpandDimsOptions: {
+      auto ptr = reinterpret_cast<ExpandDimsOptionsT *>(value);
+      delete ptr;
+      break;
+    }
     default: break;
   }
   value = nullptr;
diff --git a/tensorflow/contrib/lite/testing/generate_examples.py b/tensorflow/contrib/lite/testing/generate_examples.py
index 6a6d12ed67..f07e36fc7d 100644
--- a/tensorflow/contrib/lite/testing/generate_examples.py
+++ b/tensorflow/contrib/lite/testing/generate_examples.py
@@ -2517,6 +2517,72 @@ def make_transpose_conv_tests(zip_path):
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
 
+def make_tile_tests(zip_path):
+  """Make a set of tests to do tile."""
+  test_parameters = [{
+      "input_dtype": [tf.float32, tf.int32],
+      "input_shape": [[3, 2, 1], [2, 2, 2]],
+      "multiplier_dtype": [tf.int32, tf.int64],
+      "multiplier_shape": [[3]]
+  }]
+
+  def build_graph(parameters):
+    """Build the tile op testing graph."""
+    input_value = tf.placeholder(
+        dtype=parameters["input_dtype"],
+        shape=parameters["input_shape"],
+        name="input")
+    multiplier_value = tf.placeholder(
+        dtype=parameters["multiplier_dtype"],
+        shape=parameters["multiplier_shape"],
+        name="multiplier")
+    out = tf.tile(input_value, multiplier_value)
+    return [input_value, multiplier_value], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    input_value = create_tensor_data(parameters["input_dtype"],
+                                     parameters["input_shape"])
+    multipliers_value = create_tensor_data(parameters["multiplier_dtype"],
+                                           parameters["multiplier_shape"])
+    return [input_value, multipliers_value], sess.run(
+        outputs,
+        feed_dict={
+            inputs[0]: input_value,
+            inputs[1]: multipliers_value
+        })
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
+def make_expand_dims_tests(zip_path):
+  """Make a set of tests to do expand_dims."""
+
+  test_parameters = [{
+      "input_type": [tf.float32, tf.int32],
+      "input_shape": [[3, 4], [10, 10, 3]],
+      "axis_value": [0, 1, 2, -1, -2],
+  }]
+
+  def build_graph(parameters):
+    """Build the expand_dims op testing graph."""
+    input_value = tf.placeholder(
+        dtype=parameters["input_type"],
+        name="input",
+        shape=parameters["input_shape"])
+    axis_value = tf.placeholder(dtype=tf.int32, name="axis", shape=[1])
+    out = tf.expand_dims(input_value, axis=axis_value)
+    return [input_value, axis_value], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    input_value = create_tensor_data(parameters["input_type"],
+                                     parameters["input_shape"])
+    axis_value = np.array([parameters["axis_value"]], dtype=np.int32)
+    return [input_value, axis_value], sess.run(
+        outputs, feed_dict=dict(zip(inputs, [input_value, axis_value])))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
 def make_sparse_to_dense_tests(zip_path):
   """Make a set of tests to do sparse to dense."""
 
@@ -2578,6 +2644,7 @@ def make_sparse_to_dense_tests(zip_path):
 
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
+
 # Toco binary path provided by the generate rule.
 bin_path = None
 
diff --git a/tensorflow/contrib/lite/toco/tflite/operator.cc b/tensorflow/contrib/lite/toco/tflite/operator.cc
index 8f0f2e24db..84a5410839 100644
--- a/tensorflow/contrib/lite/toco/tflite/operator.cc
+++ b/tensorflow/contrib/lite/toco/tflite/operator.cc
@@ -507,6 +507,22 @@ class Pad : public BuiltinOperator<PadOperator, ::tflite::PadOptions,
   int GetVersion(const Operator& op) const override { return 1; }
 };
 
+class Tile
+    : public BuiltinOperator<TensorFlowTileOperator, ::tflite::TileOptions,
+                             ::tflite::BuiltinOptions_TileOptions> {
+  using BuiltinOperator::BuiltinOperator;
+
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    return ::tflite::CreateTileOptions(*builder);
+  }
+
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {}
+  int GetVersion(const Operator& op) const override { return 1; }
+};
+
 class PadV2 : public BuiltinOperator<PadV2Operator, ::tflite::PadV2Options,
                                      ::tflite::BuiltinOptions_PadV2Options> {
  public:
@@ -815,6 +831,24 @@ class SparseToDense
   int GetVersion(const Operator& op) const override { return 1; }
 };
 
+class ExpandDims
+    : public BuiltinOperator<ExpandDimsOperator, ::tflite::ExpandDimsOptions,
+                             ::tflite::BuiltinOptions_ExpandDimsOptions> {
+ public:
+  using BuiltinOperator::BuiltinOperator;
+
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    return ::tflite::CreateExpandDimsOptions(*builder);
+  }
+
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {}
+
+  int GetVersion(const Operator& op) const override { return 1; }
+};
+
 class TensorFlowUnsupported : public BaseOperator {
  public:
   using BaseOperator::BaseOperator;
@@ -997,6 +1031,10 @@ std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList() {
       new Cast(::tflite::BuiltinOperator_CAST, OperatorType::kCast));
   ops.emplace_back(
       new ArgMax(::tflite::BuiltinOperator_ARG_MAX, OperatorType::kArgMax));
+  ops.emplace_back(
+      new Tile(::tflite::BuiltinOperator_TILE, OperatorType::kTensorFlowTile));
+  ops.emplace_back(new ExpandDims(::tflite::BuiltinOperator_EXPAND_DIMS,
+                                  OperatorType::kExpandDims));
   ops.emplace_back(new TransposeConv(::tflite::BuiltinOperator_TRANSPOSE_CONV,
                                      OperatorType::kTransposeConv));
   ops.emplace_back(new SparseToDense(::tflite::BuiltinOperator_SPARSE_TO_DENSE,
-- 
GitLab


From 03d67b43d3e1432ab6490be75ef49e01c032ed06 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 1 Jun 2018 13:45:49 -0700
Subject: [PATCH 180/610] Add wrapper header file for
 SerialDeviceBatchScheduler

PiperOrigin-RevId: 198919964
---
 tensorflow/contrib/batching/BUILD             |  8 +++++++
 .../batching/serial_device_batch_scheduler.h  | 21 +++++++++++++++++++
 2 files changed, 29 insertions(+)
 create mode 100644 tensorflow/contrib/batching/serial_device_batch_scheduler.h

diff --git a/tensorflow/contrib/batching/BUILD b/tensorflow/contrib/batching/BUILD
index b6dae3cc1f..b27a19b16c 100644
--- a/tensorflow/contrib/batching/BUILD
+++ b/tensorflow/contrib/batching/BUILD
@@ -49,6 +49,14 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "serial_device_batch_scheduler",
+    hdrs = ["serial_device_batch_scheduler.h"],
+    deps = [
+        "//tensorflow/core/kernels/batching_util:serial_device_batch_scheduler",
+    ],
+)
+
 cc_library(
     name = "basic_batch_scheduler",
     hdrs = ["basic_batch_scheduler.h"],
diff --git a/tensorflow/contrib/batching/serial_device_batch_scheduler.h b/tensorflow/contrib/batching/serial_device_batch_scheduler.h
new file mode 100644
index 0000000000..bf6b708361
--- /dev/null
+++ b/tensorflow/contrib/batching/serial_device_batch_scheduler.h
@@ -0,0 +1,21 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CONTRIB_BATCHING_SERIAL_DEVICE_BATCH_SCHEDULER_H_
+#define TENSORFLOW_CONTRIB_BATCHING_SERIAL_DEVICE_BATCH_SCHEDULER_H_
+
+#include "tensorflow/core/kernels/batching_util/serial_device_batch_scheduler.h"
+
+#endif  // TENSORFLOW_CONTRIB_BATCHING_SERIAL_DEVICE_BATCH_SCHEDULER_H_
-- 
GitLab


From b2702807daa79e3d97a05fba01e846e128dae0a5 Mon Sep 17 00:00:00 2001
From: Richard Wei <rxwei@google.com>
Date: Fri, 1 Jun 2018 13:49:27 -0700
Subject: [PATCH 181/610] In the Swift API, deprecate `a.dot(b)` and `⊗` to
 `matmul(a, b)` to accurately reflect the operator's mathematical properties
 and make it familiar to TensorFlow users. Currently the deprecation is a
 warning - when we update tensorflow/swift-models, I'll start another CL to
 remove it completely.

Previously `dot` was chosen over `matmul` because of naming convention concerns (acronyms aren't common in Swift) and that we wanted to make it short (so full names like `a.matrixMultiplied(by: b)` aren't acceptable). Beyond these concerns, `matmul` is really a term of art and thus should be preferred.

The ⊗ operator often denotes outer product and Kronecker product. So it's removed, too.

PiperOrigin-RevId: 198920621
---
 tensorflow/docs_src/community/swift.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/docs_src/community/swift.md b/tensorflow/docs_src/community/swift.md
index d1625d3b93..070f9931e0 100644
--- a/tensorflow/docs_src/community/swift.md
+++ b/tensorflow/docs_src/community/swift.md
@@ -21,7 +21,7 @@ import TensorFlow
 var x = Tensor<Float>([[1, 2], [3, 4]])
 
 for i in 1...5 {
-  x += x ⊗ x
+  x += matmul(x, x)
 }
 
 print(x)
-- 
GitLab


From 829aad441d2a9a48e234cd7572d8ad9281034698 Mon Sep 17 00:00:00 2001
From: Sanjoy Das <sanjoy@google.com>
Date: Fri, 1 Jun 2018 13:58:11 -0700
Subject: [PATCH 182/610] [TF:XLA] Bump open source llvm revision to r333732

PiperOrigin-RevId: 198921960
---
 tensorflow/workspace.bzl | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 0672615d5e..e4b7f9a695 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -453,11 +453,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   tf_http_archive(
       name = "llvm",
       urls = [
-          "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/80f62ff390cc9440ef48ccac94ea6f7f51da3b93.tar.gz",
-          "https://github.com/llvm-mirror/llvm/archive/80f62ff390cc9440ef48ccac94ea6f7f51da3b93.tar.gz",
+          "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/48c1879dcedb834e95a95da8715b30897a49edbe.tar.gz",
+          "https://github.com/llvm-mirror/llvm/archive/48c1879dcedb834e95a95da8715b30897a49edbe.tar.gz",
       ],
-      sha256 = "119e7d9687a20103088677d5157cf70352392a423943de3cb549f6e4638edc59",
-      strip_prefix = "llvm-80f62ff390cc9440ef48ccac94ea6f7f51da3b93",
+      sha256 = "0e0767199c169f738718461d05d3fdada80b533a6e8e2e07c9ae852356be3c0a",
+      strip_prefix = "llvm-48c1879dcedb834e95a95da8715b30897a49edbe",
       build_file = clean_dep("//third_party/llvm:llvm.BUILD"),
   )
 
-- 
GitLab


From 37ab09a4697ebfda5ce9c8c296090e1d1ffefdda Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 1 Jun 2018 13:58:47 -0700
Subject: [PATCH 183/610] [xla] expose a ConvGeneralDilated op in the local
 Python client

PiperOrigin-RevId: 198922037
---
 tensorflow/compiler/xla/python/xla_client.py  | 55 +++++++++++++++++++
 .../compiler/xla/python/xla_client_test.py    | 40 ++++++++++++++
 2 files changed, 95 insertions(+)

diff --git a/tensorflow/compiler/xla/python/xla_client.py b/tensorflow/compiler/xla/python/xla_client.py
index 50b548afa5..6a4bae253b 100644
--- a/tensorflow/compiler/xla/python/xla_client.py
+++ b/tensorflow/compiler/xla/python/xla_client.py
@@ -1112,6 +1112,61 @@ class ComputationBuilder(object):
     dimension_numbers.output_spatial_dimensions.extend(range(2, 2 + nd))
     return dimension_numbers
 
+  def ConvGeneralDilated(self, lhs, rhs, window_strides, padding, lhs_dilation,
+                         rhs_dilation, dimension_numbers):
+    """Enqueues a ConvGeneralDilated operation onto the computation.
+
+    Args:
+      lhs: LocalOp for the rank N+2 array of inputs.
+      rhs: LocalOp for the rank N+2 array of kernel weights.
+      window_strides: length-N array-like of integer kernel strides.
+      padding: length-N array-like of pairs of integers of (low, high) padding.
+      lhs_dilation: length-N array-like of integer dilation factors.
+      rhs_dilation: length-N array-like of integer dilation factors.
+      dimension_numbers: either an xla_data_pb2.ConvolutionDimensionNumbers or a
+        triple (lhs_spec, rhs_spec, out_spec) where each element is a string of
+        length N+2 identifying by position (1) batch dimensions in lhs, rhs, and
+        the output with the character 'N', (2) feature dimensions in lhs and the
+        output with the character 'C', (3) input and output feature dimensions
+        in rhs with the characters 'I' and 'O' respectively, and (4) spatial
+        dimension correspondences between lhs, rhs, and the output using any
+        distinct characters. For example, to indicate dimension numbers
+        consistent with the Conv operation with two spatial dimensions, one
+        could use ('NCHW', 'OIHW', 'NCHW'). As another example, to indicate
+        dimension numbers consistent with the TensorFlow Conv2D operation, one
+        could use ('NHWC', 'HWIO', 'NHWC'). When using the latter form of
+        convolution dimension specification, window strides are associated with
+        spatial dimension character labels according to the order in which the
+        labels appear in the rhs_spec string, so that window_strides[0] is
+        matched with the dimension corresponding to the first character
+        appearing in rhs_spec that is not 'I' or 'O'.
+
+    Returns: a LocalOp representing the ConvGeneralDilated operation.
+    """
+    if not isinstance(dimension_numbers,
+                      xla_data_pb2.ConvolutionDimensionNumbers):
+      lhs_spec, rhs_spec, out_spec = dimension_numbers
+      dimension_numbers = xla_data_pb2.ConvolutionDimensionNumbers()
+
+      dimension_numbers.input_batch_dimension = lhs_spec.index('N')
+      dimension_numbers.input_feature_dimension = lhs_spec.index('C')
+      dimension_numbers.output_batch_dimension = out_spec.index('N')
+      dimension_numbers.output_feature_dimension = out_spec.index('C')
+      dimension_numbers.kernel_output_feature_dimension = rhs_spec.index('O')
+      dimension_numbers.kernel_input_feature_dimension = rhs_spec.index('I')
+
+      dimension_numbers.kernel_spatial_dimensions.extend(
+          i for i, c in enumerate(rhs_spec) if c not in {'I', 'O'})
+      dimension_numbers.input_spatial_dimensions.extend(
+          sorted((i for i, c in enumerate(lhs_spec) if c not in {'N', 'C'}),
+                 key=lambda i: rhs_spec.index(lhs_spec[i])))
+      dimension_numbers.output_spatial_dimensions.extend(
+          sorted((i for i, c in enumerate(out_spec) if c not in {'N', 'C'}),
+                 key=lambda i: rhs_spec.index(out_spec[i])))
+    return self._client.ConvGeneralDilated(lhs, rhs, window_strides, padding,
+                                           lhs_dilation, rhs_dilation,
+                                           dimension_numbers)
+
 
 def _forward_methods_to_local_builder():
   """Forward remaining ComputationBuilder methods to the C API.
diff --git a/tensorflow/compiler/xla/python/xla_client_test.py b/tensorflow/compiler/xla/python/xla_client_test.py
index e3d393bccc..375e720f9b 100644
--- a/tensorflow/compiler/xla/python/xla_client_test.py
+++ b/tensorflow/compiler/xla/python/xla_client_test.py
@@ -519,6 +519,46 @@ class SingleOpTest(LocalComputationTest):
                          [40., 50., 0.]]]])
     self._ExecuteAndCompareClose(c, expected=result)
 
+  def testConvGeneralDilatedF32(self):
+    c = self._NewComputation()
+    a = lambda *dims: np.arange(np.prod(dims)).reshape(dims).astype("float32")
+    lhs = a(1, 1, 2, 3)
+    rhs = a(1, 1, 1, 2) * 10
+    strides = [1, 1]
+    pads = [(1, 0), (0, 1)]
+    lhs_dilation = (2, 1)
+    rhs_dilation = (1, 1)
+    dimension_numbers = ("NCHW", "OIHW", "NCHW")
+    c.ConvGeneralDilated(c.Constant(lhs), c.Constant(rhs),
+                         strides, pads, lhs_dilation, rhs_dilation,
+                         dimension_numbers)
+    result = np.array([[[[0., 0., 0.],
+                         [10., 20., 0.],
+                         [0., 0., 0.],
+                         [40., 50., 0.]]]])
+    self._ExecuteAndCompareClose(c, expected=result)
+
+  def testConvGeneralDilatedPermutedF32(self):
+    c = self._NewComputation()
+    a = lambda *dims: np.arange(np.prod(dims)).reshape(dims).astype("float32")
+    lhs = a(1, 1, 2, 3)
+    rhs = a(1, 1, 1, 2) * 10
+    strides = [1, 1]
+    pads = [(1, 0), (0, 1)]
+    lhs_dilation = (2, 1)
+    rhs_dilation = (1, 1)
+
+    dimension_numbers = ("NHWC", "OIHW", "CWNH")
+    c.ConvGeneralDilated(c.Constant(np.transpose(lhs, (0, 2, 3, 1))),
+                         c.Constant(rhs),
+                         strides, pads, lhs_dilation, rhs_dilation,
+                         dimension_numbers)
+    result = np.array([[[[0., 0., 0.],
+                         [10., 20., 0.],
+                         [0., 0., 0.],
+                         [40., 50., 0.]]]])
+    self._ExecuteAndCompareClose(c, expected=np.transpose(result, (1, 3, 0, 2)))
+
   def testBooleanNot(self):
     c = self._NewComputation()
     arr = NumpyArrayBool([True, False, True])
-- 
GitLab


From d1a3c24745aaf54098b7de3069d65fa92002b221 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 1 Jun 2018 14:11:57 -0700
Subject: [PATCH 184/610] Optimized implementation of dilated convolution.
 Added a DilatedIm2Col() function to leverage GEMM optimizations.

PiperOrigin-RevId: 198924313
---
 .../internal/optimized/optimized_ops.h        | 187 ++++++++++--------
 .../contrib/lite/kernels/internal/types.h     |   8 +
 2 files changed, 116 insertions(+), 79 deletions(-)

diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
index f7011b28fd..0ce781db59 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
@@ -1776,6 +1776,100 @@ inline void ExtractPatchIntoBufferColumn(
   }
 }
 
+template <typename T>
+void DilatedIm2col(const T* input_data, const Dims<4>& input_dims,
+                   const Dims<4>& filter_dims, int stride_width,
+                   int stride_height, int dilation_width_factor,
+                   int dilation_height_factor, int pad_width, int pad_height,
+                   const Dims<4>& output_dims, uint8 byte_zero,
+                   T* im2col_data) {
+  // For dilated convolution, the input pixels are not contiguous therefore we
+  // can't use the same optimizations as Im2Col(). Though note this code would
+  // work fine for the non-dilated case too (though likely a bit slower).
+  gemmlowp::ScopedProfilingLabel label("DilatedIm2col");
+  TFLITE_DCHECK(dilation_width_factor != 1 || dilation_height_factor != 1);
+  TFLITE_DCHECK(IsPackedWithoutStrides(input_dims));
+  TFLITE_DCHECK(IsPackedWithoutStrides(filter_dims));
+  TFLITE_DCHECK(IsPackedWithoutStrides(output_dims));
+  TFLITE_DCHECK(im2col_data);
+  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+  const int input_height = ArraySize(input_dims, 2);
+  const int input_width = ArraySize(input_dims, 1);
+  const int input_depth = MatchingArraySize(input_dims, 0, filter_dims, 0);
+  const int filter_height = ArraySize(filter_dims, 2);
+  const int filter_width = ArraySize(filter_dims, 1);
+  const int output_height = ArraySize(output_dims, 2);
+  const int output_width = ArraySize(output_dims, 1);
+  MatchingArraySize(output_dims, 0, filter_dims, 3);
+
+  // Construct the MxN sized im2col matrix.
+  // The rows, M, are sub-ordered B x H x W
+  Dims<4> row_dims;
+  row_dims.sizes[0] = output_width;
+  row_dims.sizes[1] = output_height;
+  row_dims.sizes[2] = batches;
+  row_dims.sizes[3] = 1;
+  ComputeStrides(&row_dims);
+
+  // The columns, N, are sub-ordered Kh x Kw x Din
+  Dims<4> col_dims;
+  col_dims.sizes[0] = input_depth;
+  col_dims.sizes[1] = filter_width;
+  col_dims.sizes[2] = filter_height;
+  col_dims.sizes[3] = 1;
+  ComputeStrides(&col_dims);
+
+  // Use dimensions M and N to construct dims for indexing directly into im2col
+  Dims<4> im2col_dims;
+  im2col_dims.sizes[0] = col_dims.strides[3];
+  im2col_dims.sizes[1] = row_dims.strides[3];
+  im2col_dims.sizes[2] = 1;
+  im2col_dims.sizes[3] = 1;
+  ComputeStrides(&im2col_dims);
+
+  // Loop through the output rows (B x H x W)
+  for (int batch = 0; batch < batches; ++batch) {
+    for (int out_y = 0; out_y < output_height; ++out_y) {
+      for (int out_x = 0; out_x < output_width; ++out_x) {
+        // Each row is an output pixel. Arrange the input data into this row in
+        // an order we can conveniently multiply with the filter data.
+        int row_offset = Offset(row_dims, out_x, out_y, batch, 0);
+        const int in_x_origin = (out_x * stride_width) - pad_width;
+        const int in_y_origin = (out_y * stride_height) - pad_height;
+        // Loop through all the pixels of the filter (Kh x Kw)
+        for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
+          const int in_y = in_y_origin + dilation_height_factor * filter_y;
+          if ((in_y >= 0) && (in_y < input_height)) {
+            // Filter row is within the input data.
+            // Loop through all the filter pixels in this row.
+            for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
+              const int in_x = in_x_origin + dilation_width_factor * filter_x;
+              int col_offset = Offset(col_dims, 0, filter_x, filter_y, 0);
+              T* dst = im2col_data +
+                       Offset(im2col_dims, col_offset, row_offset, 0, 0);
+              if ((in_x >= 0) && (in_x < input_width)) {
+                // Filter pixel is within the input, copy the data.
+                T const* src =
+                    input_data + Offset(input_dims, 0, in_x, in_y, batch);
+                memcpy(dst, src, input_depth * sizeof(T));
+              } else {
+                // Filter pixel is outside the input, zero it out.
+                memset(dst, byte_zero, input_depth * sizeof(T));
+              }
+            }
+          } else {
+            // Filter row is outside the input, zero out the entire im2col row.
+            int col_offset = Offset(col_dims, 0, 0, filter_y, 0);
+            T* dst =
+                im2col_data + Offset(im2col_dims, col_offset, row_offset, 0, 0);
+            memset(dst, byte_zero, filter_width * input_depth * sizeof(T));
+          }
+        }
+      }
+    }
+  }
+}
+
 template <typename T>
 void Im2col(const T* input_data, const Dims<4>& input_dims, int stride_width,
             int stride_height, int pad_width, int pad_height, int kheight,
@@ -1816,74 +1910,6 @@ void Im2col(const T* input_data, const Dims<4>& input_dims, int stride,
          kwidth, byte_zero, output_data, output_dims);
 }
 
-inline void DilatedConv(const float* input_data, const Dims<4>& input_dims,
-                        const float* filter_data, const Dims<4>& filter_dims,
-                        const float* bias_data, const Dims<4>& bias_dims,
-                        int stride_width, int stride_height,
-                        int dilation_width_factor, int dilation_height_factor,
-                        int pad_width, int pad_height,
-                        float output_activation_min,
-                        float output_activation_max, float* output_data,
-                        const Dims<4>& output_dims, float* im2col_data,
-                        const Dims<4>& im2col_dims) {
-  gemmlowp::ScopedProfilingLabel label("DilatedConv");
-  // This is a copy of the reference Conv implementation. We do not currently
-  // have an optimized path for dilation.
-  (void)im2col_data;  // only used in optimized code.
-  (void)im2col_dims;  // only used in optimized code.
-  const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int input_depth = MatchingArraySize(input_dims, 0, filter_dims, 0);
-  const int output_depth = MatchingArraySize(filter_dims, 3, output_dims, 0);
-  if (bias_data) {
-    TFLITE_DCHECK_EQ(ArraySize(filter_dims, 3), ArraySize(bias_dims, 0));
-  }
-  const int input_height = ArraySize(input_dims, 2);
-  const int input_width = ArraySize(input_dims, 1);
-  const int filter_height = ArraySize(filter_dims, 2);
-  const int filter_width = ArraySize(filter_dims, 1);
-  const int output_height = ArraySize(output_dims, 2);
-  const int output_width = ArraySize(output_dims, 1);
-  for (int batch = 0; batch < batches; ++batch) {
-    for (int out_y = 0; out_y < output_height; ++out_y) {
-      for (int out_x = 0; out_x < output_width; ++out_x) {
-        for (int out_channel = 0; out_channel < output_depth; ++out_channel) {
-          const int in_x_origin = (out_x * stride_width) - pad_width;
-          const int in_y_origin = (out_y * stride_height) - pad_height;
-          float total = 0.f;
-          for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
-            for (int filter_x = 0; filter_x < filter_width; ++filter_x) {
-              for (int in_channel = 0; in_channel < input_depth; ++in_channel) {
-                const int in_x = in_x_origin + dilation_width_factor * filter_x;
-                const int in_y =
-                    in_y_origin + dilation_height_factor * filter_y;
-                // If the location is outside the bounds of the input image,
-                // use zero as a default value.
-                if ((in_x >= 0) && (in_x < input_width) && (in_y >= 0) &&
-                    (in_y < input_height)) {
-                  float input_value = input_data[Offset(input_dims, in_channel,
-                                                        in_x, in_y, batch)];
-                  float filter_value =
-                      filter_data[Offset(filter_dims, in_channel, filter_x,
-                                         filter_y, out_channel)];
-                  total += (input_value * filter_value);
-                }
-              }
-            }
-          }
-          float bias_value = 0.0f;
-          if (bias_data) {
-            bias_value = bias_data[Offset(bias_dims, out_channel, 0, 0, 0)];
-          }
-          output_data[Offset(output_dims, out_channel, out_x, out_y, batch)] =
-              ActivationFunctionWithMinMax(total + bias_value,
-                                           output_activation_min,
-                                           output_activation_max);
-        }
-      }
-    }
-  }
-}
-
 inline void Conv(const float* input_data, const Dims<4>& input_dims,
                  const float* filter_data, const Dims<4>& filter_dims,
                  const float* bias_data, const Dims<4>& bias_dims,
@@ -1892,29 +1918,32 @@ inline void Conv(const float* input_data, const Dims<4>& input_dims,
                  float output_activation_min, float output_activation_max,
                  float* output_data, const Dims<4>& output_dims,
                  float* im2col_data, const Dims<4>& im2col_dims) {
-  if ((dilation_width_factor != 1) || (dilation_height_factor != 1)) {
-    return DilatedConv(input_data, input_dims, filter_data, filter_dims,
-                       bias_data, bias_dims, stride_width, stride_height,
-                       dilation_width_factor, dilation_height_factor, pad_width,
-                       pad_height, output_activation_min, output_activation_max,
-                       output_data, output_dims, im2col_data, im2col_dims);
-  }
-
   (void)im2col_data;
   (void)im2col_dims;
   gemmlowp::ScopedProfilingLabel label("Conv");
 
+  // A float set to 0x00000000h == 0.0f
+  const uint8 float_zero_byte = 0x00;
   const float* gemm_input_data = nullptr;
   const Dims<4>* gemm_input_dims = nullptr;
   const int filter_width = ArraySize(filter_dims, 1);
   const int filter_height = ArraySize(filter_dims, 2);
+  const bool need_dilated_im2col =
+      dilation_width_factor != 1 || dilation_height_factor != 1;
   const bool need_im2col = stride_width != 1 || stride_height != 1 ||
                            filter_width != 1 || filter_height != 1;
-  if (need_im2col) {
+  if (need_dilated_im2col) {
+    DilatedIm2col(input_data, input_dims, filter_dims, stride_width,
+                  stride_height, dilation_width_factor, dilation_height_factor,
+                  pad_width, pad_height, output_dims, float_zero_byte,
+                  im2col_data);
+    gemm_input_data = im2col_data;
+    gemm_input_dims = &im2col_dims;
+  } else if (need_im2col) {
     TFLITE_DCHECK(im2col_data);
     Im2col(input_data, input_dims, stride_width, stride_height, pad_width,
-           pad_height, filter_height, filter_width, 0, im2col_data,
-           im2col_dims);
+           pad_height, filter_height, filter_width, float_zero_byte,
+           im2col_data, im2col_dims);
     gemm_input_data = im2col_data;
     gemm_input_dims = &im2col_dims;
   } else {
diff --git a/tensorflow/contrib/lite/kernels/internal/types.h b/tensorflow/contrib/lite/kernels/internal/types.h
index fc8ed753c5..0c7fb7a76a 100644
--- a/tensorflow/contrib/lite/kernels/internal/types.h
+++ b/tensorflow/contrib/lite/kernels/internal/types.h
@@ -358,6 +358,14 @@ bool IsPackedWithoutStrides(const Dims<N>& dims) {
   return true;
 }
 
+template <int N>
+void ComputeStrides(Dims<N>* dims) {
+  dims->strides[0] = 1;
+  for (int d = 1; d < N; d++) {
+    dims->strides[d] = dims->strides[d - 1] * dims->sizes[d - 1];
+  }
+}
+
 }  // namespace tflite
 
 #endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_TYPES_H_
-- 
GitLab


From 5ab4e1346dba1d5bb820452883c1561d144759f7 Mon Sep 17 00:00:00 2001
From: Amit Patankar <amitpatankar@google.com>
Date: Fri, 1 Jun 2018 14:19:03 -0700
Subject: [PATCH 185/610] Updating release notes for r1.9.

---
 RELEASE.md | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 57 insertions(+)

diff --git a/RELEASE.md b/RELEASE.md
index 84d9d52868..600294478d 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -1,3 +1,60 @@
+# Release 1.9.0
+
+## Major Features And Improvements
+* Update tf.keras to the Keras 2.1.6 API.
+* `tfe.Network` is deprecated. Please inherit from `tf.keras.Model`.
+* Adding support of core feature columns and losses to gradient boosted trees estimators.
+* The Bijector API now requires `event_ndims` to be passed in to the `log_det_jacobian` methods, while `event_ndims` is removed from the base class and replaced with `forward_min_event_ndims`. The signature is now `log_det_jacobian(x, event_ndims)`. The main rationale for this change is that it allows Bijectors to broadcast.
+* If you were using layers from `tf.keras.layers` in conjunction with custom variable scopes, your layer variable names might have changed. If you were using layers from `tf.layers` in a subclassed `tf.keras.Model` class, then your variable names have changed (you can restore the prior names by importing the same layers from `tf.keras.layers` instead of `tf.layers`).
+
+## Breaking Changes
+  * If you're opening empty variable scopes, replace `variable_scope('', ...)` with `variable_scope(tf.get_variable_scope(), ...)`.
+
+## Bug Fixes and Other Changes
+* `tf.data`:
+  * The `DatasetBase::DebugString()` method is now `const`.
+  * Added the `tf.contrib.data.sample_from_datasets()` API for randomly sampling from multiple datasets.
+* Eager Execution:
+* `tf.keras`:
+  * Move Keras code out of _impl folder and remove API files.
+  * `tf.keras.Model.save_weights` now saves in TensorFlow format by default.
+  * Enable dataset iterators to be passed to `tf.keras.Model` training/eval methods.
+* Accelerated Linear Algebra (XLA):
+* TensorFlow Debugger (tfdbg) CLI:
+* `tf.contrib`:
+  * Add `tf.contrib.data.choose_from_datasets()`.
+  * `tf.contrib.data.make_csv_dataset()` now supports line breaks in quoted strings. Two arguments were removed from `make_csv_dataset`.
+  * `tf.contrib.framework.zero_initializer` supports ResourceVariable.
+  * Adding "constrained_optimization" to tensorflow/contrib.
+* Other:
+  * Add GCS Configuration Ops.
+  * Changing signature of `MakeIterator` to enable propagating error status.
+  * KL divergence for two Dirichlet distributions.
+  * More consistent GcsFileSystem behavior for certain reads past EOF.
+  * Update benchmark for tf.scan to match ranges across eager and graph modes.
+  * Fixed bug in `tf.reduce_prod` gradient for complex dtypes.
+  * Add optional `args` argument to `Dataset.from_generator()`.
+  * Allow the use of '.' in variables (e.g. "hparams.parse('a.b=1.0')"), which would previously raise an error. This will correspond to an attribute name with an embedded '.' symbol (e.g. 'a.b'), which can only be accessed indirectly (e.g. through getattr and setattr).  To set this up the user will first need to explicitly add the variable to the hparam object (e.g. "hparams.add_hparam(name='a.b', value=0.0)").
+  * Benchmark for tf.scan in graph and eager modes.
+  * Added complex128 support to FFT, FFT2D, FFT3D, IFFT, IFFT2D, and IFFT3D.
+  * Making ids unique in `nn.embedding_lookup_sparse`. This helps to reduce RPC calls for looking up the embeddings when there are repeated ids in the batch.
+  * Support indicator column in boosted trees.
+  * Prevent `tf.gradients()` from backpropagating through integer tensors.
+  * LinearOperator[1D,2D,3D]Circulant added to `tensorflow.linalg`.
+  * Conv3D, Conv3DBackpropInput, Conv3DBackpropFilter now supports arbitrary.
+  * Added `tf.train.Checkpoint` for reading/writing object-based checkpoints.
+  * `Dataset.list_files()` now produces deterministic results when `shuffle=False` or a `seed` is passed.
+  * Added LinearOperatorKronecker, a dense-free implementation of the Kronecker Product.
+  * Allow LinearOperator to broadcast.
+  * SavedModelBuilder will now deduplicate asset names that point to files with the same basename and the same contents. Note that this may result in new asset files included in SavedModels in cases where assets with the same name but different contents were previously overwriting each other.
+
+
+## Thanks to our Contributors
+
+This release contains contributions from many people at Google, as well as:
+
+Abdullah Alrasheed, Achal Shah, Ad-530, ADiegoCAlonso, Aditya Yogi, Ag Ramesh, akindyakov, Andy Kernahan, Anya Petrova, Aurelien Geron, Ben, Ben Barsdell, Bhavani-Subramanian, braincodercn, Brett Koonce, Brian Nemsick, Brian Zier, Bryan Heden, candy.dc, cclauss, Clayne Robison, ctiijima, Dalmo Cirne, David Norman, David T.H. Kao, DosLin, ekelsen, Elson Rodriguez, Erik Smistad, Felix Abecassis, Fergal Cotter, fo40225, foo0x29a, "Freedom" Koan-Sin Tan, Frédéric Branchaud-Charron, gdh1995, Geoffrey Irving, Giuseppe, gracehoney, Guido Zuidhof, Guillaume Klein, Guozhong Zhuang, Haggai, Harald Husum, imsheridan, Ivan Zhang, Jan Zikes, Jayaram Bobba, Jesse Benson, Jesse Gumz, Jiajia Li, Jie, jinghuangintel, Jingwen, jjsjann123, Joe Yearsley, Joel Hestness, Joel Shor, josephyearsley, Junpeng Lao, Karol M. Langner, Kb Sriram, krantideep95, Krish Ravindranath, Letian Feng, Loo Rong Jie, Lukas Geiger, Maciej, Mahmoud Abuzaina, ManHyuk, Mark Ryan, mbhuiyan, Michal Turek, Mostafa Alaa, Myungsung Kwak, Nand Dalal, Nehal J Wani, Neil Tenenholtz, ngc92, Nicholas Nadeau, P.Eng., Avs, Niranjan Hasabnis, P-Hidringer, Paul Van Eck, Peng Yu, Qing Zhao, Qingying Chen, Quanlong, Rajendra Arora, Rholais Lii, rmanyari, Robin Richtsfeld, Russell Klopfer, Sagi, Sam Sendelbach, Sandeep N Gupta, Sandip Giri, Sarah Edkins, Scott Tseng, Sdalbsoo, Sergii Khomenko, Seungwoo Choi (Biggie), Seyed Majid Azimi, Shaoning Zeng, shengfuintel, Siu Kei, Muk, Smit Shilu, soonson, Stefan Schweter, Sukhwan Kim, Sunitha Kambhampati, Taehoon Lee, tamimaddari82, Tang, Wenyi, Ted Chang, u2takey, Utkarsh Upadhyay, Vadim Markovtsev, voegtlel, Wai Hon Law, wangsiyu, Wenhao Hu, wenhao.hu, William D. Irons, Yan Facai (颜发才), Yanbo Liang, Yihong Wang, Yilei (Dolee) Yang, Yong Tang, Yuan (Terry) Tang
+
 # Release 1.8.0
 
 ## Major Features And Improvements
-- 
GitLab


From 672bd9fd8c446eb2c69e4b0f13ed9b74d0a5956f Mon Sep 17 00:00:00 2001
From: Amit Patankar <amitpatankar@google.com>
Date: Fri, 1 Jun 2018 14:26:07 -0700
Subject: [PATCH 186/610] Updating version for 1.9.0-rc0.

---
 tensorflow/core/public/version.h              |  4 ++--
 tensorflow/docs_src/get_started/eager.md      |  2 +-
 tensorflow/docs_src/install/install_c.md      |  2 +-
 tensorflow/docs_src/install/install_go.md     |  2 +-
 tensorflow/docs_src/install/install_java.md   | 22 +++++++++----------
 tensorflow/docs_src/install/install_linux.md  | 18 +++++++--------
 tensorflow/docs_src/install/install_mac.md    | 10 ++++-----
 .../docs_src/install/install_sources.md       |  9 ++++++--
 tensorflow/tools/docker/Dockerfile.devel      |  2 +-
 .../tools/docker/Dockerfile.devel-cpu-mkl     |  2 +-
 tensorflow/tools/docker/Dockerfile.devel-gpu  |  2 +-
 tensorflow/tools/pip_package/setup.py         |  2 +-
 12 files changed, 41 insertions(+), 36 deletions(-)

diff --git a/tensorflow/core/public/version.h b/tensorflow/core/public/version.h
index 522a9d84fd..cb1fd09dbb 100644
--- a/tensorflow/core/public/version.h
+++ b/tensorflow/core/public/version.h
@@ -19,12 +19,12 @@ limitations under the License.
 // TensorFlow uses semantic versioning, see http://semver.org/.
 
 #define TF_MAJOR_VERSION 1
-#define TF_MINOR_VERSION 8
+#define TF_MINOR_VERSION 9
 #define TF_PATCH_VERSION 0
 
 // TF_VERSION_SUFFIX is non-empty for pre-releases (e.g. "-alpha", "-alpha.1",
 // "-beta", "-rc", "-rc.1")
-#define TF_VERSION_SUFFIX ""
+#define TF_VERSION_SUFFIX "-rc0"
 
 #define TF_STR_HELPER(x) #x
 #define TF_STR(x) TF_STR_HELPER(x)
diff --git a/tensorflow/docs_src/get_started/eager.md b/tensorflow/docs_src/get_started/eager.md
index f08ac74425..bbb25e20c6 100644
--- a/tensorflow/docs_src/get_started/eager.md
+++ b/tensorflow/docs_src/get_started/eager.md
@@ -1,3 +1,3 @@
 # Get Started with Eager Execution
 
-[Colab notebook](https://colab.research.google.com/github/tensorflow/models/blob/r1.8.0/samples/core/get_started/eager.ipynb)
+[Colab notebook](https://colab.research.google.com/github/tensorflow/models/blob/r1.9.0/samples/core/get_started/eager.ipynb)
diff --git a/tensorflow/docs_src/install/install_c.md b/tensorflow/docs_src/install/install_c.md
index 1abd840ab3..2901848745 100644
--- a/tensorflow/docs_src/install/install_c.md
+++ b/tensorflow/docs_src/install/install_c.md
@@ -38,7 +38,7 @@ enable TensorFlow for C:
          OS="linux" # Change to "darwin" for macOS
          TARGET_DIRECTORY="/usr/local"
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.8.0.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-${OS}-x86_64-1.9.0-rc0.tar.gz" |
            sudo tar -C $TARGET_DIRECTORY -xz
 
      The `tar` command extracts the TensorFlow C library into the `lib`
diff --git a/tensorflow/docs_src/install/install_go.md b/tensorflow/docs_src/install/install_go.md
index 52a2a3f8a6..55bc0f64e7 100644
--- a/tensorflow/docs_src/install/install_go.md
+++ b/tensorflow/docs_src/install/install_go.md
@@ -38,7 +38,7 @@ steps to install this library and enable TensorFlow for Go:
          TF_TYPE="cpu" # Change to "gpu" for GPU support
          TARGET_DIRECTORY='/usr/local'
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.8.0.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-${TF_TYPE}-$(go env GOOS)-x86_64-1.9.0-rc0.tar.gz" |
          sudo tar -C $TARGET_DIRECTORY -xz
 
      The `tar` command extracts the TensorFlow C library into the `lib`
diff --git a/tensorflow/docs_src/install/install_java.md b/tensorflow/docs_src/install/install_java.md
index 1256fb99c4..b3b739212e 100644
--- a/tensorflow/docs_src/install/install_java.md
+++ b/tensorflow/docs_src/install/install_java.md
@@ -36,7 +36,7 @@ following to the project's `pom.xml` to use the TensorFlow Java APIs:
 <dependency>
   <groupId>org.tensorflow</groupId>
   <artifactId>tensorflow</artifactId>
-  <version>1.8.0</version>
+  <version>1.9.0-rc0</version>
 </dependency>
 ```
 
@@ -65,7 +65,7 @@ As an example, these steps will create a Maven project that uses TensorFlow:
                <dependency>
                  <groupId>org.tensorflow</groupId>
                  <artifactId>tensorflow</artifactId>
-                 <version>1.8.0</version>
+                 <version>1.9.0-rc0</version>
                </dependency>
              </dependencies>
          </project>
@@ -124,12 +124,12 @@ instead:
 <dependency>
   <groupId>org.tensorflow</groupId>
   <artifactId>libtensorflow</artifactId>
-  <version>1.8.0</version>
+  <version>1.9.0-rc0</version>
 </dependency>
 <dependency>
   <groupId>org.tensorflow</groupId>
   <artifactId>libtensorflow_jni_gpu</artifactId>
-  <version>1.8.0</version>
+  <version>1.9.0-rc0</version>
 </dependency>
 ```
 
@@ -148,7 +148,7 @@ refer to the simpler instructions above instead.
 Take the following steps to install TensorFlow for Java on Linux or macOS:
 
   1. Download
-     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.8.0.jar),
+     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.9.0-rc0.jar),
      which is the TensorFlow Java Archive (JAR).
 
   2. Decide whether you will run TensorFlow for Java on CPU(s) only or with
@@ -167,7 +167,7 @@ Take the following steps to install TensorFlow for Java on Linux or macOS:
          OS=$(uname -s | tr '[:upper:]' '[:lower:]')
          mkdir -p ./jni
          curl -L \
-           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.8.0.tar.gz" |
+           "https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-${TF_TYPE}-${OS}-x86_64-1.9.0-rc0.tar.gz" |
            tar -xz -C ./jni
 
 ### Install on Windows
@@ -175,10 +175,10 @@ Take the following steps to install TensorFlow for Java on Linux or macOS:
 Take the following steps to install TensorFlow for Java on Windows:
 
   1. Download
-     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.8.0.jar),
+     [libtensorflow.jar](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow-1.9.0-rc0.jar),
      which is the TensorFlow Java Archive (JAR).
   2. Download the following Java Native Interface (JNI) file appropriate for
-     [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.8.0.zip).
+     [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.9.0-rc0.zip).
   3. Extract this .zip file.
 
 
@@ -227,7 +227,7 @@ must be part of your `classpath`. For example, you can include the
 downloaded `.jar` in your `classpath` by using the `-cp` compilation flag
 as follows:
 
-<pre><b>javac -cp libtensorflow-1.8.0.jar HelloTF.java</b></pre>
+<pre><b>javac -cp libtensorflow-1.9.0-rc0.jar HelloTF.java</b></pre>
 
 
 ### Running
@@ -241,11 +241,11 @@ two files are available to the JVM:
 For example, the following command line executes the `HelloTF` program on Linux
 and macOS X:
 
-<pre><b>java -cp libtensorflow-1.8.0.jar:. -Djava.library.path=./jni HelloTF</b></pre>
+<pre><b>java -cp libtensorflow-1.9.0-rc0.jar:. -Djava.library.path=./jni HelloTF</b></pre>
 
 And the following command line executes the `HelloTF` program on Windows:
 
-<pre><b>java -cp libtensorflow-1.8.0.jar;. -Djava.library.path=jni HelloTF</b></pre>
+<pre><b>java -cp libtensorflow-1.9.0-rc0.jar;. -Djava.library.path=jni HelloTF</b></pre>
 
 If the program prints <tt>Hello from <i>version</i></tt>, you've successfully
 installed TensorFlow for Java and are ready to use the API.  If the program
diff --git a/tensorflow/docs_src/install/install_linux.md b/tensorflow/docs_src/install/install_linux.md
index 3b9381625f..2ecab808c4 100644
--- a/tensorflow/docs_src/install/install_linux.md
+++ b/tensorflow/docs_src/install/install_linux.md
@@ -438,7 +438,7 @@ Take the following steps to install TensorFlow in an Anaconda environment:
 
      <pre>
      (tensorflow)$ <b>pip install --ignore-installed --upgrade \
-     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0-cp34-cp34m-linux_x86_64.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.9.0rc0-cp34-cp34m-linux_x86_64.whl</b></pre>
 
 <a name="ValidateYourInstallation"></a>
 ## Validate your installation
@@ -684,14 +684,14 @@ This section documents the relevant values for Linux installations.
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0-cp27-none-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.9.0rc0-cp27-none-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0-cp27-none-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.9.0rc0-cp27-none-linux_x86_64.whl
 </pre>
 
 Note that GPU support requires the NVIDIA hardware and software described in
@@ -703,14 +703,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0-cp34-cp34m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.9.0rc0-cp34-cp34m-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0-cp34-cp34m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.9.0rc0-cp34-cp34m-linux_x86_64.whl
 </pre>
 
 Note that GPU support requires the NVIDIA hardware and software described in
@@ -722,14 +722,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0-cp35-cp35m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.9.0rc0-cp35-cp35m-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0-cp35-cp35m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.9.0rc0-cp35-cp35m-linux_x86_64.whl
 </pre>
 
 
@@ -741,14 +741,14 @@ Note that GPU support requires the NVIDIA hardware and software described in
 CPU only:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.8.0-cp36-cp36m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/cpu/tensorflow-1.9.0rc0-cp36-cp36m-linux_x86_64.whl
 </pre>
 
 
 GPU support:
 
 <pre>
-https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.8.0-cp36-cp36m-linux_x86_64.whl
+https://storage.googleapis.com/tensorflow/linux/gpu/tensorflow_gpu-1.9.0rc0-cp36-cp36m-linux_x86_64.whl
 </pre>
 
 
diff --git a/tensorflow/docs_src/install/install_mac.md b/tensorflow/docs_src/install/install_mac.md
index 29a867a9e3..9d01271c5a 100644
--- a/tensorflow/docs_src/install/install_mac.md
+++ b/tensorflow/docs_src/install/install_mac.md
@@ -119,7 +119,7 @@ Take the following steps to install TensorFlow with Virtualenv:
      TensorFlow in the active Virtualenv is as follows:
 
      <pre> $ <b>pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0-py3-none-any.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.9.0rc0-py3-none-any.whl</b></pre>
 
 If you encounter installation problems, see
 [Common Installation Problems](#common-installation-problems).
@@ -242,7 +242,7 @@ take the following steps:
      issue the following command:
 
      <pre> $ <b>sudo pip3 install --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0-py3-none-any.whl</b> </pre>
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.9.0rc0-py3-none-any.whl</b> </pre>
 
      If the preceding command fails, see
      [installation problems](#common-installation-problems).
@@ -350,7 +350,7 @@ Take the following steps to install TensorFlow in an Anaconda environment:
      TensorFlow for Python 2.7:
 
      <pre> (<i>targetDirectory</i>)$ <b>pip install --ignore-installed --upgrade \
-     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0-py2-none-any.whl</b></pre>
+     https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.9.0rc0-py2-none-any.whl</b></pre>
 
 
 <a name="ValidateYourInstallation"></a>
@@ -522,7 +522,7 @@ The value you specify depends on your Python version.
 
 
 <pre>
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0-py2-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.9.0rc0-py2-none-any.whl
 </pre>
 
 
@@ -530,5 +530,5 @@ https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0-py2-none-any.
 
 
 <pre>
-https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.8.0-py3-none-any.whl
+https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-1.9.0rc0-py3-none-any.whl
 </pre>
diff --git a/tensorflow/docs_src/install/install_sources.md b/tensorflow/docs_src/install/install_sources.md
index 5ba522b436..d25e641cee 100644
--- a/tensorflow/docs_src/install/install_sources.md
+++ b/tensorflow/docs_src/install/install_sources.md
@@ -328,10 +328,10 @@ Invoke `pip install` to install that pip package.
 The filename of the `.whl` file depends on your platform.
 For example, the following command will install the pip package
 
-for TensorFlow 1.8.0 on Linux:
+for TensorFlow 1.9.0rc0 on Linux:
 
 <pre>
-$ <b>sudo pip install /tmp/tensorflow_pkg/tensorflow-1.8.0-py2-none-any.whl</b>
+$ <b>sudo pip install /tmp/tensorflow_pkg/tensorflow-1.9.0rc0-py2-none-any.whl</b>
 </pre>
 
 ## Validate your installation
@@ -433,6 +433,8 @@ Stack Overflow and specify the `tensorflow` tag.
 **Linux**
 <table>
 <tr><th>Version:</th><th>CPU/GPU:</th><th>Python Version:</th><th>Compiler:</th><th>Build Tools:</th><th>cuDNN:</th><th>CUDA:</th></tr>
+<tr><td>tensorflow-1.9.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.11.0</td><td>N/A</td><td>N/A</td></tr>
+<tr><td>tensorflow_gpu-1.9.0</td><td>GPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.11.0</td><td>7</td><td>9</td></tr>
 <tr><td>tensorflow-1.8.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.10.0</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow_gpu-1.8.0</td><td>GPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.9.0</td><td>7</td><td>9</td></tr>
 <tr><td>tensorflow-1.7.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>GCC 4.8</td><td>Bazel 0.10.0</td><td>N/A</td><td>N/A</td></tr>
@@ -456,6 +458,7 @@ Stack Overflow and specify the `tensorflow` tag.
 **Mac**
 <table>
 <tr><th>Version:</th><th>CPU/GPU:</th><th>Python Version:</th><th>Compiler:</th><th>Build Tools:</th><th>cuDNN:</th><th>CUDA:</th></tr>
+<tr><td>tensorflow-1.9.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.11.0</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow-1.8.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.10.1</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow-1.7.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.10.1</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow-1.6.0</td><td>CPU</td><td>2.7, 3.3-3.6</td><td>Clang from xcode</td><td>Bazel 0.8.1</td><td>N/A</td><td>N/A</td></tr>
@@ -472,6 +475,8 @@ Stack Overflow and specify the `tensorflow` tag.
 **Windows**
 <table>
 <tr><th>Version:</th><th>CPU/GPU:</th><th>Python Version:</th><th>Compiler:</th><th>Build Tools:</th><th>cuDNN:</th><th>CUDA:</th></tr>
+<tr><td>tensorflow-1.9.0</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
+<tr><td>tensorflow_gpu-1.9.0</td><td>GPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>7</td><td>9</td></tr>
 <tr><td>tensorflow-1.8.0</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
 <tr><td>tensorflow_gpu-1.8.0</td><td>GPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>7</td><td>9</td></tr>
 <tr><td>tensorflow-1.7.0</td><td>CPU</td><td>3.5-3.6</td><td>MSVC 2015 update 3</td><td>Cmake v3.6.3</td><td>N/A</td><td>N/A</td></tr>
diff --git a/tensorflow/tools/docker/Dockerfile.devel b/tensorflow/tools/docker/Dockerfile.devel
index 406d134699..57a491255e 100644
--- a/tensorflow/tools/docker/Dockerfile.devel
+++ b/tensorflow/tools/docker/Dockerfile.devel
@@ -76,7 +76,7 @@ RUN mkdir /bazel && \
 
 # Download and build TensorFlow.
 WORKDIR /tensorflow
-RUN git clone --branch=r1.8 --depth=1 https://github.com/tensorflow/tensorflow.git .
+RUN git clone --branch=r1.9 --depth=1 https://github.com/tensorflow/tensorflow.git .
 
 # TODO(craigcitro): Don't install the pip package, since it makes it
 # more difficult to experiment with local changes. Instead, just add
diff --git a/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl b/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl
index a6cd44ced1..6796ad70e5 100644
--- a/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl
+++ b/tensorflow/tools/docker/Dockerfile.devel-cpu-mkl
@@ -3,7 +3,7 @@ FROM tensorflow/tensorflow:latest-devel
 LABEL maintainer="Clayne Robison<clayne.b.robison@intel.com>"
 
 # These arguments are parameterized. Use --build-args to override.
-ARG TF_BRANCH=r1.8
+ARG TF_BRANCH=r1.9
 ARG WHL_DIR=/whl
 
 RUN apt-get update && apt-get install -y --no-install-recommends \
diff --git a/tensorflow/tools/docker/Dockerfile.devel-gpu b/tensorflow/tools/docker/Dockerfile.devel-gpu
index e4dcce9cdd..204b5b4dba 100644
--- a/tensorflow/tools/docker/Dockerfile.devel-gpu
+++ b/tensorflow/tools/docker/Dockerfile.devel-gpu
@@ -85,7 +85,7 @@ RUN mkdir /bazel && \
 
 # Download and build TensorFlow.
 WORKDIR /tensorflow
-RUN git clone --branch=r1.8 --depth=1 https://github.com/tensorflow/tensorflow.git .
+RUN git clone --branch=r1.9 --depth=1 https://github.com/tensorflow/tensorflow.git .
 
 # Configure the build for our CUDA configuration.
 ENV CI_BUILD_PYTHON python
diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py
index d25a9e77b1..78d955c637 100644
--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@@ -45,7 +45,7 @@ DOCLINES = __doc__.split('\n')
 # This version string is semver compatible, but incompatible with pip.
 # For pip, we will remove all '-' characters from this string, and use the
 # result for pip.
-_VERSION = '1.8.0'
+_VERSION = '1.9.0-rc0'
 
 REQUIRED_PACKAGES = [
     'absl-py >= 0.1.6',
-- 
GitLab


From 441979ff0399418b7883ca6c267c08fc716ce74b Mon Sep 17 00:00:00 2001
From: Roy Frostig <frostig@google.com>
Date: Fri, 1 Jun 2018 14:56:17 -0700
Subject: [PATCH 187/610] [XLA] Add an unoptimized HLO output flag to
 ExecutableBuildOptions and to the XLA local Python client.

PiperOrigin-RevId: 198930874
---
 .../compiler/xla/client/executable_build_options.cc  | 12 ++++++++++++
 .../compiler/xla/client/executable_build_options.h   |  8 ++++++++
 .../compiler/xla/python/local_computation_builder.i  |  5 +++++
 tensorflow/compiler/xla/python/xla_client.py         |  1 +
 tensorflow/compiler/xla/service/local_service.cc     |  5 +++++
 5 files changed, 31 insertions(+)

diff --git a/tensorflow/compiler/xla/client/executable_build_options.cc b/tensorflow/compiler/xla/client/executable_build_options.cc
index 6e3c5cb484..7dee41f6a0 100644
--- a/tensorflow/compiler/xla/client/executable_build_options.cc
+++ b/tensorflow/compiler/xla/client/executable_build_options.cc
@@ -87,6 +87,18 @@ ExecutableBuildOptions::dump_optimized_hlo_proto_to() const {
   return dump_optimized_hlo_proto_to_;
 }
 
+ExecutableBuildOptions&
+ExecutableBuildOptions::set_dump_unoptimized_hlo_proto_to(
+    tensorflow::StringPiece dirpath) {
+  dump_unoptimized_hlo_proto_to_ = dirpath.ToString();
+  return *this;
+}
+
+const tensorflow::gtl::optional<string>&
+ExecutableBuildOptions::dump_unoptimized_hlo_proto_to() const {
+  return dump_unoptimized_hlo_proto_to_;
+}
+
 ExecutableBuildOptions& ExecutableBuildOptions::set_dump_per_pass_hlo_proto_to(
     tensorflow::StringPiece dirpath) {
   dump_per_pass_hlo_proto_to_ = dirpath.ToString();
diff --git a/tensorflow/compiler/xla/client/executable_build_options.h b/tensorflow/compiler/xla/client/executable_build_options.h
index 393da381fb..9dc9be4423 100644
--- a/tensorflow/compiler/xla/client/executable_build_options.h
+++ b/tensorflow/compiler/xla/client/executable_build_options.h
@@ -65,6 +65,13 @@ class ExecutableBuildOptions {
       tensorflow::StringPiece dirpath);
   const tensorflow::gtl::optional<string>& dump_optimized_hlo_proto_to() const;
 
+  // If set, specifies a dirpath to dump the start-of-optimization-pipeline HLO
+  // protobuf to (as in DebugOptions).
+  ExecutableBuildOptions& set_dump_unoptimized_hlo_proto_to(
+      tensorflow::StringPiece dirpath);
+  const tensorflow::gtl::optional<string>& dump_unoptimized_hlo_proto_to()
+      const;
+
   // If set, specifies a dirpath to dump the per-pass-in-pipeline HLO protobufs
   // to (as in DebugOptions).
   ExecutableBuildOptions& set_dump_per_pass_hlo_proto_to(
@@ -95,6 +102,7 @@ class ExecutableBuildOptions {
   bool result_layout_set_ = false;
   tensorflow::gtl::optional<string> generate_hlo_graph_;
   tensorflow::gtl::optional<string> dump_optimized_hlo_proto_to_;
+  tensorflow::gtl::optional<string> dump_unoptimized_hlo_proto_to_;
   tensorflow::gtl::optional<string> dump_per_pass_hlo_proto_to_;
   DeviceMemoryAllocator* device_allocator_ = nullptr;
   std::vector<std::string> disabled_hlo_passes_;
diff --git a/tensorflow/compiler/xla/python/local_computation_builder.i b/tensorflow/compiler/xla/python/local_computation_builder.i
index 51412ca474..536b93c6f9 100644
--- a/tensorflow/compiler/xla/python/local_computation_builder.i
+++ b/tensorflow/compiler/xla/python/local_computation_builder.i
@@ -851,6 +851,11 @@ tensorflow::ImportNumpy();
     })) {
       return nullptr;
     }
+    if (!HandleStringAttribute($input, "dump_unoptimized_hlo_proto_to", [&](string s) {
+      build_options.set_dump_unoptimized_hlo_proto_to(std::move(s));
+    })) {
+      return nullptr;
+    }
     if (!HandleStringAttribute($input, "dump_per_pass_hlo_proto_to", [&](string s) {
       build_options.set_dump_per_pass_hlo_proto_to(std::move(s));
     })) {
diff --git a/tensorflow/compiler/xla/python/xla_client.py b/tensorflow/compiler/xla/python/xla_client.py
index 6a4bae253b..11611ac612 100644
--- a/tensorflow/compiler/xla/python/xla_client.py
+++ b/tensorflow/compiler/xla/python/xla_client.py
@@ -353,6 +353,7 @@ class CompileOptions(object):
   def __init__(self):
     self.generate_hlo_graph = None
     self.dump_optimized_hlo_proto_to = None
+    self.dump_unoptimized_hlo_proto_to = None
     self.dump_per_pass_hlo_proto_to = None
     self.hlo_profile = False
 
diff --git a/tensorflow/compiler/xla/service/local_service.cc b/tensorflow/compiler/xla/service/local_service.cc
index 375c4a6780..1d9c9e0678 100644
--- a/tensorflow/compiler/xla/service/local_service.cc
+++ b/tensorflow/compiler/xla/service/local_service.cc
@@ -108,6 +108,11 @@ ExecutionOptions CreateExecutionOptions(
         ->set_xla_dump_optimized_hlo_proto_to(
             build_options.dump_optimized_hlo_proto_to().value());
   }
+  if (build_options.dump_unoptimized_hlo_proto_to().has_value()) {
+    execution_options.mutable_debug_options()
+        ->set_xla_dump_unoptimized_hlo_proto_to(
+            build_options.dump_unoptimized_hlo_proto_to().value());
+  }
   if (build_options.dump_per_pass_hlo_proto_to().has_value()) {
     execution_options.mutable_debug_options()
         ->set_xla_dump_per_pass_hlo_proto_to(
-- 
GitLab


From af1d59aff9bf3b43dfff4d99e50d22f527201e76 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 1 Jun 2018 15:29:06 -0700
Subject: [PATCH 188/610] DepthwiseConv Optimizations

PiperOrigin-RevId: 198935499
---
 .../depthwiseconv_uint8_3x3_filter.h          | 920 +++++++++++++++++-
 1 file changed, 891 insertions(+), 29 deletions(-)

diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h
index 8cd72239e9..a7b0d805a3 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h
@@ -42,6 +42,7 @@ struct DepthwiseConvParams {
   int64_t input_row_size;
   int64_t output_depth;
   int64_t output_row_size;
+  int64_t filter_row_size;
   int32 input_offset;
   int32 output_offset;
   int32 filter_offset;
@@ -51,6 +52,8 @@ struct DepthwiseConvParams {
   int32 output_shift;
   int32 input_width;
   int32 input_height;
+  int32 stride_width;
+  int32 stride_height;
   int32 output_width;
   int32 output_height;
 };
@@ -65,17 +68,20 @@ struct DepthwiseConvParams {
 #define OFFSET_INPUT_ROW_SIZE 8
 #define OFFSET_OUTPUT_DEPTH 16
 #define OFFSET_OUTPUT_ROW_SIZE 24
-#define OFFSET_INPUT_OFFSET 32
-#define OFFSET_OUTPUT_OFFSET 36
-#define OFFSET_FILTER_OFFSET 40
-#define OFFSET_OUTPUT_MULTIPLIER 44
-#define OFFSET_OUTPUT_ACTIVATION_MIN 48
-#define OFFSET_OUTPUT_ACTIVATION_MAX 52
-#define OFFSET_OUTPUT_SHIFT 56
-#define OFFSET_INPUT_WIDTH 60
-#define OFFSET_INPUT_HEIGHT 64
-#define OFFSET_OUTPUT_WIDTH 68
-#define OFFSET_OUTPUT_HEIGHT 72
+#define OFFSET_FILTER_ROW_SIZE 32
+#define OFFSET_INPUT_OFFSET 40
+#define OFFSET_OUTPUT_OFFSET 44
+#define OFFSET_FILTER_OFFSET 48
+#define OFFSET_OUTPUT_MULTIPLIER 52
+#define OFFSET_OUTPUT_ACTIVATION_MIN 56
+#define OFFSET_OUTPUT_ACTIVATION_MAX 60
+#define OFFSET_OUTPUT_SHIFT 64
+#define OFFSET_INPUT_WIDTH 68
+#define OFFSET_INPUT_HEIGHT 72
+#define OFFSET_STRIDE_WIDTH 76
+#define OFFSET_STRIDE_HEIGHT 80
+#define OFFSET_OUTPUT_WIDTH 84
+#define OFFSET_OUTPUT_HEIGHT 88
 
 static_assert(offsetof(DepthwiseConvParams, input_depth) ==
                   OFFSET_INPUT_DEPTH, "");
@@ -85,6 +91,8 @@ static_assert(offsetof(DepthwiseConvParams, output_depth) ==
                   OFFSET_OUTPUT_DEPTH, "");
 static_assert(offsetof(DepthwiseConvParams, output_row_size) ==
                   OFFSET_OUTPUT_ROW_SIZE, "");
+static_assert(offsetof(DepthwiseConvParams, filter_row_size) ==
+                  OFFSET_FILTER_ROW_SIZE, "");
 static_assert(offsetof(DepthwiseConvParams, input_offset) ==
                   OFFSET_INPUT_OFFSET, "");
 static_assert(offsetof(DepthwiseConvParams, output_offset) ==
@@ -103,6 +111,10 @@ static_assert(offsetof(DepthwiseConvParams, input_width) ==
                   OFFSET_INPUT_WIDTH, "");
 static_assert(offsetof(DepthwiseConvParams, input_height) ==
                   OFFSET_INPUT_HEIGHT, "");
+static_assert(offsetof(DepthwiseConvParams, stride_width) ==
+                  OFFSET_STRIDE_WIDTH, "");
+static_assert(offsetof(DepthwiseConvParams, stride_height) ==
+                  OFFSET_STRIDE_HEIGHT, "");
 static_assert(offsetof(DepthwiseConvParams, output_width) ==
                   OFFSET_OUTPUT_WIDTH, "");
 static_assert(offsetof(DepthwiseConvParams, output_height) ==
@@ -114,7 +126,7 @@ struct DepthwiseConvWindow {};
 template <>
 struct DepthwiseConvWindow<8, 1, 1> {
  public:
-  static void Run(const uint8* input_ptr, const uint8* filter_ptr,
+  static inline void Run(const uint8* input_ptr, const uint8* filter_ptr,
                   const int32* bias_ptr, uint8* output_ptr, int64_t input_depth,
                   int64_t input_row_size, int32 output_window_height,
                   int32 output_window_width,
@@ -1097,7 +1109,7 @@ struct DepthwiseConvWindow<8, 1, 1> {
 
 template <>
 struct DepthwiseConvWindow<8, 2, 2> {
-  static void Run(const uint8* input_ptr, const uint8* filter_ptr,
+  static inline void Run(const uint8* input_ptr, const uint8* filter_ptr,
                   const int32* bias_ptr, uint8* output_ptr, int64_t input_depth,
                   int64_t input_row_size, int32 output_window_height,
                   int32 output_window_width,
@@ -2179,6 +2191,715 @@ struct DepthwiseConvWindow<8, 2, 2> {
   }
 };
 
+enum class EdgeType { kCorner, kHorizontal, kVertical, kCenter };
+// Partial-window kernel; the primary template is empty, see specializations.
+template <EdgeType kEdgeType, int kPadWidth, int kPadHeight>
+struct DepthwiseConvPartial {};
+
+template <>
+struct DepthwiseConvPartial<EdgeType::kCenter, 1, 1> {
+  static inline void Run(const uint8* input_ptr, const uint8* filter_ptr,
+                         const int32* bias_ptr, uint8* output_ptr,
+                         const DepthwiseConvParams* params_ptr) {
+#define DEPTHWISECONV_LABEL_DEPTH_8_LOOP "1"
+#define DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP "2"
+    asm volatile(
+        // Performs depthwise convolutions for an input window of size 1x1 and
+        // padding of 1 across the full depth. Expects |input_ptr| and
+        // |filter_ptr| to be pointing to the 1x1 input and filter values.
+        "ld1 {v8.8b}, [%[input_ptr]], #8\n"
+        "ldr w9, [%[params_ptr], #" STR(OFFSET_INPUT_OFFSET) "]\n"
+        "ldr x11, [%[params_ptr], #" STR(OFFSET_OUTPUT_DEPTH) "]\n"
+        "ldr w10, [%[params_ptr], #" STR(OFFSET_OUTPUT_MULTIPLIER) "]\n"
+        "dup v26.8h, w9\n"
+        "ldr w9, [%[params_ptr], #" STR(OFFSET_OUTPUT_OFFSET) "]\n"
+        "dup v27.4s, w10\n"
+        "ld1 {v0.8b}, [%[filter_ptr]], #8\n"
+        "cmp x11, #16\n"
+        "ldr w10, [%[params_ptr], #" STR(OFFSET_OUTPUT_SHIFT) "]\n"
+        "dup v28.4s, w9\n"
+        "ldr w9, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MIN) "]\n"
+        "neg w10, w10\n"
+        "dup v29.4s, w10\n"
+        "ldr w10, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MAX) "]\n"
+        "dup v30.4s, w9\n"
+        "ldr w9, [%[params_ptr], #" STR(OFFSET_FILTER_OFFSET) "]\n"
+        "dup v31.4s, w10\n"
+        "dup v25.8h, w9\n"
+        // Preload 8 biases; add offsets to input/filter, widening to 16 bit.
+        "ld1 {v16.4s}, [%[bias_ptr]], #16\n"
+        "uaddw v8.8h, v26.8h, v8.8b\n"
+        "ld1 {v17.4s}, [%[bias_ptr]], #16\n"
+        "uaddw v0.8h, v25.8h, v0.8b\n"
+        // Skip the loop if fewer than 16 channels remain (cmp x11, #16 above).
+        "blt " DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP "f\n"
+        // Main loop: processes and stores 8 channels per iteration while
+        // loading the next 8; it runs as long as at least 16 channels remain.
+        DEPTHWISECONV_LABEL_DEPTH_8_LOOP ":\n"
+          "smlal v16.4s, v0.4h, v8.4h\n"
+          "subs x11, x11, #8\n"
+          "smlal2 v17.4s, v0.8h, v8.8h\n"
+          "ld1 {v8.8b}, [%[input_ptr]], #8\n"
+          "cmp x11, #16\n"
+          "ld1 {v0.8b}, [%[filter_ptr]], #8\n"
+          // Requantize: multiply, rounding shift, offset, clamp, narrow.
+          "sqrdmulh v16.4s, v16.4s, v27.4s\n"
+          "sqrdmulh v17.4s, v17.4s, v27.4s\n"
+          "and v18.16b, v16.16b, v29.16b\n"
+          "and v19.16b, v17.16b, v29.16b\n"
+          "sshr v18.4s, v18.4s, #31\n"
+          "sshr v19.4s, v19.4s, #31\n"
+          "sqadd v16.4s, v16.4s, v18.4s\n"
+          "sqadd v17.4s, v17.4s, v19.4s\n"
+          "srshl v16.4s, v16.4s, v29.4s\n"
+          "srshl v17.4s, v17.4s, v29.4s\n"
+          "add v16.4s, v16.4s, v28.4s\n"
+          "add v17.4s, v17.4s, v28.4s\n"
+          "smax v16.4s, v16.4s, v30.4s\n"
+          "smax v17.4s, v17.4s, v30.4s\n"
+          "smin v16.4s, v16.4s, v31.4s\n"
+          "smin v17.4s, v17.4s, v31.4s\n"
+          "sqxtn v16.4h, v16.4s\n"
+          "sqxtn2 v16.8h, v17.4s\n"
+          "sqxtun v16.8b, v16.8h\n"
+          "st1 {v16.8b}, [%[output_ptr]], #8\n"
+          "uaddw v8.8h, v26.8h, v8.8b\n"
+          "ld1 {v16.4s}, [%[bias_ptr]], #16\n"
+          "uaddw v0.8h, v25.8h, v0.8b\n"
+          "ld1 {v17.4s}, [%[bias_ptr]], #16\n"
+          // Flags are still those of the cmp above; NEON ops leave NZCV alone.
+          "bge " DEPTHWISECONV_LABEL_DEPTH_8_LOOP "b\n"
+        // Last group of 8. NOTE(review): depth presumably a multiple of 8.
+        DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP ":\n"
+        "smlal v16.4s, v0.4h, v8.4h\n"
+        "smlal2 v17.4s, v0.8h, v8.8h\n"
+        // Requantize as in the loop; the final store does not post-increment.
+        "sqrdmulh v16.4s, v16.4s, v27.4s\n"
+        "sqrdmulh v17.4s, v17.4s, v27.4s\n"
+        "and v18.16b, v16.16b, v29.16b\n"
+        "and v19.16b, v17.16b, v29.16b\n"
+        "sshr v18.4s, v18.4s, #31\n"
+        "sshr v19.4s, v19.4s, #31\n"
+        "sqadd v16.4s, v16.4s, v18.4s\n"
+        "sqadd v17.4s, v17.4s, v19.4s\n"
+        "srshl v16.4s, v16.4s, v29.4s\n"
+        "srshl v17.4s, v17.4s, v29.4s\n"
+
+        "add v16.4s, v16.4s, v28.4s\n"
+        "add v17.4s, v17.4s, v28.4s\n"
+        "smax v16.4s, v16.4s, v30.4s\n"
+        "smax v17.4s, v17.4s, v30.4s\n"
+        "smin v16.4s, v16.4s, v31.4s\n"
+        "smin v17.4s, v17.4s, v31.4s\n"
+        "sqxtn v16.4h, v16.4s\n"
+        "sqxtn2 v16.8h, v17.4s\n"
+        "sqxtun v16.8b, v16.8h\n"
+        "st1 {v16.8b}, [%[output_ptr]]\n"
+        :
+        // Outputs.
+        [filter_ptr] "+r"(filter_ptr), [input_ptr] "+r"(input_ptr),
+        [output_ptr] "+r"(output_ptr), [bias_ptr] "+r"(bias_ptr)
+        :
+        // Inputs.
+        [params_ptr] "r"(params_ptr)
+        :
+        // Clobbers.
+        "cc", "memory",
+        // We use these NEON registers.
+        "v0", "v8", "v16", "v17", "v18", "v19", "v25", "v26", "v27", "v28",
+        "v29", "v30", "v31",
+        // We use these general-purpose registers.
+        "x9", "x10", "x11");
+#undef DEPTHWISECONV_LABEL_DEPTH_8_LOOP
+#undef DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP
+  }
+};
+
+template <>
+struct DepthwiseConvPartial<EdgeType::kCorner, 1, 1> {
+  static inline void Run(const uint8* input_ptr, const uint8* filter_ptr,
+                         const int32* bias_ptr, uint8* output_ptr,
+                         const DepthwiseConvParams* params_ptr) {
+#define DEPTHWISECONV_LABEL_DEPTH_8_LOOP "1"
+#define DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP "2"
+    asm volatile(
+        // Performs depthwise convolutions for an input window of size 2x2 and
+        // padding of 1 across the full depth. Expects |input_ptr| and
+        // |filter_ptr| to be pointing to the beginning of the 2x2 input and
+        // filter values.
+        // NOTE(review): output_depth (x15) is also the pixel stride - confirm.
+        // Load input and filter values.
+        "ldr x15, [%[params_ptr], #" STR(OFFSET_OUTPUT_DEPTH) "]\n"
+        "ldr x9, [%[params_ptr], #" STR(OFFSET_INPUT_ROW_SIZE) "]\n"
+        "cmp x15, #16\n"
+        "add x12, %[input_ptr], x15\n"
+        "add x13, %[input_ptr], x9\n"
+        "ld1 {v8.8b}, [%[input_ptr]], #8\n"
+        "add x14, x13, x15\n"
+        "ld1 {v9.8b}, [x12], #8\n"
+        "ldr x6, [%[params_ptr], #" STR(OFFSET_FILTER_ROW_SIZE) "]\n"
+        // x9/x10/x11 mirror x12/x13/x14 for filter pixels (0,1), (1,0), (1,1).
+        "add x9, %[filter_ptr], x15\n"
+        "ld1 {v10.8b}, [x13], #8\n"
+        "add x10, %[filter_ptr], x6\n"
+        "ld1 {v11.8b}, [x14], #8\n"
+        "ld1 {v0.8b}, [%[filter_ptr]], #8\n"
+        "add x11, x10, x15\n"
+        "ld1 {v1.8b}, [x9], #8\n"
+        "ld1 {v2.8b}, [x10], #8\n"
+        "ld1 {v3.8b}, [x11], #8\n"
+
+        // Load constants (w6/w7 are scratch; filter_row_size in x6 is done).
+        "ldr w6, [%[params_ptr], #" STR(OFFSET_INPUT_OFFSET) "]\n"
+        "ldr w7, [%[params_ptr], #" STR(OFFSET_OUTPUT_MULTIPLIER) "]\n"
+        "dup v26.8h, w6\n"
+        "ldr w6, [%[params_ptr], #" STR(OFFSET_OUTPUT_OFFSET) "]\n"
+        "dup v27.4s, w7\n"
+        "ldr w7, [%[params_ptr], #" STR(OFFSET_OUTPUT_SHIFT) "]\n"
+        "dup v28.4s, w6\n"
+        "ldr w6, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MIN) "]\n"
+        "neg w7, w7\n"
+        "dup v29.4s, w7\n"
+        "ldr w7, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MAX) "]\n"
+        "dup v30.4s, w6\n"
+        "ldr w6, [%[params_ptr], #" STR(OFFSET_FILTER_OFFSET) "]\n"
+        "dup v31.4s, w7\n"
+        "dup v25.8h, w6\n"
+
+        // Add input and filter offsets.
+        "uaddw v8.8h, v26.8h, v8.8b\n"
+        "ld1 {v16.4s}, [%[bias_ptr]], #16\n"
+        "uaddw v9.8h, v26.8h, v9.8b\n"
+        "ld1 {v17.4s}, [%[bias_ptr]], #16\n"
+        "uaddw v10.8h, v26.8h, v10.8b\n"
+        "uaddw v11.8h, v26.8h, v11.8b\n"
+
+        "uaddw v0.8h, v25.8h, v0.8b\n"
+        "uaddw v1.8h, v25.8h, v1.8b\n"
+        "uaddw v2.8h, v25.8h, v2.8b\n"
+        "uaddw v3.8h, v25.8h, v3.8b\n"
+        // Skip the loop if fewer than 16 channels remain (cmp x15, #16 above).
+        "blt " DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP "f\n"
+        // Main loop: processes and stores 8 channels per iteration while
+        // loading the next 8; it runs as long as at least 16 channels remain.
+        DEPTHWISECONV_LABEL_DEPTH_8_LOOP ":\n"
+          "smlal v16.4s, v0.4h, v8.4h\n"
+          "subs x15, x15, #8\n"
+          "smlal2 v17.4s, v0.8h, v8.8h\n"
+          "ld1 {v8.8b}, [%[input_ptr]], #8\n"
+          "cmp x15, #16\n"
+          "ld1 {v0.8b}, [%[filter_ptr]], #8\n"
+          "smlal v16.4s, v1.4h, v9.4h\n"
+          "smlal2 v17.4s, v1.8h, v9.8h\n"
+          "ld1 {v9.8b}, [x12], #8\n"
+          "smlal v16.4s, v2.4h, v10.4h\n"
+          "ld1 {v1.8b}, [x9], #8\n"
+          "smlal2 v17.4s, v2.8h, v10.8h\n"
+          "ld1 {v10.8b}, [x13], #8\n"
+          "smlal v16.4s, v3.4h, v11.4h\n"
+          "ld1 {v2.8b}, [x10], #8\n"
+          "smlal2 v17.4s, v3.8h, v11.8h\n"
+          "ld1 {v11.8b}, [x14], #8\n"
+          "ld1 {v3.8b}, [x11], #8\n"
+          // Requantize: multiply, rounding shift, offset, clamp, narrow.
+          "sqrdmulh v16.4s, v16.4s, v27.4s\n"
+          "sqrdmulh v17.4s, v17.4s, v27.4s\n"
+          "and v18.16b, v16.16b, v29.16b\n"
+          "and v19.16b, v17.16b, v29.16b\n"
+          "sshr v18.4s, v18.4s, #31\n"
+          "sshr v19.4s, v19.4s, #31\n"
+          "sqadd v16.4s, v16.4s, v18.4s\n"
+          "sqadd v17.4s, v17.4s, v19.4s\n"
+          "srshl v16.4s, v16.4s, v29.4s\n"
+          "srshl v17.4s, v17.4s, v29.4s\n"
+          "add v16.4s, v16.4s, v28.4s\n"
+          "add v17.4s, v17.4s, v28.4s\n"
+          "smax v16.4s, v16.4s, v30.4s\n"
+          "smax v17.4s, v17.4s, v30.4s\n"
+          "smin v16.4s, v16.4s, v31.4s\n"
+          "smin v17.4s, v17.4s, v31.4s\n"
+          "sqxtn v16.4h, v16.4s\n"
+          "sqxtn2 v16.8h, v17.4s\n"
+          "sqxtun v16.8b, v16.8h\n"
+          "st1 {v16.8b}, [%[output_ptr]], #8\n"
+          "uaddw v8.8h, v26.8h, v8.8b\n"
+          "ld1 {v16.4s}, [%[bias_ptr]], #16\n"
+          "uaddw v9.8h, v26.8h, v9.8b\n"
+          "ld1 {v17.4s}, [%[bias_ptr]], #16\n"
+          "uaddw v10.8h, v26.8h, v10.8b\n"
+          "uaddw v11.8h, v26.8h, v11.8b\n"
+          "uaddw v0.8h, v25.8h, v0.8b\n"
+          "uaddw v1.8h, v25.8h, v1.8b\n"
+          "uaddw v2.8h, v25.8h, v2.8b\n"
+          "uaddw v3.8h, v25.8h, v3.8b\n"
+          // Flags are still those of the cmp above; NEON ops leave NZCV alone.
+          "bge " DEPTHWISECONV_LABEL_DEPTH_8_LOOP "b\n"
+        // Last group of 8. NOTE(review): depth presumably a multiple of 8.
+        DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP ":\n"
+        "smlal v16.4s, v0.4h, v8.4h\n"
+        "smlal2 v17.4s, v0.8h, v8.8h\n"
+        "smlal v16.4s, v1.4h, v9.4h\n"
+        "smlal2 v17.4s, v1.8h, v9.8h\n"
+        "smlal v16.4s, v2.4h, v10.4h\n"
+        "smlal2 v17.4s, v2.8h, v10.8h\n"
+        "smlal v16.4s, v3.4h, v11.4h\n"
+        "smlal2 v17.4s, v3.8h, v11.8h\n"
+        // Requantize as in the loop; the final store does not post-increment.
+        "sqrdmulh v16.4s, v16.4s, v27.4s\n"
+        "sqrdmulh v17.4s, v17.4s, v27.4s\n"
+        "and v18.16b, v16.16b, v29.16b\n"
+        "and v19.16b, v17.16b, v29.16b\n"
+        "sshr v18.4s, v18.4s, #31\n"
+        "sshr v19.4s, v19.4s, #31\n"
+        "sqadd v16.4s, v16.4s, v18.4s\n"
+        "sqadd v17.4s, v17.4s, v19.4s\n"
+        "srshl v16.4s, v16.4s, v29.4s\n"
+        "srshl v17.4s, v17.4s, v29.4s\n"
+
+        "add v16.4s, v16.4s, v28.4s\n"
+        "add v17.4s, v17.4s, v28.4s\n"
+        "smax v16.4s, v16.4s, v30.4s\n"
+        "smax v17.4s, v17.4s, v30.4s\n"
+        "smin v16.4s, v16.4s, v31.4s\n"
+        "smin v17.4s, v17.4s, v31.4s\n"
+        "sqxtn v16.4h, v16.4s\n"
+        "sqxtn2 v16.8h, v17.4s\n"
+        "sqxtun v16.8b, v16.8h\n"
+        "st1 {v16.8b}, [%[output_ptr]]\n"
+        :
+        // Outputs.
+        [filter_ptr] "+r"(filter_ptr), [input_ptr] "+r"(input_ptr),
+        [output_ptr] "+r"(output_ptr), [bias_ptr] "+r"(bias_ptr)
+        :
+        // Inputs.
+        [params_ptr] "r"(params_ptr)
+        :
+        // Clobbers.
+        "cc", "memory",
+        // We use these NEON registers.
+        "v0", "v1", "v2", "v3", "v8", "v9", "v10", "v11", "v16", "v17", "v18",
+        "v19", "v25", "v26", "v27", "v28", "v29", "v30", "v31",
+        // We use these general-purpose registers.
+        "x6", "x7", "x9", "x10", "x11", "x12", "x13", "x14", "x15");
+#undef DEPTHWISECONV_LABEL_DEPTH_8_LOOP
+#undef DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP
+  }
+};
+
+template <>
+struct DepthwiseConvPartial<EdgeType::kHorizontal, 1, 1> {
+  static inline void Run(const uint8* input_ptr, const uint8* filter_ptr,
+                         const int32* bias_ptr, uint8* output_ptr,
+                         const DepthwiseConvParams* params_ptr) {
+#define DEPTHWISECONV_LABEL_DEPTH_8_LOOP "1"
+#define DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP "2"
+    asm volatile(
+        // Performs depthwise convolutions for an input window of size 2x3 and
+        // padding of 1 across the full depth. Expects |input_ptr| and
+        // |filter_ptr| to be pointing to the beginning of the 2x3 input and
+        // filter values.
+
+        // Load input and filter values.
+        "ldr x7, [%[params_ptr], #" STR(OFFSET_INPUT_DEPTH) "]\n"
+        "mov x12, %[input_ptr]\n"
+        "ldr x11, [%[params_ptr], #" STR(OFFSET_INPUT_ROW_SIZE) "]\n"
+        "mov x9, %[filter_ptr]\n"
+        "ldr x14, [%[params_ptr], #" STR(OFFSET_FILTER_ROW_SIZE) "]\n"
+        "add x13, x12, x11\n"
+        "ldr x15, [%[params_ptr], #" STR(OFFSET_OUTPUT_DEPTH) "]\n"
+        // Input rows: v8-v10 via x12, v11-v13 via x13; x7 is the pixel stride.
+        "ld1 {v8.8b}, [x12], x7\n"
+        "add x10, x9, x14\n"
+        "ld1 {v9.8b}, [x12], x7\n"
+        "cmp x15, #16\n"
+        "ld1 {v10.8b}, [x12]\n"
+        "add %[input_ptr], %[input_ptr], #8\n"
+        "ld1 {v11.8b}, [x13], x7\n"
+        "add %[filter_ptr], %[filter_ptr], #8\n"
+        "ld1 {v12.8b}, [x13], x7\n"
+        "ld1 {v13.8b}, [x13]\n"
+        // Filter rows: v0-v2 via x9, v3-v5 via x10, also stepped by x7.
+        "ld1 {v0.8b}, [x9], x7\n"
+        "ld1 {v1.8b}, [x9], x7\n"
+        "ld1 {v2.8b}, [x9]\n"
+        "ld1 {v3.8b}, [x10], x7\n"
+        "ld1 {v4.8b}, [x10], x7\n"
+        "ld1 {v5.8b}, [x10]\n"
+
+        // Load constants (w12/w13 are scratch; the loop recomputes x12/x13).
+        "ldr w12, [%[params_ptr], #" STR(OFFSET_INPUT_OFFSET) "]\n"
+        "ldr w13, [%[params_ptr], #" STR(OFFSET_OUTPUT_MULTIPLIER) "]\n"
+        "dup v26.8h, w12\n"
+        "ldr w12, [%[params_ptr], #" STR(OFFSET_OUTPUT_OFFSET) "]\n"
+        "dup v27.4s, w13\n"
+        "ldr w13, [%[params_ptr], #" STR(OFFSET_OUTPUT_SHIFT) "]\n"
+        "dup v28.4s, w12\n"
+        "ldr w12, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MIN) "]\n"
+        "neg w13, w13\n"
+        "dup v29.4s, w13\n"
+        "ldr w13, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MAX) "]\n"
+        "dup v30.4s, w12\n"
+        "ldr w12, [%[params_ptr], #" STR(OFFSET_FILTER_OFFSET) "]\n"
+        "dup v31.4s, w13\n"
+        "dup v25.8h, w12\n"
+
+        // Add input and filter offsets.
+        "uaddw v8.8h, v26.8h, v8.8b\n"
+        "ld1 {v16.4s}, [%[bias_ptr]], #16\n"
+        "uaddw v9.8h, v26.8h, v9.8b\n"
+        "ld1 {v17.4s}, [%[bias_ptr]], #16\n"
+        "uaddw v10.8h, v26.8h, v10.8b\n"
+        "uaddw v11.8h, v26.8h, v11.8b\n"
+        "uaddw v12.8h, v26.8h, v12.8b\n"
+        "uaddw v13.8h, v26.8h, v13.8b\n"
+
+        "uaddw v0.8h, v25.8h, v0.8b\n"
+        "uaddw v1.8h, v25.8h, v1.8b\n"
+        "uaddw v2.8h, v25.8h, v2.8b\n"
+        "uaddw v3.8h, v25.8h, v3.8b\n"
+        "uaddw v4.8h, v25.8h, v4.8b\n"
+        "uaddw v5.8h, v25.8h, v5.8b\n"
+        // Skip the loop if fewer than 16 channels remain (cmp x15, #16 above).
+        "blt " DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP "f\n"
+        // Main loop: 8 channels per iteration while at least 16 remain. Row
+        // pointers are rebuilt from |input_ptr|/|filter_ptr| (+8 per pass).
+        DEPTHWISECONV_LABEL_DEPTH_8_LOOP ":\n"
+          "mov x12, %[input_ptr]\n"
+          "subs x15, x15, #8\n"
+          "add x13, x12, x11\n"
+          "cmp x15, #16\n"
+          "add %[input_ptr], %[input_ptr], #8\n"
+
+          "smlal v16.4s, v0.4h, v8.4h\n"
+          "mov x9, %[filter_ptr]\n"
+          "smlal2 v17.4s, v0.8h, v8.8h\n"
+          "ld1 {v8.8b}, [x12], x7\n"
+          "smlal v16.4s, v1.4h, v9.4h\n"
+          "add x10, x9, x14\n"
+          "smlal2 v17.4s, v1.8h, v9.8h\n"
+          "ld1 {v9.8b}, [x12], x7\n"
+          "smlal v16.4s, v2.4h, v10.4h\n"
+          "add %[filter_ptr], %[filter_ptr], #8\n"
+          "smlal2 v17.4s, v2.8h, v10.8h\n"
+          "ld1 {v10.8b}, [x12]\n"
+          "smlal v16.4s, v3.4h, v11.4h\n"
+          "ld1 {v0.8b}, [x9], x7\n"
+          "smlal2 v17.4s, v3.8h, v11.8h\n"
+          "ld1 {v11.8b}, [x13], x7\n"
+          "smlal v16.4s, v4.4h, v12.4h\n"
+          "ld1 {v1.8b}, [x9], x7\n"
+          "smlal2 v17.4s, v4.8h, v12.8h\n"
+          "ld1 {v12.8b}, [x13], x7\n"
+          "smlal v16.4s, v5.4h, v13.4h\n"
+          "ld1 {v2.8b}, [x9]\n"
+          "smlal2 v17.4s, v5.8h, v13.8h\n"
+          "ld1 {v13.8b}, [x13]\n"
+          // Requantize, interleaved with the filter loads for the next group.
+          "sqrdmulh v16.4s, v16.4s, v27.4s\n"
+          "ld1 {v3.8b}, [x10], x7\n"
+          "sqrdmulh v17.4s, v17.4s, v27.4s\n"
+          "ld1 {v4.8b}, [x10], x7\n"
+          "and v18.16b, v16.16b, v29.16b\n"
+          "ld1 {v5.8b}, [x10]\n"
+          "and v19.16b, v17.16b, v29.16b\n"
+          "sshr v18.4s, v18.4s, #31\n"
+          "sshr v19.4s, v19.4s, #31\n"
+          "sqadd v16.4s, v16.4s, v18.4s\n"
+          "sqadd v17.4s, v17.4s, v19.4s\n"
+          "srshl v16.4s, v16.4s, v29.4s\n"
+          "srshl v17.4s, v17.4s, v29.4s\n"
+          "add v16.4s, v16.4s, v28.4s\n"
+          "add v17.4s, v17.4s, v28.4s\n"
+          "smax v16.4s, v16.4s, v30.4s\n"
+          "smax v17.4s, v17.4s, v30.4s\n"
+          "smin v16.4s, v16.4s, v31.4s\n"
+          "smin v17.4s, v17.4s, v31.4s\n"
+          "sqxtn v16.4h, v16.4s\n"
+          "sqxtn2 v16.8h, v17.4s\n"
+          "sqxtun v16.8b, v16.8h\n"
+          "uaddw v8.8h, v26.8h, v8.8b\n"
+          "st1 {v16.8b}, [%[output_ptr]], #8\n"
+          "uaddw v9.8h, v26.8h, v9.8b\n"
+          "uaddw v10.8h, v26.8h, v10.8b\n"
+          "uaddw v11.8h, v26.8h, v11.8b\n"
+          "uaddw v12.8h, v26.8h, v12.8b\n"
+          "uaddw v13.8h, v26.8h, v13.8b\n"
+
+          "uaddw v0.8h, v25.8h, v0.8b\n"
+          "uaddw v1.8h, v25.8h, v1.8b\n"
+          "uaddw v2.8h, v25.8h, v2.8b\n"
+          "ld1 {v16.4s}, [%[bias_ptr]], #16\n"
+          "uaddw v3.8h, v25.8h, v3.8b\n"
+          "ld1 {v17.4s}, [%[bias_ptr]], #16\n"
+          "uaddw v4.8h, v25.8h, v4.8b\n"
+          "uaddw v5.8h, v25.8h, v5.8b\n"
+          // Flags are still those of the cmp above; NEON ops leave NZCV alone.
+          "bge " DEPTHWISECONV_LABEL_DEPTH_8_LOOP "b\n"
+        // Last group of 8. NOTE(review): depth presumably a multiple of 8.
+        DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP ":\n"
+        "smlal v16.4s, v0.4h, v8.4h\n"
+        "smlal2 v17.4s, v0.8h, v8.8h\n"
+        "smlal v16.4s, v1.4h, v9.4h\n"
+        "smlal2 v17.4s, v1.8h, v9.8h\n"
+        "smlal v16.4s, v2.4h, v10.4h\n"
+        "smlal2 v17.4s, v2.8h, v10.8h\n"
+        "smlal v16.4s, v3.4h, v11.4h\n"
+        "smlal2 v17.4s, v3.8h, v11.8h\n"
+        "smlal v16.4s, v4.4h, v12.4h\n"
+        "smlal2 v17.4s, v4.8h, v12.8h\n"
+        "smlal v16.4s, v5.4h, v13.4h\n"
+        "smlal2 v17.4s, v5.8h, v13.8h\n"
+        // Requantize as in the loop; the final store does not post-increment.
+        "sqrdmulh v16.4s, v16.4s, v27.4s\n"
+        "sqrdmulh v17.4s, v17.4s, v27.4s\n"
+        "and v18.16b, v16.16b, v29.16b\n"
+        "and v19.16b, v17.16b, v29.16b\n"
+        "sshr v18.4s, v18.4s, #31\n"
+        "sshr v19.4s, v19.4s, #31\n"
+        "sqadd v16.4s, v16.4s, v18.4s\n"
+        "sqadd v17.4s, v17.4s, v19.4s\n"
+        "srshl v16.4s, v16.4s, v29.4s\n"
+        "srshl v17.4s, v17.4s, v29.4s\n"
+        "add v16.4s, v16.4s, v28.4s\n"
+        "add v17.4s, v17.4s, v28.4s\n"
+        "smax v16.4s, v16.4s, v30.4s\n"
+        "smax v17.4s, v17.4s, v30.4s\n"
+        "smin v16.4s, v16.4s, v31.4s\n"
+        "smin v17.4s, v17.4s, v31.4s\n"
+        "sqxtn v16.4h, v16.4s\n"
+        "sqxtn2 v16.8h, v17.4s\n"
+        "sqxtun v16.8b, v16.8h\n"
+        "st1 {v16.8b}, [%[output_ptr]]\n"
+        :
+        // Outputs.
+        [filter_ptr] "+r"(filter_ptr), [input_ptr] "+r"(input_ptr),
+        [output_ptr] "+r"(output_ptr), [bias_ptr] "+r"(bias_ptr)
+        :
+        // Inputs.
+        [params_ptr] "r"(params_ptr)
+        :
+        // Clobbers.
+        "cc", "memory",
+        // We use these NEON registers.
+        "v0", "v1", "v2", "v3", "v4", "v5", "v8", "v9", "v10", "v11", "v12",
+        "v13", "v16", "v17", "v18", "v19", "v25", "v26", "v27", "v28", "v29",
+        "v30", "v31",
+        // We use these general-purpose registers.
+        "x7", "x9", "x10", "x11", "x12", "x13", "x14", "x15");
+#undef DEPTHWISECONV_LABEL_DEPTH_8_LOOP
+#undef DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP
+  }
+};
+
+template <>
+struct DepthwiseConvPartial<EdgeType::kVertical, 1, 1> {
+  static inline void Run(const uint8* input_ptr, const uint8* filter_ptr,
+                         const int32* bias_ptr, uint8* output_ptr,
+                         const DepthwiseConvParams* params_ptr) {
+#define DEPTHWISECONV_LABEL_DEPTH_8_LOOP "1"
+#define DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP "2"
+    asm volatile(
+        // Performs depthwise convolutions for an input window of size 3x2 and
+        // padding of 1 across the full depth. Expects |input_ptr| and
+        // |filter_ptr| to be pointing to the beginning of the 3x2 input and
+        // filter values.
+
+        // Load input and filter values.
+        "ldr x6, [%[params_ptr], #" STR(OFFSET_INPUT_DEPTH) "]\n"
+        "mov x12, %[input_ptr]\n"
+        "ldr x11, [%[params_ptr], #" STR(OFFSET_INPUT_ROW_SIZE) "]\n"
+        "mov x7, %[filter_ptr]\n"
+        "ldr x5, [%[params_ptr], #" STR(OFFSET_FILTER_ROW_SIZE) "]\n"
+        "add x13, x12, x11\n"
+        "ldr x15, [%[params_ptr], #" STR(OFFSET_OUTPUT_DEPTH) "]\n"
+        "add x14, x13, x11\n"
+
+        "ld1 {v8.8b}, [x12], x6\n"
+        "add x9, x7, x5\n"
+        "ld1 {v9.8b}, [x12]\n"
+        "cmp x15, #16\n"
+        "add x10, x9, x5\n"
+        "ld1 {v10.8b}, [x13], x6\n"
+        "add %[input_ptr], %[input_ptr], #8\n"
+        "ld1 {v11.8b}, [x13]\n"
+        "add %[filter_ptr], %[filter_ptr], #8\n"
+        "ld1 {v12.8b}, [x14], x6\n"
+        "ld1 {v13.8b}, [x14]\n"
+
+        "ld1 {v0.8b}, [x7], x6\n"
+        "ld1 {v1.8b}, [x7]\n"
+        "ld1 {v2.8b}, [x9], x6\n"
+        "ld1 {v3.8b}, [x9]\n"
+        "ld1 {v4.8b}, [x10], x6\n"
+        "ld1 {v5.8b}, [x10]\n"
+
+        // Load constants.
+        "ldr w12, [%[params_ptr], #" STR(OFFSET_INPUT_OFFSET) "]\n"
+        "ldr w13, [%[params_ptr], #" STR(OFFSET_OUTPUT_MULTIPLIER) "]\n"
+        "dup v26.8h, w12\n"
+        "ldr w12, [%[params_ptr], #" STR(OFFSET_OUTPUT_OFFSET) "]\n"
+        "dup v27.4s, w13\n"
+        "ldr w13, [%[params_ptr], #" STR(OFFSET_OUTPUT_SHIFT) "]\n"
+        "dup v28.4s, w12\n"
+        "ldr w12, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MIN) "]\n"
+        "neg w13, w13\n"
+        "dup v29.4s, w13\n"
+        "ldr w13, [%[params_ptr], #" STR(OFFSET_OUTPUT_ACTIVATION_MAX) "]\n"
+        "dup v30.4s, w12\n"
+        "ldr w12, [%[params_ptr], #" STR(OFFSET_FILTER_OFFSET) "]\n"
+        "dup v31.4s, w13\n"
+        "dup v25.8h, w12\n"
+
+        // Add input and filter offsets.
+        "uaddw v8.8h, v26.8h, v8.8b\n"
+        "ld1 {v16.4s}, [%[bias_ptr]], #16\n"
+        "uaddw v9.8h, v26.8h, v9.8b\n"
+        "ld1 {v17.4s}, [%[bias_ptr]], #16\n"
+        "uaddw v10.8h, v26.8h, v10.8b\n"
+        "uaddw v11.8h, v26.8h, v11.8b\n"
+        "uaddw v12.8h, v26.8h, v12.8b\n"
+        "uaddw v13.8h, v26.8h, v13.8b\n"
+
+        "uaddw v0.8h, v25.8h, v0.8b\n"
+        "uaddw v1.8h, v25.8h, v1.8b\n"
+        "uaddw v2.8h, v25.8h, v2.8b\n"
+        "uaddw v3.8h, v25.8h, v3.8b\n"
+        "uaddw v4.8h, v25.8h, v4.8b\n"
+        "uaddw v5.8h, v25.8h, v5.8b\n"
+
+        "blt " DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP "f\n"
+
+        //"loop_%=:\n"
+        DEPTHWISECONV_LABEL_DEPTH_8_LOOP ":\n"
+          "mov x12, %[input_ptr]\n"
+          "subs x15, x15, #8\n"
+          "add x13, x12, x11\n"
+          "cmp x15, #16\n"
+          "add x14, x13, x11\n"
+          "add %[input_ptr], %[input_ptr], #8\n"
+
+          "smlal v16.4s, v0.4h, v8.4h\n"
+          "mov x7, %[filter_ptr]\n"
+          "smlal2 v17.4s, v0.8h, v8.8h\n"
+          "ld1 {v8.8b}, [x12], x6\n"
+          "smlal v16.4s, v1.4h, v9.4h\n"
+          "add x9, x7, x5\n"
+          "smlal2 v17.4s, v1.8h, v9.8h\n"
+          "add x10, x9, x5\n"
+          "ld1 {v9.8b}, [x12]\n"
+          "smlal v16.4s, v2.4h, v10.4h\n"
+          "add %[filter_ptr], %[filter_ptr], #8\n"
+          "smlal2 v17.4s, v2.8h, v10.8h\n"
+          "ld1 {v10.8b}, [x13], x6\n"
+          "smlal v16.4s, v3.4h, v11.4h\n"
+          "ld1 {v0.8b}, [x7], x6\n"
+          "smlal2 v17.4s, v3.8h, v11.8h\n"
+          "ld1 {v11.8b}, [x13]\n"
+          "smlal v16.4s, v4.4h, v12.4h\n"
+          "ld1 {v1.8b}, [x7]\n"
+          "smlal2 v17.4s, v4.8h, v12.8h\n"
+          "ld1 {v12.8b}, [x14], x6\n"
+          "smlal v16.4s, v5.4h, v13.4h\n"
+          "ld1 {v2.8b}, [x9], x6\n"
+          "smlal2 v17.4s, v5.8h, v13.8h\n"
+          "ld1 {v13.8b}, [x14]\n"
+
+          "sqrdmulh v16.4s, v16.4s, v27.4s\n"
+          "ld1 {v3.8b}, [x9]\n"
+          "sqrdmulh v17.4s, v17.4s, v27.4s\n"
+          "ld1 {v4.8b}, [x10], x6\n"
+          "and v18.16b, v16.16b, v29.16b\n"
+          "ld1 {v5.8b}, [x10]\n"
+          "and v19.16b, v17.16b, v29.16b\n"
+          "sshr v18.4s, v18.4s, #31\n"
+          "sshr v19.4s, v19.4s, #31\n"
+          "sqadd v16.4s, v16.4s, v18.4s\n"
+          "sqadd v17.4s, v17.4s, v19.4s\n"
+          "srshl v16.4s, v16.4s, v29.4s\n"
+          "srshl v17.4s, v17.4s, v29.4s\n"
+          "add v16.4s, v16.4s, v28.4s\n"
+          "add v17.4s, v17.4s, v28.4s\n"
+          "smax v16.4s, v16.4s, v30.4s\n"
+          "smax v17.4s, v17.4s, v30.4s\n"
+          "smin v16.4s, v16.4s, v31.4s\n"
+          "smin v17.4s, v17.4s, v31.4s\n"
+          "sqxtn v16.4h, v16.4s\n"
+          "sqxtn2 v16.8h, v17.4s\n"
+          "sqxtun v16.8b, v16.8h\n"
+          "uaddw v8.8h, v26.8h, v8.8b\n"
+          "st1 {v16.8b}, [%[output_ptr]], #8\n"
+          "uaddw v9.8h, v26.8h, v9.8b\n"
+          "uaddw v10.8h, v26.8h, v10.8b\n"
+          "uaddw v11.8h, v26.8h, v11.8b\n"
+          "uaddw v12.8h, v26.8h, v12.8b\n"
+          "uaddw v13.8h, v26.8h, v13.8b\n"
+
+          "uaddw v0.8h, v25.8h, v0.8b\n"
+          "uaddw v1.8h, v25.8h, v1.8b\n"
+          "uaddw v2.8h, v25.8h, v2.8b\n"
+          "ld1 {v16.4s}, [%[bias_ptr]], #16\n"
+          "uaddw v3.8h, v25.8h, v3.8b\n"
+          "ld1 {v17.4s}, [%[bias_ptr]], #16\n"
+          "uaddw v4.8h, v25.8h, v4.8b\n"
+          "uaddw v5.8h, v25.8h, v5.8b\n"
+
+          "bge " DEPTHWISECONV_LABEL_DEPTH_8_LOOP "b\n"
+
+        DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP ":\n"
+        "smlal v16.4s, v0.4h, v8.4h\n"
+        "smlal2 v17.4s, v0.8h, v8.8h\n"
+        "smlal v16.4s, v1.4h, v9.4h\n"
+        "smlal2 v17.4s, v1.8h, v9.8h\n"
+        "smlal v16.4s, v2.4h, v10.4h\n"
+        "smlal2 v17.4s, v2.8h, v10.8h\n"
+        "smlal v16.4s, v3.4h, v11.4h\n"
+        "smlal2 v17.4s, v3.8h, v11.8h\n"
+        "smlal v16.4s, v4.4h, v12.4h\n"
+        "smlal2 v17.4s, v4.8h, v12.8h\n"
+        "smlal v16.4s, v5.4h, v13.4h\n"
+        "smlal2 v17.4s, v5.8h, v13.8h\n"
+
+        "sqrdmulh v16.4s, v16.4s, v27.4s\n"
+        "sqrdmulh v17.4s, v17.4s, v27.4s\n"
+        "and v18.16b, v16.16b, v29.16b\n"
+        "and v19.16b, v17.16b, v29.16b\n"
+        "sshr v18.4s, v18.4s, #31\n"
+        "sshr v19.4s, v19.4s, #31\n"
+        "sqadd v16.4s, v16.4s, v18.4s\n"
+        "sqadd v17.4s, v17.4s, v19.4s\n"
+        "srshl v16.4s, v16.4s, v29.4s\n"
+        "srshl v17.4s, v17.4s, v29.4s\n"
+        "add v16.4s, v16.4s, v28.4s\n"
+        "add v17.4s, v17.4s, v28.4s\n"
+        "smax v16.4s, v16.4s, v30.4s\n"
+        "smax v17.4s, v17.4s, v30.4s\n"
+        "smin v16.4s, v16.4s, v31.4s\n"
+        "smin v17.4s, v17.4s, v31.4s\n"
+        "sqxtn v16.4h, v16.4s\n"
+        "sqxtn2 v16.8h, v17.4s\n"
+        "sqxtun v16.8b, v16.8h\n"
+        "st1 {v16.8b}, [%[output_ptr]]\n"
+        :
+        // Outputs.
+        [filter_ptr] "+r"(filter_ptr), [input_ptr] "+r"(input_ptr),
+        [output_ptr] "+r"(output_ptr), [bias_ptr] "+r"(bias_ptr)
+        :
+        // Inputs.
+        [params_ptr] "r"(params_ptr)
+        :
+        // Clobbers.
+        "cc", "memory",
+        // We use these NEON registers.
+        "v0", "v1", "v2", "v3", "v4", "v5", "v8", "v9", "v10", "v11", "v12",
+        "v13", "v16", "v17", "v18", "v19", "v25", "v26", "v27", "v28", "v29",
+        "v30", "v31",
+        // We use these general-purpose registers.
+        "x5", "x6", "x7", "x9", "x10", "x11", "x12", "x13", "x14", "x15");
+#undef DEPTHWISECONV_LABEL_DEPTH_8_LOOP
+#undef DEPTHWISECONV_LABEL_DEPTH_8_AFTER_LOOP
+  }
+};
+
 #undef OFFSET_INPUT_DEPTH
 #undef OFFSET_INPUT_ROW_SIZE
 #undef OFFSET_OUTPUT_DEPTH
@@ -2266,7 +2987,7 @@ template <int32 kStrideWidth, int32 kStrideHeight>
 struct DepthwiseConvMultiRow {
   using ConvKernel = DepthwiseConvThroughDepth<kStrideWidth, kStrideHeight>;
 
-  static inline void Run(const uint8* input_data, int32 start_x, int32 start_y,
+  static inline void Run(const uint8* input_data, int32 start_x, int32 end_x,
                          const uint8* filter_data, const int32* bias_data,
                          uint8* output_data, const DepthwiseConvParams& params,
                          const ShuffleParams& shuffle_params,
@@ -2286,7 +3007,7 @@ struct DepthwiseConvMultiRow {
     // preshuffle the input data to maximize locality.
     if (params.output_depth > 64 ||
         (params.output_depth <= 64 && params.input_width > 150)) {
-      for (; out_x <= (params.output_width - shuffle_params.output_width);
+      for (; out_x <= (end_x - shuffle_params.output_width);
              out_x += shuffle_params.output_width) {
         const uint8* input_ptr = input_data;
         const int32* bias_ptr = bias_data;
@@ -2344,7 +3065,7 @@ struct DepthwiseConvMultiRow {
       }
     }
 
-    const int32 output_leftover_width = params.output_width - out_x;
+    const int32 output_leftover_width = end_x - out_x;
     if (output_leftover_width > 0) {
       ConvKernel::Run(input_data, filter_data, bias_data, output_data, 0,
                       params.output_depth, params.input_depth,
@@ -2354,6 +3075,105 @@ struct DepthwiseConvMultiRow {
   }
 };
 
+// Computes the one-pixel-wide output border (first/last row and column) for
+// pad_width == pad_height == 1. Calls the DepthwiseConvPartial asm kernels:
+//   * kCenter: 1x1 input shape (single call, then returns early).
+//   * kCorner: the four corner outputs.
+//   * kHorizontal: the top-row and bottom-row outputs between the corners.
+//   * kVertical: the left and right column outputs between the corners.
+inline void DepthwiseConvHandlePadding(const uint8* input_data,
+    const uint8* filter_data, const int32* bias_data, uint8* output_data,
+    const DepthwiseConvParams& params) {
+  if (params.input_width == 1 && params.input_height == 1) {
+    const uint8* filter_ptr = filter_data + params.filter_row_size
+        + params.output_depth;  // Filter row 1, col 1.
+    DepthwiseConvPartial<EdgeType::kCenter, 1, 1>::Run(input_data, filter_ptr,
+        bias_data, output_data, &params);
+    return;
+  }
+
+  const int32 out_x_start_corner = 0;
+  const int32 out_x_end_corner = params.output_width - 1;
+  const int32 out_y_start_corner = 0;
+  const int32 out_y_end_corner = params.output_height - 1;
+
+  // Handle top row: left corner, then interior columns, then right corner.
+  const uint8* input_ptr = input_data;
+  const uint8* filter_ptr = filter_data + params.filter_row_size
+      + params.output_depth;  // Filter row 1, col 1.
+  uint8* output_ptr = output_data;
+
+  DepthwiseConvPartial<EdgeType::kCorner, 1, 1>::Run(input_ptr, filter_ptr,
+      bias_data, output_ptr, &params);
+
+  input_ptr += (params.stride_width - 1) * params.input_depth;
+  filter_ptr = filter_data + params.filter_row_size;  // Filter row 1, col 0.
+  output_ptr += params.output_depth;
+
+  for (int32 out_x = out_x_start_corner + 1; out_x < out_x_end_corner;
+           out_x++) {
+    DepthwiseConvPartial<EdgeType::kHorizontal, 1, 1>::Run(
+        input_ptr, filter_ptr, bias_data, output_ptr, &params);
+    input_ptr += params.stride_width * params.input_depth;
+    output_ptr += params.output_depth;
+  }
+
+  DepthwiseConvPartial<EdgeType::kCorner, 1, 1>::Run(input_ptr, filter_ptr,
+      bias_data, output_ptr, &params);
+
+  // Handle left side (interior rows). NOTE: rows are stepped by stride_width.
+  input_ptr = input_data + (params.stride_width - 1) * params.input_row_size;
+  filter_ptr = filter_data + params.input_depth;  // Filter row 0, col 1.
+  output_ptr = output_data + params.output_row_size;
+
+  for (int32 out_y = out_y_start_corner + 1; out_y < out_y_end_corner;
+           out_y++) {
+    DepthwiseConvPartial<EdgeType::kVertical, 1, 1>::Run(
+        input_ptr, filter_ptr, bias_data, output_ptr, &params);
+    input_ptr += params.stride_width * params.input_row_size;
+    output_ptr += params.output_row_size;
+  }
+
+  // Handle right side (interior rows), starting at input column width - 2.
+  input_ptr = input_data + (params.input_width - 2) * params.input_depth
+      + (params.stride_width - 1) * params.input_row_size;
+  filter_ptr = filter_data;  // Filter row 0, col 0.
+  output_ptr = output_data + params.output_row_size +
+      (params.output_width - 1) * params.output_depth;
+
+  for (int32 out_y = out_y_start_corner + 1; out_y < out_y_end_corner;
+         out_y++) {
+    DepthwiseConvPartial<EdgeType::kVertical, 1, 1>::Run(
+        input_ptr, filter_ptr, bias_data, output_ptr, &params);
+    input_ptr += params.stride_width * params.input_row_size;
+    output_ptr += params.output_row_size;
+  }
+
+  // Handle bottom row, starting at input row height - 2: corners + interior.
+  input_ptr = input_data + (params.input_height - 2) * params.input_row_size;
+  filter_ptr = filter_data + params.output_depth;  // Filter row 0, col 1.
+  output_ptr = output_data +
+      (params.output_height - 1) * params.output_row_size;
+
+  DepthwiseConvPartial<EdgeType::kCorner, 1, 1>::Run(input_ptr, filter_ptr,
+      bias_data, output_ptr, &params);
+
+  input_ptr += (params.stride_width == 1) ? 0 : params.input_depth;
+  filter_ptr = filter_data;  // Filter row 0, col 0.
+  output_ptr += params.output_depth;
+
+  for (int32 out_x = out_x_start_corner + 1; out_x < out_x_end_corner;
+           out_x++) {
+    DepthwiseConvPartial<EdgeType::kHorizontal, 1, 1>::Run(
+        input_ptr, filter_ptr, bias_data, output_ptr, &params);
+    input_ptr += params.stride_width * params.input_depth;
+    output_ptr += params.output_depth;
+  }
+
+  DepthwiseConvPartial<EdgeType::kCorner, 1, 1>::Run(input_ptr, filter_ptr,
+      bias_data, output_ptr, &params);
+}
+
 inline bool Fast3x3FilterKernelSupported(
     const Dims<4>& input_dims, const Dims<4>& filter_dims, int32 stride_width,
     int32 stride_height, int32 pad_width, int32 pad_height,
@@ -2370,7 +3190,8 @@ inline bool Fast3x3FilterKernelSupported(
       filter_width == 3 && filter_height == 3 && depth_multiplier == 1 &&
       (stride_width == 1 || stride_width == 2) &&
       (stride_height == 1 || stride_height == 2) &&
-      (stride_width == stride_height) && pad_width == 0 && pad_height == 0 &&
+      (stride_width == stride_height) && (pad_width == 0 || pad_width == 1) &&
+      (pad_height == 0 || pad_height == 1) && (pad_width == pad_height) &&
       (input_depth % 8) == 0 && (output_shift > 0);
 
   if (!supported) {
@@ -2390,8 +3211,26 @@ inline bool Fast3x3FilterKernelSupported(
   const int32 in_y_end = in_y_origin + filter_height;
 
   // Supported only if filter on the right and bottom boundary lies completely
-  // within the input.
-  return in_x_end <= input_width && in_y_end <= input_height;
+  // within the input if padding is zero.
+  if (pad_width == 0 && pad_height == 0) {
+    return in_x_end <= input_width && in_y_end <= input_height;
+  }
+
+  // Else if padding is 1, supported if the bottom-right filter window extends
+  // at most 1 past the input width and height.
+  supported = in_x_end <= (input_width + 1) && in_y_end <= (input_height + 1);
+
+  if (!supported) {
+    return false;
+  }
+
+  // Shapes with width 1 and height > 1, and vice versa are not supported yet.
+  if (input_width == 1) {
+    supported = (input_width == input_height);
+  } else if (input_height == 1) {
+    supported = (input_width == input_height);
+  }
+  return supported;
 }
 
 inline void DepthwiseConv3x3Filter(
@@ -2409,6 +3248,8 @@ inline void DepthwiseConv3x3Filter(
   params.input_height = ArraySize(input_dims, 2);
   params.input_row_size = params.input_depth * params.input_width;
   params.input_offset = input_offset;
+  params.stride_width = stride_width;
+  params.stride_height = stride_height;
   params.output_depth = MatchingArraySize(filter_dims, 0, output_dims, 0);
   params.output_width = ArraySize(output_dims, 1);
   params.output_height = ArraySize(output_dims, 2);
@@ -2422,6 +3263,7 @@ inline void DepthwiseConv3x3Filter(
 
   const int32 filter_height = ArraySize(filter_dims, 2);
   const int32 filter_width = ArraySize(filter_dims, 1);
+  params.filter_row_size = params.output_depth * filter_width;
 
   // Algorithm assumes below constraints. It is optimized for depth
   // multiplier of 1, 3x3 filter, no padding and strides 1 and 2.
@@ -2432,8 +3274,9 @@ inline void DepthwiseConv3x3Filter(
   TFLITE_DCHECK(stride_height == 1 || stride_height == 2);
   TFLITE_DCHECK(stride_width == 1 || stride_width == 2);
   TFLITE_DCHECK(stride_width == stride_height);
-  TFLITE_DCHECK(pad_height == 0);
-  TFLITE_DCHECK(pad_width == 0);
+  TFLITE_DCHECK(pad_height == 0 || pad_height == 1);
+  TFLITE_DCHECK(pad_width == 0 || pad_width == 1);
+  TFLITE_DCHECK(pad_width == pad_height);
 
   const int32 batches = MatchingArraySize(input_dims, 3, output_dims, 3);
   const int64_t input_batch_size = params.input_row_size * params.input_height;
@@ -2471,7 +3314,26 @@ inline void DepthwiseConv3x3Filter(
     const uint8* input_ptr = input_data + b * input_batch_size;
     uint8* output_ptr = output_data + b * output_batch_size;
 
+    int32 out_x = 0;
     int32 out_y = 0;
+    int32 end_x = params.output_width;
+    int32 end_y = params.output_height;
+
+    if (pad_width == 1 && pad_height == 1) {
+      DepthwiseConvHandlePadding(input_ptr, filter_data, bias_data, output_ptr,
+                                 params);
+
+      // Update extents now that the edges have been handled.
+      out_x = 1;
+      end_x = params.output_width - 1;
+      out_y = 1;
+      end_y = params.output_height - 1;
+      const int in_x = (out_x * stride_width) - pad_width;
+      const int in_y = (out_y * stride_height) - pad_height;
+      input_ptr += in_y * params.input_row_size + in_x * params.input_depth;
+      output_ptr += out_y * params.output_row_size
+          + out_x * params.output_depth;
+    }
 
     // Shuffling shapes that maximize width over the shuffle workspace size
     // perform better since the inputs are closer together, minimizing
@@ -2486,8 +3348,8 @@ inline void DepthwiseConv3x3Filter(
 
     // Handle 8 rows at a time.
     if (params.input_width < four_row_shuffle_params.input_width) {
-      for (; out_y <= params.output_height - 8; out_y += 8) {
-        conv_multirow_func(input_ptr, 0, out_y, filter_data, bias_data,
+      for (; out_y <= end_y - 8; out_y += 8) {
+        conv_multirow_func(input_ptr, out_x, end_x, filter_data, bias_data,
                            output_ptr, params, eight_row_shuffle_params,
                            shuffle_workspace);
         input_ptr += 8 * stride_height * params.input_row_size;
@@ -2497,8 +3359,8 @@ inline void DepthwiseConv3x3Filter(
 
     // Handle 4 rows at a time.
     if (params.input_width < two_row_shuffle_params.input_width) {
-      for (; out_y <= params.output_height - 4; out_y += 4) {
-        conv_multirow_func(input_ptr, 0, out_y, filter_data, bias_data,
+      for (; out_y <= end_y - 4; out_y += 4) {
+        conv_multirow_func(input_ptr, out_x, end_x, filter_data, bias_data,
                            output_ptr, params, four_row_shuffle_params,
                            shuffle_workspace);
         input_ptr += 4 * stride_height * params.input_row_size;
@@ -2507,8 +3369,8 @@ inline void DepthwiseConv3x3Filter(
     }
 
     // Handle 2 rows at a time.
-    for (; out_y <= params.output_height - 2; out_y += 2) {
-      conv_multirow_func(input_ptr, 0, out_y, filter_data, bias_data,
+    for (; out_y <= end_y - 2; out_y += 2) {
+      conv_multirow_func(input_ptr, out_x, end_x, filter_data, bias_data,
                          output_ptr, params, two_row_shuffle_params,
                          shuffle_workspace);
       input_ptr += 2 * stride_height * params.input_row_size;
@@ -2516,8 +3378,8 @@ inline void DepthwiseConv3x3Filter(
     }
 
     // Handle one row at a time.
-    for (; out_y < params.output_height; out_y++) {
-      conv_multirow_func(input_ptr, 0, out_y, filter_data, bias_data,
+    for (; out_y < end_y; out_y++) {
+      conv_multirow_func(input_ptr, out_x, end_x, filter_data, bias_data,
                          output_ptr, params, one_row_shuffle_params,
                          shuffle_workspace);
       input_ptr += stride_height * params.input_row_size;
-- 
GitLab


From 5e0b2f2b0d0d938152334ae1ef1c9b25d229e280 Mon Sep 17 00:00:00 2001
From: Justin Lebar <jlebar@google.com>
Date: Fri, 1 Jun 2018 15:32:16 -0700
Subject: [PATCH 189/610] [XLA] Move xla/tools/parser/* into xla/service.

Now that we're using the parser inside of xla/service, it's awkward for
it to live inside of xla/tools, because everything else in there is a
standalone tool.  We've already had one person be confused by this.

PiperOrigin-RevId: 198935921
---
 tensorflow/compiler/xla/service/BUILD         |  95 +++++--
 .../xla/service/buffer_assignment_test.cc     |   4 +-
 tensorflow/compiler/xla/service/cpu/BUILD     |   6 +-
 .../cpu/cpu_eigen_tensor_alignment_test.cc    |   6 +-
 .../cpu/cpu_instruction_fusion_test.cc        |  10 +-
 .../xla/service/cpu/ir_emission_utils_test.cc |   4 +-
 .../compiler/xla/service/cpu/tests/BUILD      |   4 +-
 .../cpu/tests/cpu_literal_caching_test.cc     |   6 +-
 .../xla/service/cpu/tests/cpu_outfeed_test.cc |   4 +-
 .../xla/service/elemental_ir_emitter_test.cc  |   4 +-
 .../README.md => service/g3doc/hlo_parser.md} |   0
 .../xla/service/gather_expander_test.cc       |   6 +-
 tensorflow/compiler/xla/service/gpu/BUILD     |   4 +-
 .../xla/service/gpu/fusion_merger_test.cc     |  12 +-
 .../service/gpu/instruction_fusion_test.cc    |  32 +--
 .../xla/service/gpu/while_transformer.cc      |   4 +-
 .../compiler/xla/service/hlo_cse_test.cc      |   4 +-
 .../compiler/xla/service/hlo_domain_test.cc   |   4 +-
 .../xla/service/hlo_execution_profile_test.cc |   4 +-
 .../xla/service/hlo_instruction_test.cc       |   4 +-
 .../{tools/parser => service}/hlo_lexer.cc    |  26 +-
 .../xla/{tools/parser => service}/hlo_lexer.h |  17 +-
 .../xla/service/hlo_liveness_analysis_test.cc |  22 +-
 .../compiler/xla/service/hlo_matchers.h       |   4 +-
 .../compiler/xla/service/hlo_matchers_test.cc |   3 +-
 .../xla/service/hlo_module_dce_test.cc        |  14 +-
 .../compiler/xla/service/hlo_ordering_test.cc |   6 +-
 .../{tools/parser => service}/hlo_parser.cc   | 252 ++++++++++--------
 .../{tools/parser => service}/hlo_parser.h    |  24 +-
 .../parser => service}/hlo_parser_test.cc     |  90 +++----
 tensorflow/compiler/xla/service/hlo_runner.cc |   6 +-
 .../xla/service/hlo_scheduling_test.cc        |   4 +-
 .../compiler/xla/service/hlo_sharding_test.cc |   6 +-
 .../xla/{tools/parser => service}/hlo_token.h |  11 +-
 .../xla/service/instruction_fusion_test.cc    |  20 +-
 .../xla/service/layout_assignment_test.cc     |   6 +-
 .../xla/service/pattern_matcher_test.cc       |   6 +-
 .../xla/service/transpose_folding_test.cc     |  12 +-
 .../compiler/xla/service/tuple_util_test.cc   |   4 +-
 .../while_loop_constant_sinking_test.cc       |  10 +-
 .../while_loop_invariant_code_motion_test.cc  |   2 +-
 .../compiler/xla/service/while_util_test.cc   |   8 +-
 tensorflow/compiler/xla/tests/BUILD           |  10 +-
 .../xla/tests/cross_replica_sum_test.cc       |  11 +-
 .../xla/tests/gather_operation_test.cc        |   4 +-
 .../compiler/xla/tests/hlo_test_base.cc       |   2 +-
 .../xla/tests/hlo_verified_test_base.cc       |   4 +-
 .../compiler/xla/tests/reduce_hlo_test.cc     |   4 +-
 tensorflow/compiler/xla/tools/parser/BUILD    |  73 -----
 49 files changed, 442 insertions(+), 436 deletions(-)
 rename tensorflow/compiler/xla/{tools/parser/README.md => service/g3doc/hlo_parser.md} (100%)
 rename tensorflow/compiler/xla/{tools/parser => service}/hlo_lexer.cc (95%)
 rename tensorflow/compiler/xla/{tools/parser => service}/hlo_lexer.h (90%)
 rename tensorflow/compiler/xla/{tools/parser => service}/hlo_parser.cc (92%)
 rename tensorflow/compiler/xla/{tools/parser => service}/hlo_parser.h (70%)
 rename tensorflow/compiler/xla/{tools/parser => service}/hlo_parser_test.cc (94%)
 rename tensorflow/compiler/xla/{tools/parser => service}/hlo_token.h (84%)
 delete mode 100644 tensorflow/compiler/xla/tools/parser/BUILD

diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index 2b14b63ea8..0102e4f003 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -349,8 +349,8 @@ tf_cc_test(
         ":hlo",
         ":pattern_matcher",
         "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
-        "//tensorflow/compiler/xla/tools/parser:hlo_parser",
         "//tensorflow/core:test",
     ],
 )
@@ -388,8 +388,8 @@ cc_library(
     deps = [
         ":hlo",
         "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
-        "//tensorflow/compiler/xla/tools/parser:hlo_parser",
         "//tensorflow/core:lib",
     ],
 )
@@ -399,6 +399,7 @@ tf_cc_test(
     srcs = ["hlo_matchers_test.cc"],
     deps = [
         ":hlo_matchers",
+        ":hlo_parser",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
     ],
@@ -420,6 +421,7 @@ tf_cc_test(
     srcs = ["hlo_instruction_test.cc"],
     deps = [
         ":hlo",
+        ":hlo_parser",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:protobuf_util",
         "//tensorflow/compiler/xla:shape_util",
@@ -429,7 +431,6 @@ tf_cc_test(
         "//tensorflow/compiler/xla:window_util",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
-        "//tensorflow/compiler/xla/tools/parser:hlo_parser",
     ],
 )
 
@@ -444,9 +445,9 @@ tf_cc_test(
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
-        "//tensorflow/compiler/xla/tools/parser:hlo_parser",
     ],
 )
 
@@ -989,9 +990,9 @@ tf_cc_test(
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
-        "//tensorflow/compiler/xla/tools/parser:hlo_parser",
         "//tensorflow/core:lib",
     ],
 )
@@ -1027,9 +1028,9 @@ tf_cc_test(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
-        "//tensorflow/compiler/xla/tools/parser:hlo_parser",
     ],
 )
 
@@ -1130,9 +1131,9 @@ tf_cc_test(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
-        "//tensorflow/compiler/xla/tools/parser:hlo_parser",
     ],
 )
 
@@ -1165,9 +1166,9 @@ tf_cc_test(
     deps = [
         ":hlo_matchers",
         ":instruction_fusion",
+        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
-        "//tensorflow/compiler/xla/tools/parser:hlo_parser",
     ],
 )
 
@@ -1339,9 +1340,9 @@ tf_cc_test(
     deps = [
         ":gather_expander",
         "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:test_macros_header",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",  # fixdeps: keep
-        "//tensorflow/compiler/xla/tools/parser:hlo_parser",
     ],
 )
 
@@ -1691,9 +1692,9 @@ tf_cc_test(
         ":cpu_plugin",
         ":hlo_cost_analysis",
         ":hlo_execution_profile",
+        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
-        "//tensorflow/compiler/xla/tools/parser:hlo_parser",
         "//tensorflow/core:lib",
     ],
 )
@@ -1874,9 +1875,9 @@ tf_cc_test(
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
-        "//tensorflow/compiler/xla/tools/parser:hlo_parser",
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
     ],
@@ -2211,11 +2212,11 @@ tf_cc_test(
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:test_utils",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
-        "//tensorflow/compiler/xla/tools/parser:hlo_parser",
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
     ],
@@ -2237,9 +2238,9 @@ tf_cc_test(
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:test_utils",
-        "//tensorflow/compiler/xla/tools/parser:hlo_parser",
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
     ],
@@ -2310,10 +2311,10 @@ tf_cc_test(
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:test_utils",
-        "//tensorflow/compiler/xla/tools/parser:hlo_parser",
         "//tensorflow/core:lib",
     ],
 )
@@ -2415,12 +2416,12 @@ tf_cc_test(
         ":hlo",
         ":hlo_domain_isolator",
         ":hlo_domain_remover",
+        ":hlo_parser",
         ":hlo_sharding_metadata",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla/legacy_flags:debug_options_flags",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
-        "//tensorflow/compiler/xla/tools/parser:hlo_parser",
         "//tensorflow/core:test",
     ],
 )
@@ -2506,10 +2507,10 @@ xla_test(
         "//tensorflow/compiler/xla:execution_options_util",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
-        "//tensorflow/compiler/xla/tools/parser:hlo_parser",
     ],
 )
 
@@ -2655,10 +2656,10 @@ tf_cc_test(
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client/xla_client:xla_builder",
+        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/service/gpu:ir_emission_utils",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
-        "//tensorflow/compiler/xla/tools/parser:hlo_parser",
         "//tensorflow/core:lib",
     ],
 )
@@ -2795,7 +2796,7 @@ cc_library(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/service:backend",
         "//tensorflow/compiler/xla/service:compiler",
-        "//tensorflow/compiler/xla/tools/parser:hlo_parser",
+        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
@@ -2831,8 +2832,8 @@ tf_cc_test(
         ":tuple_util",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla/service:hlo_matchers",
+        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
-        "//tensorflow/compiler/xla/tools/parser:hlo_parser",
     ],
 )
 
@@ -2857,8 +2858,8 @@ tf_cc_test(
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla/service:hlo_matchers",
+        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
-        "//tensorflow/compiler/xla/tools/parser:hlo_parser",
     ],
 )
 
@@ -2884,8 +2885,8 @@ tf_cc_test(
         ":hlo_matchers",
         ":while_loop_invariant_code_motion",
         "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
-        "//tensorflow/compiler/xla/tools/parser:hlo_parser",
         "//tensorflow/core:test",
     ],
 )
@@ -2911,8 +2912,8 @@ tf_cc_test(
         ":hlo_matchers",
         ":while_loop_constant_sinking",
         "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
-        "//tensorflow/compiler/xla/tools/parser:hlo_parser",
         "//tensorflow/core:test",
     ],
 )
@@ -2965,9 +2966,57 @@ tf_cc_test(
         ":hlo_matchers",
         ":indexed_array_analysis",
         "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:hlo_verified_test_base",
         "//tensorflow/compiler/xla/tests:test_utils",
-        "//tensorflow/compiler/xla/tools/parser:hlo_parser",
         "//tensorflow/core:test",
     ],
 )
+
+cc_library(
+    name = "hlo_parser",
+    srcs = ["hlo_parser.cc"],
+    hdrs = ["hlo_parser.h"],
+    deps = [
+        ":hlo",
+        ":hlo_lexer",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_cc_test(
+    name = "hlo_parser_test",
+    size = "small",
+    srcs = ["hlo_parser_test.cc"],
+    deps = [
+        ":hlo_parser",
+        "//tensorflow/compiler/xla:window_util",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",  # fixdeps: keep
+    ],
+)
+
+cc_library(
+    name = "hlo_lexer",
+    srcs = ["hlo_lexer.cc"],
+    hdrs = [
+        "hlo_lexer.h",
+        "hlo_token.h",
+    ],
+    deps = [
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:regexp_internal",
+    ],
+)
diff --git a/tensorflow/compiler/xla/service/buffer_assignment_test.cc b/tensorflow/compiler/xla/service/buffer_assignment_test.cc
index bdcea92882..7e86c33687 100644
--- a/tensorflow/compiler/xla/service/buffer_assignment_test.cc
+++ b/tensorflow/compiler/xla/service/buffer_assignment_test.cc
@@ -32,12 +32,12 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/hlo_ordering.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
 #include "tensorflow/compiler/xla/service/hlo_scheduling.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
-#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/platform/macros.h"
@@ -1793,7 +1793,7 @@ ENTRY %test_module {
 })";
 
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
-                          tools::Parse(module_str));
+                          ParseHloString(module_str));
 
   // Run CopyInsertion and check if the graph constructed above doesn't need
   // any copies inserted for BufferAssignment to run.
diff --git a/tensorflow/compiler/xla/service/cpu/BUILD b/tensorflow/compiler/xla/service/cpu/BUILD
index a15e41fee0..f10d71fdba 100644
--- a/tensorflow/compiler/xla/service/cpu/BUILD
+++ b/tensorflow/compiler/xla/service/cpu/BUILD
@@ -633,10 +633,10 @@ tf_cc_test(
     deps = [
         ":cpu_instruction_fusion",
         "//tensorflow/compiler/xla/service:hlo_matchers",
+        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/service:transpose_folding",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
-        "//tensorflow/compiler/xla/tools/parser:hlo_parser",
         "//tensorflow/core:lib",
     ],
 )
@@ -690,9 +690,9 @@ tf_cc_test(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_matchers",
+        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
-        "//tensorflow/compiler/xla/tools/parser:hlo_parser",
     ],
 )
 
@@ -942,7 +942,7 @@ tf_cc_test(
         ":ir_emission_utils",
         ":target_machine_features_fake",
         "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
-        "//tensorflow/compiler/xla/tools/parser:hlo_parser",
     ],
 )
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_eigen_tensor_alignment_test.cc b/tensorflow/compiler/xla/service/cpu/cpu_eigen_tensor_alignment_test.cc
index d12fa6bb9a..8727c72b6e 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_eigen_tensor_alignment_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_eigen_tensor_alignment_test.cc
@@ -16,8 +16,8 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/cpu/dot_op_emitter.h"
 #include "tensorflow/compiler/xla/service/cpu/ir_emission_utils.h"
 #include "tensorflow/compiler/xla/service/cpu/target_machine_features_fake.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
 #include "tensorflow/compiler/xla/test.h"
-#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
 
 namespace xla {
 namespace cpu {
@@ -40,7 +40,7 @@ ENTRY DotOperation {
 )";
 
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
-                          tools::Parse(hlo_string));
+                          ParseHloString(hlo_string));
 
   HloInstruction* dot = module->entry_computation()->root_instruction();
 
@@ -71,7 +71,7 @@ ENTRY ConvOperation {
 )";
 
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
-                          tools::Parse(hlo_string));
+                          ParseHloString(hlo_string));
 
   HloInstruction* conv = module->entry_computation()->root_instruction();
 
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc
index 46fe060817..97e10a89a2 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc
@@ -19,9 +19,9 @@ limitations under the License.
 #include <set>
 
 #include "tensorflow/compiler/xla/service/hlo_matchers.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
 #include "tensorflow/compiler/xla/service/transpose_folding.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
-#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
 
 namespace op = xla::testing::opcode_matchers;
@@ -172,7 +172,7 @@ ENTRY DotOperationFusion_TransposeFusion {
 )";
 
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
-                          tools::Parse(hlo_string));
+                          ParseHloString(hlo_string));
   HloComputation* computation = module->entry_computation();
 
   TransposeFolding transpose_folding(
@@ -202,7 +202,7 @@ ENTRY DotOperationFusion_TransposeFusion {
 )";
 
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
-                          tools::Parse(hlo_string));
+                          ParseHloString(hlo_string));
   HloComputation* computation = module->entry_computation();
 
   TransposeFolding transpose_folding(
@@ -233,7 +233,7 @@ ENTRY DotOperationFusion_TransposeFusion {
 )";
 
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
-                          tools::Parse(hlo_string));
+                          ParseHloString(hlo_string));
   HloComputation* computation = module->entry_computation();
 
   TransposeFolding transpose_folding(
@@ -775,7 +775,7 @@ TEST_P(GatherLoopFusionTest, GatherLoopFusion) {
   string hlo_string = tensorflow::strings::StrCat(
       "HloModule ", spec.test_name, "\n\n", spec.hlo_computation_text);
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
-                          tools::Parse(hlo_string));
+                          ParseHloString(hlo_string));
 
   RunFusionAndCheckOpcodesWereFused(
       module.get(),
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emission_utils_test.cc b/tensorflow/compiler/xla/service/cpu/ir_emission_utils_test.cc
index abb2471e6a..530ebce854 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emission_utils_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/ir_emission_utils_test.cc
@@ -16,8 +16,8 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/cpu/ir_emission_utils.h"
 
 #include "tensorflow/compiler/xla/service/cpu/target_machine_features_fake.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
 #include "tensorflow/compiler/xla/test.h"
-#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
 
 namespace xla {
 namespace {
@@ -35,7 +35,7 @@ ENTRY Conv {
 }
 )";
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
-                          tools::Parse(hlo_string));
+                          ParseHloString(hlo_string));
 
   HloComputation* entry_computation = module->entry_computation();
 
diff --git a/tensorflow/compiler/xla/service/cpu/tests/BUILD b/tensorflow/compiler/xla/service/cpu/tests/BUILD
index 67f776e7b5..66ae5ef0f6 100644
--- a/tensorflow/compiler/xla/service/cpu/tests/BUILD
+++ b/tensorflow/compiler/xla/service/cpu/tests/BUILD
@@ -152,9 +152,9 @@ tf_cc_test(
     srcs = ["cpu_literal_caching_test.cc"],
     deps = [
         "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/service/cpu:cpu_compiler",
         "//tensorflow/compiler/xla/service/cpu/tests:cpu_codegen_test",
-        "//tensorflow/compiler/xla/tools/parser:hlo_parser",
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
@@ -166,9 +166,9 @@ tf_cc_test(
     srcs = ["cpu_outfeed_test.cc"],
     deps = [
         "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/service/cpu:cpu_compiler",
         "//tensorflow/compiler/xla/service/cpu/tests:cpu_codegen_test",
-        "//tensorflow/compiler/xla/tools/parser:hlo_parser",
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_literal_caching_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_literal_caching_test.cc
index 3cb25c5c19..27044b1d62 100644
--- a/tensorflow/compiler/xla/service/cpu/tests/cpu_literal_caching_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_literal_caching_test.cc
@@ -15,7 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/cpu/cpu_compiler.h"
 #include "tensorflow/compiler/xla/service/cpu/tests/cpu_codegen_test.h"
-#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
 
 namespace xla {
 namespace cpu {
@@ -60,7 +60,7 @@ CHECK-NOT: private constant [12 x float]
 )";
 
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
-                          tools::Parse(hlo_text));
+                          ParseHloString(hlo_text));
 
   CpuAotCompilationOptions options{
       /*triple=*/"x86_64-pc-linux", /*cpu_name=*/"", /*features=*/"",
@@ -105,7 +105,7 @@ CHECK-NOT: private constant [2 x float]
 )";
 
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
-                          tools::Parse(hlo_text));
+                          ParseHloString(hlo_text));
 
   CpuAotCompilationOptions options{
       /*triple=*/"x86_64-pc-linux", /*cpu_name=*/"", /*features=*/"",
diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_outfeed_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_outfeed_test.cc
index 1a948fb4fe..1ee279290b 100644
--- a/tensorflow/compiler/xla/service/cpu/tests/cpu_outfeed_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_outfeed_test.cc
@@ -15,7 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/cpu/cpu_compiler.h"
 #include "tensorflow/compiler/xla/service/cpu/tests/cpu_codegen_test.h"
-#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
 
 namespace xla {
 namespace cpu {
@@ -41,7 +41,7 @@ CHECK: private constant [12 x float]
 )";
 
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
-                          tools::Parse(hlo_text));
+                          ParseHloString(hlo_text));
 
   CpuAotCompilationOptions options{
       /*triple=*/"x86_64-pc-linux", /*cpu_name=*/"", /*features=*/"",
diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter_test.cc b/tensorflow/compiler/xla/service/elemental_ir_emitter_test.cc
index b43dc0c65d..8980d43033 100644
--- a/tensorflow/compiler/xla/service/elemental_ir_emitter_test.cc
+++ b/tensorflow/compiler/xla/service/elemental_ir_emitter_test.cc
@@ -14,12 +14,12 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/xla/execution_options_util.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
-#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
 
 namespace xla {
 namespace {
@@ -33,7 +33,7 @@ class ElementalIrEmitterExecutionTest : public HloTestBase {
     HloModuleConfig config;
     config.set_debug_options(GetDebugOptionsForTest());
     TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
-                            tools::Parse(hlo_text, config));
+                            ParseHloString(hlo_text, config));
     EXPECT_TRUE(RunAndCompareNoHloPasses(std::move(module), args, nullopt));
   }
 };
diff --git a/tensorflow/compiler/xla/tools/parser/README.md b/tensorflow/compiler/xla/service/g3doc/hlo_parser.md
similarity index 100%
rename from tensorflow/compiler/xla/tools/parser/README.md
rename to tensorflow/compiler/xla/service/g3doc/hlo_parser.md
diff --git a/tensorflow/compiler/xla/service/gather_expander_test.cc b/tensorflow/compiler/xla/service/gather_expander_test.cc
index 1c72ca0665..020ffcd106 100644
--- a/tensorflow/compiler/xla/service/gather_expander_test.cc
+++ b/tensorflow/compiler/xla/service/gather_expander_test.cc
@@ -14,9 +14,9 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/xla/service/gather_expander.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
-#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
 
 namespace xla {
 namespace {
@@ -36,7 +36,7 @@ ENTRY main {
 }
 )";
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
-                          tools::Parse(hlo_text));
+                          ParseHloString(hlo_text));
 
   Status status = GatherExpander{}.Run(module.get()).status();
   EXPECT_EQ(status.code(), tensorflow::error::UNIMPLEMENTED);
@@ -63,7 +63,7 @@ ENTRY main {
 }
 )";
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
-                          tools::Parse(hlo_text));
+                          ParseHloString(hlo_text));
   TF_ASSERT_OK_AND_ASSIGN(bool changed, GatherExpander{}.Run(module.get()));
   ASSERT_TRUE(changed);
 
diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD
index 68297ad4ae..6bd9d4c31d 100644
--- a/tensorflow/compiler/xla/service/gpu/BUILD
+++ b/tensorflow/compiler/xla/service/gpu/BUILD
@@ -416,9 +416,9 @@ tf_cc_test(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla/service:hlo",
         "//tensorflow/compiler/xla/service:hlo_matchers",
+        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
-        "//tensorflow/compiler/xla/tools/parser:hlo_parser",
     ],
 )
 
@@ -460,9 +460,9 @@ tf_cc_test(
         ":instruction_fusion",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla/service:hlo_matchers",
+        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
-        "//tensorflow/compiler/xla/tools/parser:hlo_parser",
     ],
 )
 
diff --git a/tensorflow/compiler/xla/service/gpu/fusion_merger_test.cc b/tensorflow/compiler/xla/service/gpu/fusion_merger_test.cc
index 2217776c7d..b22bb1d39b 100644
--- a/tensorflow/compiler/xla/service/gpu/fusion_merger_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/fusion_merger_test.cc
@@ -17,9 +17,9 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/gpu/instruction_fusion.h"
 #include "tensorflow/compiler/xla/service/hlo_matchers.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
-#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
 
 namespace xla {
 namespace gpu {
@@ -40,7 +40,7 @@ class FusionMergerTest : public HloTestBase {};
 //                   Tuple
 //
 TEST_F(FusionMergerTest, MergeSharedFusionInstruction) {
-  auto module = tools::Parse(R"(
+  auto module = ParseHloString(R"(
 HloModule MergeSharedFusionInstruction
 
 comp.3 {
@@ -104,7 +104,7 @@ ENTRY MergeSharedFusionInstruction.Computation0 {
 //
 // Fusion2 is not merged because it exceeds the threshold flops-to-bytes ratio.
 TEST_F(FusionMergerTest, FlopsToBytesRatioThresholdExceeded) {
-  auto module = tools::Parse(R"(
+  auto module = ParseHloString(R"(
 HloModule FlopsToBytesRatioThresholdExceeded
 
 comp.2 {
@@ -162,7 +162,7 @@ ENTRY FlopsToBytesRatioThresholdExceeded.Computation1 {
 // is merged into Fusion0 and Fusion1) would exceed the bytes transferred
 // threshold.
 TEST_F(FusionMergerTest, BytesTransferredThresholdExeceeded) {
-  auto module = tools::Parse(R"(
+  auto module = ParseHloString(R"(
 HloModule BytesTransferredThresholdExeceeded
 
 comp.2 {
@@ -210,7 +210,7 @@ ENTRY BytesTransferredThresholdExeceeded.Computation2 {
 // Fusion2 is reduced for this test which makes the merge operation into its
 // operand below the bytes transferred threshold.
 TEST_F(FusionMergerTest, BytesTransferredThresholdNotExeceeded) {
-  auto module = tools::Parse(R"(
+  auto module = ParseHloString(R"(
 HloModule BytesTransferredThresholdNotExeceeded
 
 comp.2 {
@@ -253,7 +253,7 @@ ENTRY BytesTransferredThresholdNotExeceeded.Computation2 {
 // Check that we're willing to merge f1_computation into f2_computation, even
 // though f2 is an input fusion node.
 TEST_F(FusionMergerTest, WillMergeIntoInputFusion) {
-  auto module = tools::Parse(R"(
+  auto module = ParseHloString(R"(
     HloModule m
 
     f1_computation {
diff --git a/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc b/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc
index ec60f3a167..426b1d235c 100644
--- a/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc
@@ -17,9 +17,9 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
 #include "tensorflow/compiler/xla/service/hlo_matchers.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
-#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
 #include "tensorflow/compiler/xla/util.h"
 
 namespace op = xla::testing::opcode_matchers;
@@ -143,7 +143,7 @@ TEST_F(InstructionFusionTest, PotentialBitcastTransposeOfDotUnfused) {
 
 // Tests that broadcasts fused into a fusion with a reduce root.
 TEST_F(InstructionFusionTest, BroadcastIntoReduce) {
-  auto module = tools::Parse(R"(
+  auto module = ParseHloString(R"(
     HloModule test_module
 
     add {
@@ -172,7 +172,7 @@ TEST_F(InstructionFusionTest, BroadcastIntoReduce) {
 }
 
 TEST_F(InstructionFusionTest, BitcastIntoAdd) {
-  auto module = tools::Parse(R"(
+  auto module = ParseHloString(R"(
     HloModule test_module
 
     ENTRY BroadcastIntoAdd {
@@ -194,7 +194,7 @@ TEST_F(InstructionFusionTest, BitcastIntoAdd) {
 }
 
 TEST_F(InstructionFusionTest, AddIntoBitcast) {
-  auto module = tools::Parse(R"(
+  auto module = ParseHloString(R"(
     HloModule test_module
 
     ENTRY BroadcastIntoAdd {
@@ -216,7 +216,7 @@ TEST_F(InstructionFusionTest, AddIntoBitcast) {
 }
 
 TEST_F(InstructionFusionTest, DontFuseGTE) {
-  auto module = tools::Parse(R"(
+  auto module = ParseHloString(R"(
   HloModule test_module
   ENTRY DontFuseGTE {
     p0 = (f32[10], f32[10]) parameter(0)
@@ -232,7 +232,7 @@ TEST_F(InstructionFusionTest, DontFuseGTE) {
 }
 
 TEST_F(InstructionFusionTest, DotOutputFusion) {
-  auto module = tools::Parse(R"(
+  auto module = ParseHloString(R"(
   HloModule test_module
   ENTRY OutputFusion {
     alpha = f32[] constant(3)
@@ -261,7 +261,7 @@ TEST_F(InstructionFusionTest, DotOutputFusion) {
 // Compute sum(1/p0), where p0 has type f32, twice.  Check that the division is
 // duplicated and fused into both reduces.
 TEST_F(InstructionFusionTest, FloatingPointDivIsCheap) {
-  auto module = tools::Parse(R"(
+  auto module = ParseHloString(R"(
   HloModule test_module
   Add {
     lhs = f32[] parameter(0)
@@ -292,7 +292,7 @@ TEST_F(InstructionFusionTest, FloatingPointDivIsCheap) {
 // is *not* duplicated and fused into both reduces, because we say that integer
 // division is not cheap.
 TEST_F(InstructionFusionTest, IntegerDivIsNotCheap) {
-  auto module = tools::Parse(R"(
+  auto module = ParseHloString(R"(
   HloModule test_module
   Add {
     lhs = s32[] parameter(0)
@@ -317,7 +317,7 @@ TEST_F(InstructionFusionTest, IntegerDivIsNotCheap) {
 }
 
 TEST_F(InstructionFusionTest, DotOutputFusionImpossible) {
-  auto module = tools::Parse(R"(
+  auto module = ParseHloString(R"(
   HloModule test_module
   ENTRY NoOutputFusion {
     alpha = f32[] constant(3)
@@ -371,7 +371,7 @@ static StatusOr<const HloInstruction*> FindHloInstruction(
 TEST_F(InstructionFusionTest, MultiOutputFusion) {
   // sub --> add --> tuple
   //  \---------------/
-  auto module = tools::Parse(R"(
+  auto module = ParseHloString(R"(
     HloModule test_module
     ENTRY OutputFusion {
      p0 = f32[4,3]{1,0} parameter(0)
@@ -403,7 +403,7 @@ TEST_F(InstructionFusionTest, MultiOutputFusion) {
 TEST_F(InstructionFusionTest, MultiOutputFusionExpensiveOp) {
   // tanh --> add --> tuple
   //  \---------------/
-  auto module = tools::Parse(R"(
+  auto module = ParseHloString(R"(
     HloModule test_module
     ENTRY OutputFusion {
      p0 = f32[4,3]{1,0} parameter(0)
@@ -424,7 +424,7 @@ TEST_F(InstructionFusionTest, MultiOutputFusionExpensiveOp) {
 TEST_F(InstructionFusionTest, MultiOutputFusion2) {
   // sub --> add1 --\--------\
   //  \----------> add2 --> tuple
-  auto module = tools::Parse(R"(
+  auto module = ParseHloString(R"(
     HloModule test_module
     ENTRY OutputFusion {
      p0 = f32[4,3]{1,0} parameter(0)
@@ -457,7 +457,7 @@ TEST_F(InstructionFusionTest, MultiOutputFusion2) {
 TEST_F(InstructionFusionTest, MultiOutputFusion3) {
   // sub --> add1 ----\--------\
   //  \ --> add2 --> add3 --> tuple
-  auto module = tools::Parse(R"(
+  auto module = ParseHloString(R"(
     HloModule test_module
     ENTRY OutputFusion {
      p0 = f32[4,3]{1,0} parameter(0)
@@ -492,7 +492,7 @@ TEST_F(InstructionFusionTest, MultiOutputFusion3) {
 TEST_F(InstructionFusionTest, NoCyclesDueToMultiOutputFusion) {
   // sub --> mul ---\
   //  \--> call --> add --> tuple
-  auto module = tools::Parse(R"(
+  auto module = ParseHloString(R"(
   HloModule test_module
   ENTRY OutputFusion {
     c = f32[] constant(42)
@@ -527,7 +527,7 @@ TEST_F(InstructionFusionTest, NoCyclesDueToMultiOutputFusion) {
 TEST_F(InstructionFusionTest, NoMultiOutputFusionWithIncompatibleShapes) {
   // sub[2,3] --> add[4,3] --> tuple([2,3], [4,3])
   //  \-------------------------/
-  auto module = tools::Parse(R"(
+  auto module = ParseHloString(R"(
     HloModule test_module
     ENTRY OutputFusion {
      p0 = f32[2,3]{1,0} parameter(0)
@@ -548,7 +548,7 @@ TEST_F(InstructionFusionTest, NoMultiOutputFusionWithIncompatibleShapes) {
 }
 
 TEST_F(InstructionFusionTest, FuseIntoInputFusionInstruction) {
-  auto module = tools::Parse(R"(
+  auto module = ParseHloString(R"(
   HloModule test_module
 
   add_computation {
diff --git a/tensorflow/compiler/xla/service/gpu/while_transformer.cc b/tensorflow/compiler/xla/service/gpu/while_transformer.cc
index ad55728c45..7749201cbc 100644
--- a/tensorflow/compiler/xla/service/gpu/while_transformer.cc
+++ b/tensorflow/compiler/xla/service/gpu/while_transformer.cc
@@ -457,8 +457,8 @@ class WhileBodyComputationMatcher : public MatcherBase {
         return InvalidArgument("Unexpected tuple index instruction : %s",
                                inst->name().c_str());
       } else if (tag == "loop_increment") {
-        // Parse the constant which represents the loop induction variable
-        // increment value.
+        // Parse the constant which represents the loop induction variable
+        // increment value.
         TF_RETURN_IF_ERROR(ParseConstInteger(inst, &loop_increment_));
       } else if (tag == "param0" &&
                  inst != computation_->parameter_instruction(0)) {
diff --git a/tensorflow/compiler/xla/service/hlo_cse_test.cc b/tensorflow/compiler/xla/service/hlo_cse_test.cc
index e8c5ca347b..16db374566 100644
--- a/tensorflow/compiler/xla/service/hlo_cse_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_cse_test.cc
@@ -32,10 +32,10 @@ limitations under the License.
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/test_utils.h"
-#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -486,7 +486,7 @@ TEST_F(HloCseTest, DoNotCombineCallsToImpureFunctions) {
 }
 
 TEST_F(HloCseTest, CompareComputations) {
-  auto module = tools::Parse(R"(
+  auto module = ParseHloString(R"(
     HloModule m
 
     add_computation {
diff --git a/tensorflow/compiler/xla/service/hlo_domain_test.cc b/tensorflow/compiler/xla/service/hlo_domain_test.cc
index f29aac29c0..5553ddb153 100644
--- a/tensorflow/compiler/xla/service/hlo_domain_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_domain_test.cc
@@ -17,10 +17,10 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_domain_isolator.h"
 #include "tensorflow/compiler/xla/service/hlo_domain_metadata.h"
 #include "tensorflow/compiler/xla/service/hlo_domain_remover.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
 #include "tensorflow/compiler/xla/service/hlo_sharding_metadata.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
-#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 
 namespace xla {
@@ -68,7 +68,7 @@ class HloDomainTest : public HloTestBase {
       tensorflow::StringPiece hlo_string) {
     HloModuleConfig config;
     config.set_debug_options(legacy_flags::GetDebugOptionsFromFlags());
-    return tools::Parse(hlo_string, config);
+    return ParseHloString(hlo_string, config);
   }
 };
 
diff --git a/tensorflow/compiler/xla/service/hlo_execution_profile_test.cc b/tensorflow/compiler/xla/service/hlo_execution_profile_test.cc
index 4900c813fd..eba80c0f19 100644
--- a/tensorflow/compiler/xla/service/hlo_execution_profile_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_execution_profile_test.cc
@@ -15,8 +15,8 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/hlo_execution_profile.h"
 #include "tensorflow/compiler/xla/service/hlo_cost_analysis.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
-#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 
 namespace xla {
@@ -29,7 +29,7 @@ using ::testing::ContainsRegex;
 class HloExecutionProfileTest : public HloTestBase {};
 
 TEST_F(HloExecutionProfileTest, Basic) {
-  auto hlo_module = tools::Parse(R"(
+  auto hlo_module = ParseHloString(R"(
   HloModule test_module
   ENTRY entry_computation {
     lhs = f32[30,30]{1,0} parameter(0)
diff --git a/tensorflow/compiler/xla/service/hlo_instruction_test.cc b/tensorflow/compiler/xla/service/hlo_instruction_test.cc
index a1a8814384..313033ddad 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction_test.cc
@@ -24,11 +24,11 @@ limitations under the License.
 #include "tensorflow/compiler/xla/protobuf_util.h"
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
-#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/window_util.h"
 
@@ -1533,7 +1533,7 @@ ENTRY entry (param: s32[]) -> s32[] {
   // Check that deep clones really deep clones every instruction and
   // computations, without leaving dangling pointers to the old module.
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
-                          tools::Parse(hlo_string));
+                          ParseHloString(hlo_string));
   std::unique_ptr<HloModule> clone = module->Clone();
   for (HloComputation* computation : clone->computations()) {
     EXPECT_EQ(computation->parent(), clone.get());
diff --git a/tensorflow/compiler/xla/tools/parser/hlo_lexer.cc b/tensorflow/compiler/xla/service/hlo_lexer.cc
similarity index 95%
rename from tensorflow/compiler/xla/tools/parser/hlo_lexer.cc
rename to tensorflow/compiler/xla/service/hlo_lexer.cc
index 350db12653..f0d9fdbc8f 100644
--- a/tensorflow/compiler/xla/tools/parser/hlo_lexer.cc
+++ b/tensorflow/compiler/xla/service/hlo_lexer.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/xla/tools/parser/hlo_lexer.h"
+#include "tensorflow/compiler/xla/service/hlo_lexer.h"
 
 #include <unordered_map>
 
@@ -26,9 +26,8 @@ limitations under the License.
 #include "tensorflow/core/platform/regexp.h"
 
 namespace xla {
-namespace tools {
 
-using tensorflow::StringPiece;
+using ::tensorflow::StringPiece;
 
 namespace {
 
@@ -67,12 +66,12 @@ bool HloLexer::CanDereference(const char* ptr) const {
   return ptr < buf_.end() && ptr >= buf_.begin();
 }
 
-StringPiece HloLexer::StringPieceFromPointers(const char* begin,
-                                              const char* end) const {
+tensorflow::StringPiece HloLexer::StringPieceFromPointers(
+    const char* begin, const char* end) const {
   CHECK(begin <= end);
   CHECK(begin == buf_.end() || CanDereference(begin));
   CHECK(end == buf_.end() || CanDereference(end));
-  return StringPiece(begin, end - begin);
+  return tensorflow::StringPiece(begin, end - begin);
 }
 
 tensorflow::RegexpStringPiece HloLexer::RegexpStringPieceFromPointers(
@@ -197,7 +196,8 @@ TokKind HloLexer::LexIdentifier() {
     return TokKind::kAttributeName;
   }
 
-  StringPiece identifier = StringPieceFromPointers(token_start_, current_ptr_);
+  tensorflow::StringPiece identifier =
+      StringPieceFromPointers(token_start_, current_ptr_);
 
   // See if this is a keyword.
 #define KEYWORD(STR)            \
@@ -332,23 +332,24 @@ std::pair<unsigned, unsigned> HloLexer::GetLineAndColumn(LocTy location) const {
   line_no_cache_.last_query = ptr;
   line_no_cache_.line_no_of_query = line_no;
   size_t line_offset = StringPieceFromPointers(start, ptr).rfind('\n');
-  if (line_offset == StringPiece::npos) {
+  if (line_offset == tensorflow::StringPiece::npos) {
     line_offset = 0;
   }
   return {line_no, ptr - start - line_offset};
 }
 
-StringPiece HloLexer::GetLine(LocTy loc) const {
+tensorflow::StringPiece HloLexer::GetLine(LocTy loc) const {
   if (!CanDereference(loc)) {
     return "LINE OUT OF RANGE";
   }
   size_t line_start =
       StringPieceFromPointers(buf_.begin(), loc + 1).rfind('\n');
-  const char* start = line_start == StringPiece::npos
+  const char* start = line_start == tensorflow::StringPiece::npos
                           ? buf_.begin()
                           : buf_.begin() + line_start + 1;
   size_t line_end = StringPieceFromPointers(loc, buf_.end()).find('\n');
-  const char* end = line_end == StringPiece::npos ? buf_.end() : loc + line_end;
+  const char* end =
+      line_end == tensorflow::StringPiece::npos ? buf_.end() : loc + line_end;
 
   return StringPieceFromPointers(start, end);
 }
@@ -370,7 +371,7 @@ TokKind HloLexer::LexString() {
   static LazyRE2 escaping_pattern = {R"("([^"\\]|\\.)*")"};
   if (RE2::Consume(&consumable, *escaping_pattern)) {
     current_ptr_ = consumable.begin();
-    StringPiece raw =
+    tensorflow::StringPiece raw =
         StringPieceFromPointers(token_start_ + 1, current_ptr_ - 1);
     string error;
     if (!tensorflow::str_util::CUnescape(raw, &str_val_, &error)) {
@@ -453,5 +454,4 @@ string TokKindToString(TokKind kind) {
   }
 }
 
-}  // namespace tools
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tools/parser/hlo_lexer.h b/tensorflow/compiler/xla/service/hlo_lexer.h
similarity index 90%
rename from tensorflow/compiler/xla/tools/parser/hlo_lexer.h
rename to tensorflow/compiler/xla/service/hlo_lexer.h
index 27880b9b8a..ceb674f25e 100644
--- a/tensorflow/compiler/xla/tools/parser/hlo_lexer.h
+++ b/tensorflow/compiler/xla/service/hlo_lexer.h
@@ -13,12 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMPILER_XLA_TOOLS_PARSER_HLO_LEXER_H_
-#define TENSORFLOW_COMPILER_XLA_TOOLS_PARSER_HLO_LEXER_H_
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_LEXER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_LEXER_H_
 
 #include <string>
 
-#include "tensorflow/compiler/xla/tools/parser/hlo_token.h"
+#include "tensorflow/compiler/xla/service/hlo_token.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
@@ -27,9 +27,11 @@ limitations under the License.
 #include "tensorflow/core/platform/types.h"
 
 namespace xla {
-namespace tools {
 
 // Lexer for the HloModule::ToString() format text.
+//
+// This class is meant to be used by hlo_parser.cc.  You shouldn't need to use
+// it directly.
 class HloLexer {
  public:
   explicit HloLexer(tensorflow::StringPiece buf) : buf_(buf) {
@@ -57,7 +59,7 @@ class HloLexer {
     CHECK(GetKind() == TokKind::kShape);
     return shape_val_;
   }
-  int64 GetInt64Val() const {
+  tensorflow::int64 GetInt64Val() const {
     CHECK(GetKind() == TokKind::kInt);
     return int64_val_;
   }
@@ -114,7 +116,7 @@ class HloLexer {
   TokKind current_kind_;
   string str_val_;
   Shape shape_val_;
-  int64 int64_val_;
+  tensorflow::int64 int64_val_;
   double decimal_val_;
 
   struct LineNoCacheTy {
@@ -125,7 +127,6 @@ class HloLexer {
   mutable LineNoCacheTy line_no_cache_{nullptr, 0};
 };
 
-}  // namespace tools
 }  // namespace xla
 
-#endif  // TENSORFLOW_COMPILER_XLA_TOOLS_PARSER_HLO_LEXER_H_
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_LEXER_H_
diff --git a/tensorflow/compiler/xla/service/hlo_liveness_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_liveness_analysis_test.cc
index 8e2e2c7627..0275294a1a 100644
--- a/tensorflow/compiler/xla/service/hlo_liveness_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_liveness_analysis_test.cc
@@ -18,12 +18,12 @@ limitations under the License.
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
-#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -59,7 +59,7 @@ class HloLivenessAnalysisTest : public HloTestBase {
 
 // Test that add instruction at entry root is live at all output shape indices.
 TEST_F(HloLivenessAnalysisTest, AddAtEntryRoot) {
-  auto module = tools::Parse(R"(
+  auto module = ParseHloString(R"(
   HloModule SimpleModule
   ENTRY SimpleComputation {
     constant.1 = s32[] constant(0)
@@ -75,7 +75,7 @@ TEST_F(HloLivenessAnalysisTest, AddAtEntryRoot) {
 
 // Test that a dead add instruction is marked as dead by analysis.
 TEST_F(HloLivenessAnalysisTest, DeadAdd) {
-  auto module = tools::Parse(R"(
+  auto module = ParseHloString(R"(
   HloModule SimpleModule
   ENTRY SimpleComputation {
     constant.1 = s32[] constant(0)
@@ -94,7 +94,7 @@ TEST_F(HloLivenessAnalysisTest, DeadAdd) {
 // Test that all output shape indices of entry root tuple (and defining
 // instruction in its output) are marked live.
 TEST_F(HloLivenessAnalysisTest, TupleAtEntryRoot) {
-  auto module = tools::Parse(R"(
+  auto module = ParseHloString(R"(
   HloModule SimpleModule
   ENTRY SimpleComputation {
     constant.1 = s32[] constant(0)
@@ -113,7 +113,7 @@ TEST_F(HloLivenessAnalysisTest, TupleAtEntryRoot) {
 // Tests that all outputs of nested tuple and entry root (and defining
 // instruction values appearing in its output) are marked live.
 TEST_F(HloLivenessAnalysisTest, NestedTupleAtEntryRoot) {
-  auto module = tools::Parse(R"(
+  auto module = ParseHloString(R"(
   HloModule SimpleModule
   ENTRY SimpleComputation {
     constant.1 = s32[] constant(1)
@@ -140,7 +140,7 @@ TEST_F(HloLivenessAnalysisTest, NestedTupleAtEntryRoot) {
 // Tests that GTE at entry root of Tuple instruction only propgates liveness
 // to the live elements in tuple.
 TEST_F(HloLivenessAnalysisTest, GteOfTuple) {
-  auto module = tools::Parse(R"(
+  auto module = ParseHloString(R"(
   HloModule SimpleModule
   ENTRY SimpleComputation {
     constant.1 = s32[] constant(0)
@@ -162,7 +162,7 @@ TEST_F(HloLivenessAnalysisTest, GteOfTuple) {
 // Tests that GTE at entry root of nested Tuple instruction only propgates
 // liveness to the live elements in tuple.
 TEST_F(HloLivenessAnalysisTest, GteOfNestedTuple) {
-  auto module = tools::Parse(R"(
+  auto module = ParseHloString(R"(
   HloModule SimpleModule
   ENTRY SimpleComputation {
     constant.1 = s32[] constant(0)
@@ -199,7 +199,7 @@ TEST_F(HloLivenessAnalysisTest, GteOfNestedTuple) {
 // Tests that GTE of GTE (at entry root) of nested Tuple instruction only
 // propgates liveness to the live elements in tuple.
 TEST_F(HloLivenessAnalysisTest, GteOfGteOfNestedTuple) {
-  auto module = tools::Parse(R"(
+  auto module = ParseHloString(R"(
   HloModule SimpleModule
   ENTRY SimpleComputation {
     constant.1 = s32[] constant(0)
@@ -240,7 +240,7 @@ TEST_F(HloLivenessAnalysisTest, GteOfGteOfNestedTuple) {
 
 // Test that live/dead while tuple elements are marked live/dead correctly.
 TEST_F(HloLivenessAnalysisTest, WhileWithDeadTupleElement) {
-  auto module = tools::Parse(R"(
+  auto module = ParseHloString(R"(
   HloModule SimpleLoop
   SimpleLoop.body {
     loop_var.1 = (s32[], s32[3]{0}) parameter(0)
@@ -291,7 +291,7 @@ TEST_F(HloLivenessAnalysisTest, WhileWithDeadTupleElement) {
 // Tests that a tuple element live in while.cond computation, propagates
 // liveness to while.body.root/while.result/while.operand (where it is unused).
 TEST_F(HloLivenessAnalysisTest, WhileCondPropagatesLiveness) {
-  auto module = tools::Parse(R"(
+  auto module = ParseHloString(R"(
   HloModule SimpleLoop
   SimpleLoop.body {
     loop_var.1 = (s32[], s32[3]{0}) parameter(0)
@@ -345,7 +345,7 @@ TEST_F(HloLivenessAnalysisTest, WhileCondPropagatesLiveness) {
 // Tests that a use of while.result{0} propagates liveness to
 // while.body.param{1} to while.body.root{1}, and then to while.body.param{2}.
 TEST_F(HloLivenessAnalysisTest, WhileWithLiveTupleElements) {
-  auto module = tools::Parse(R"(
+  auto module = ParseHloString(R"(
   HloModule SimpleLoop
   SimpleLoop.body {
     loop_var.1 = (s32[], s32[], s32[]) parameter(0)
diff --git a/tensorflow/compiler/xla/service/hlo_matchers.h b/tensorflow/compiler/xla/service/hlo_matchers.h
index dfefad3634..c570b420c2 100644
--- a/tensorflow/compiler/xla/service/hlo_matchers.h
+++ b/tensorflow/compiler/xla/service/hlo_matchers.h
@@ -17,8 +17,8 @@ limitations under the License.
 #define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_MATCHERS_H_
 
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
 #include "tensorflow/compiler/xla/test.h"
-#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
 #include "tensorflow/core/lib/gtl/optional.h"
 
 namespace xla {
@@ -329,7 +329,7 @@ inline ::testing::Matcher<const ::xla::HloInstruction*> Sharding(
 inline ::testing::Matcher<const ::xla::HloInstruction*> Sharding(
     tensorflow::StringPiece sharding) {
   return ::testing::MakeMatcher(new ::xla::testing::HloShardingMatcher(
-      xla::tools::ParseSharding(sharding).ValueOrDie()));
+      ParseSharding(sharding).ValueOrDie()));
 }
 // Verifies that no HloSharding is set for an HLO instruction.
 inline ::testing::Matcher<const ::xla::HloInstruction*> NoSharding() {
diff --git a/tensorflow/compiler/xla/service/hlo_matchers_test.cc b/tensorflow/compiler/xla/service/hlo_matchers_test.cc
index 1d10e3c4fe..9a3010cf1f 100644
--- a/tensorflow/compiler/xla/service/hlo_matchers_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_matchers_test.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/xla/service/hlo_matchers.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 
 namespace op = xla::testing::opcode_matchers;
@@ -194,7 +195,7 @@ ENTRY DotOperationFusion_TransposeFusion {
 )";
 
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
-                          tools::Parse(hlo_string));
+                          ParseHloString(hlo_string));
   HloInstruction* root = module->entry_computation()->root_instruction();
 
   EXPECT_THAT(root, op::Dot(op::Parameter(0), op::Parameter(1),
diff --git a/tensorflow/compiler/xla/service/hlo_module_dce_test.cc b/tensorflow/compiler/xla/service/hlo_module_dce_test.cc
index 53b7d0ed39..363862e490 100644
--- a/tensorflow/compiler/xla/service/hlo_module_dce_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_module_dce_test.cc
@@ -19,11 +19,11 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/test_utils.h"
-#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/types.h"
@@ -73,7 +73,7 @@ class HloModuleDceTest : public HloTestBase {
 
 // Tests that a while with all outputs live is unmodified.
 TEST_F(HloModuleDceTest, WhileWithLiveOutputs) {
-  auto module = tools::Parse(R"(
+  auto module = ParseHloString(R"(
   HloModule SimpleLoop
   SimpleLoop.body {
     loop_var.1 = (s32[], s32[3]{0}) parameter(0)
@@ -110,7 +110,7 @@ TEST_F(HloModuleDceTest, WhileWithLiveOutputs) {
 // Tests a while loop with one unused output (which is used in the while loop
 // body by an instruction with side-effects: rng) is unmodified.
 TEST_F(HloModuleDceTest, WhileWithUnusedSideEffectingTupleElement) {
-  auto module = tools::Parse(R"(
+  auto module = ParseHloString(R"(
   HloModule SimpleLoop
   SimpleLoop.body {
     loop_var.1 = (s32[], f32[]) parameter(0)
@@ -150,7 +150,7 @@ TEST_F(HloModuleDceTest, WhileWithUnusedSideEffectingTupleElement) {
 // Tests that a while loop with one dead tuple element at {1} has its while
 // loop body modified to make that tuple element pass-through the while body.
 TEST_F(HloModuleDceTest, OneWhileWithDeadTupleElement) {
-  auto module = tools::Parse(R"(
+  auto module = ParseHloString(R"(
   HloModule SimpleLoop
   SimpleLoop.body {
     loop_var.1 = (s32[], s32[3]{0}) parameter(0)
@@ -193,7 +193,7 @@ TEST_F(HloModuleDceTest, OneWhileWithDeadTupleElement) {
 // dead in while.body{1} and at while.result{1}) propgates liveness of this
 // tuple element to while.body{1} and at while.result{1}.
 TEST_F(HloModuleDceTest, OneWhileWithTupleElementUsedByCond) {
-  auto module = tools::Parse(R"(
+  auto module = ParseHloString(R"(
   HloModule SimpleLoop
   SimpleLoop.body {
     loop_var.1 = (s32[], s32[]) parameter(0)
@@ -235,7 +235,7 @@ TEST_F(HloModuleDceTest, OneWhileWithTupleElementUsedByCond) {
 // Tests that HloModuleDCE can remove a dead tuple element at index {1} between
 // two dependent while loops.
 TEST_F(HloModuleDceTest, TwoWhilesWithDeadTupleElement) {
-  auto module = tools::Parse(R"(
+  auto module = ParseHloString(R"(
   HloModule SimpleLoop
   SimpleLoop.body0 {
     loop_var.1 = (s32[], s32[3]{0}) parameter(0)
@@ -303,7 +303,7 @@ TEST_F(HloModuleDceTest, TwoWhilesWithDeadTupleElement) {
 // Tests that HloModuleDCE can remove a dead tuple element at while.1{0} and
 // while.2{1}, between two dependent while loops.
 TEST_F(HloModuleDceTest, TwoWhilesWithDeadTupleElementSwizzled) {
-  auto module = tools::Parse(R"(
+  auto module = ParseHloString(R"(
   HloModule SimpleLoop
   SimpleLoop.body0 {
     loop_var.1 = (s32[3]{0}, s32[]) parameter(0)
diff --git a/tensorflow/compiler/xla/service/hlo_ordering_test.cc b/tensorflow/compiler/xla/service/hlo_ordering_test.cc
index 37a7fbad97..cfe5dace05 100644
--- a/tensorflow/compiler/xla/service/hlo_ordering_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_ordering_test.cc
@@ -22,10 +22,10 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_dataflow_analysis.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
 #include "tensorflow/compiler/xla/service/hlo_scheduling.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
-#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 
@@ -310,7 +310,7 @@ ENTRY while.v11 {
 })";
 
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
-                          tools::Parse(module_str));
+                          ParseHloString(module_str));
   DependencyHloOrdering ordering(module.get());
   ordering.ToString();  // Shouldn't crash.
 }
@@ -347,7 +347,7 @@ ENTRY root {
 })";
 
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
-                          tools::Parse(module_str));
+                          ParseHloString(module_str));
   TF_ASSERT_OK_AND_ASSIGN(auto dataflow,
                           HloDataflowAnalysis::Run(*module, /*ssa_form=*/true));
   DependencyHloOrdering ordering(module.get());
diff --git a/tensorflow/compiler/xla/tools/parser/hlo_parser.cc b/tensorflow/compiler/xla/service/hlo_parser.cc
similarity index 92%
rename from tensorflow/compiler/xla/tools/parser/hlo_parser.cc
rename to tensorflow/compiler/xla/service/hlo_parser.cc
index ef10ca4bff..cefc6ff915 100644
--- a/tensorflow/compiler/xla/tools/parser/hlo_parser.cc
+++ b/tensorflow/compiler/xla/service/hlo_parser.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
 
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
@@ -24,18 +24,17 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/stringprintf.h"
 
 namespace xla {
-namespace tools {
 
 namespace {
 
-using tensorflow::StringPiece;
-using tensorflow::gtl::optional;
-using tensorflow::str_util::Join;
-using tensorflow::str_util::Split;
-using tensorflow::str_util::SplitAndParseAsInts;
-using tensorflow::strings::Printf;
-using tensorflow::strings::StrAppend;
-using tensorflow::strings::StrCat;
+using ::tensorflow::StringPiece;
+using ::tensorflow::gtl::optional;
+using ::tensorflow::str_util::Join;
+using ::tensorflow::str_util::Split;
+using ::tensorflow::str_util::SplitAndParseAsInts;
+using ::tensorflow::strings::Printf;
+using ::tensorflow::strings::StrAppend;
+using ::tensorflow::strings::StrCat;
 
 const double kF16max = 65504;
 
@@ -83,11 +82,15 @@ class HloParser {
 
   // Sets the sub-value of literal at the given index to the given value. The
   // literal's shape must have the default layout.
-  bool SetValueInLiteral(int64 value, int64 linear_index, Literal* literal);
-  bool SetValueInLiteral(double value, int64 linear_index, Literal* literal);
-  bool SetValueInLiteral(bool value, int64 linear_index, Literal* literal);
+  bool SetValueInLiteral(tensorflow::int64 value,
+                         tensorflow::int64 linear_index, Literal* literal);
+  bool SetValueInLiteral(double value, tensorflow::int64 linear_index,
+                         Literal* literal);
+  bool SetValueInLiteral(bool value, tensorflow::int64 linear_index,
+                         Literal* literal);
   template <typename LiteralNativeT, typename ParsedElemT>
-  bool SetValueInLiteralHelper(ParsedElemT value, int64 linear_index,
+  bool SetValueInLiteralHelper(ParsedElemT value,
+                               tensorflow::int64 linear_index,
                                Literal* literal);
 
   bool ParseOperands(std::vector<HloInstruction*>* operands);
@@ -99,9 +102,9 @@ class HloParser {
   // Describes the start, limit, and stride on every dimension of the operand
   // being sliced.
   struct SliceRanges {
-    std::vector<int64> starts;
-    std::vector<int64> limits;
-    std::vector<int64> strides;
+    std::vector<tensorflow::int64> starts;
+    std::vector<tensorflow::int64> limits;
+    std::vector<tensorflow::int64> strides;
   };
 
   // Types of attributes.
@@ -179,13 +182,14 @@ class HloParser {
   bool ParseSingleSharding(OpSharding* sharding, bool lbrace_pre_lexed);
 
   // Parses a sub-attribute of the window attribute, e.g.,size=1x2x3.
-  bool ParseDxD(const string& name, std::vector<int64>* result);
+  bool ParseDxD(const string& name, std::vector<tensorflow::int64>* result);
   // Parses window's pad sub-attriute, e.g., pad=0_0x3x3.
-  bool ParseWindowPad(std::vector<std::vector<int64>>* pad);
+  bool ParseWindowPad(std::vector<std::vector<tensorflow::int64>>* pad);
 
   bool ParseSliceRanges(SliceRanges* result);
   bool ParseInt64List(const TokKind start, const TokKind end,
-                      const TokKind delim, std::vector<int64>* result);
+                      const TokKind delim,
+                      std::vector<tensorflow::int64>* result);
 
   bool ParseParamListToShape(Shape* shape, LocTy* shape_loc);
   bool ParseParamList();
@@ -197,7 +201,7 @@ class HloParser {
   bool ParseFftType(FftType* result);
   bool ParseFusionKind(HloInstruction::FusionKind* result);
   bool ParseRandomDistribution(RandomDistribution* result);
-  bool ParseInt64(int64* result);
+  bool ParseInt64(tensorflow::int64* result);
   bool ParseDouble(double* result);
   bool ParseBool(bool* result);
   bool ParseToken(TokKind kind, const string& msg);
@@ -455,7 +459,7 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
   HloInstruction* instruction;
   switch (opcode) {
     case HloOpcode::kParameter: {
-      int64 parameter_number;
+      tensorflow::int64 parameter_number;
       if (!ParseToken(TokKind::kLparen,
                       "expects '(' before parameter number") ||
           !ParseInt64(&parameter_number) ||
@@ -611,7 +615,7 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
       break;
     }
     case HloOpcode::kRecv: {
-      optional<int64> channel_id;
+      optional<tensorflow::int64> channel_id;
       attrs["channel_id"] = {/*required=*/true, AttrTy::kInt64, &channel_id};
       if (!ParseOperands(&operands, /*expected_size=*/0) ||
           !ParseAttributes(attrs)) {
@@ -622,7 +626,7 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
       break;
     }
     case HloOpcode::kRecvDone: {
-      optional<int64> channel_id;
+      optional<tensorflow::int64> channel_id;
       attrs["channel_id"] = {/*required=*/true, AttrTy::kInt64, &channel_id};
       if (!ParseOperands(&operands, /*expected_size=*/1) ||
           !ParseAttributes(attrs)) {
@@ -636,7 +640,7 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
       break;
     }
     case HloOpcode::kSend: {
-      optional<int64> channel_id;
+      optional<tensorflow::int64> channel_id;
       attrs["channel_id"] = {/*required=*/true, AttrTy::kInt64, &channel_id};
       if (!ParseOperands(&operands, /*expected_size=*/1) ||
           !ParseAttributes(attrs)) {
@@ -647,7 +651,7 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
       break;
     }
     case HloOpcode::kSendDone: {
-      optional<int64> channel_id;
+      optional<tensorflow::int64> channel_id;
       attrs["channel_id"] = {/*required=*/true, AttrTy::kInt64, &channel_id};
       if (!ParseOperands(&operands, /*expected_size=*/1) ||
           !ParseAttributes(attrs)) {
@@ -661,7 +665,7 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
       break;
     }
     case HloOpcode::kGetTupleElement: {
-      optional<int64> index;
+      optional<tensorflow::int64> index;
       attrs["index"] = {/*required=*/true, AttrTy::kInt64, &index};
       if (!ParseOperands(&operands, /*expected_size=*/1) ||
           !ParseAttributes(attrs)) {
@@ -719,7 +723,7 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
     }
     case HloOpcode::kFft: {
       optional<FftType> fft_type;
-      optional<std::vector<int64>> fft_length;
+      optional<std::vector<tensorflow::int64>> fft_length;
       attrs["fft_type"] = {/*required=*/true, AttrTy::kFftType, &fft_type};
       attrs["fft_length"] = {/*required=*/true, AttrTy::kBracedInt64List,
                              &fft_length};
@@ -732,7 +736,7 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
       break;
     }
     case HloOpcode::kBroadcast: {
-      optional<std::vector<int64>> broadcast_dimensions;
+      optional<std::vector<tensorflow::int64>> broadcast_dimensions;
       attrs["dimensions"] = {/*required=*/true, AttrTy::kBracedInt64List,
                              &broadcast_dimensions};
       if (!ParseOperands(&operands, /*expected_size=*/1) ||
@@ -744,7 +748,7 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
       break;
     }
     case HloOpcode::kConcatenate: {
-      optional<std::vector<int64>> dimensions;
+      optional<std::vector<tensorflow::int64>> dimensions;
       attrs["dimensions"] = {/*required=*/true, AttrTy::kBracedInt64List,
                              &dimensions};
       if (!ParseOperands(&operands) || !ParseAttributes(attrs) ||
@@ -770,7 +774,7 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
       optional<HloComputation*> reduce_computation;
       attrs["to_apply"] = {/*required=*/true, AttrTy::kHloComputation,
                            &reduce_computation};
-      optional<std::vector<int64>> dimensions_to_reduce;
+      optional<std::vector<tensorflow::int64>> dimensions_to_reduce;
       attrs["dimensions"] = {/*required=*/true, AttrTy::kBracedInt64List,
                              &dimensions_to_reduce};
       if (!ParseOperands(&operands, /*expected_size=*/2) ||
@@ -783,7 +787,7 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
       break;
     }
     case HloOpcode::kReverse: {
-      optional<std::vector<int64>> dimensions;
+      optional<std::vector<tensorflow::int64>> dimensions;
       attrs["dimensions"] = {/*required=*/true, AttrTy::kBracedInt64List,
                              &dimensions};
       if (!ParseOperands(&operands, /*expected_size=*/1) ||
@@ -827,7 +831,7 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
       break;
     }
     case HloOpcode::kDynamicSlice: {
-      optional<std::vector<int64>> dynamic_slice_sizes;
+      optional<std::vector<tensorflow::int64>> dynamic_slice_sizes;
       attrs["dynamic_slice_sizes"] = {
           /*required=*/true, AttrTy::kBracedInt64List, &dynamic_slice_sizes};
       if (!ParseOperands(&operands, /*expected_size=*/2) ||
@@ -851,7 +855,7 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
       break;
     }
     case HloOpcode::kTranspose: {
-      optional<std::vector<int64>> dimensions;
+      optional<std::vector<tensorflow::int64>> dimensions;
       attrs["dimensions"] = {/*required=*/true, AttrTy::kBracedInt64List,
                              &dimensions};
       if (!ParseOperands(&operands, /*expected_size=*/1) ||
@@ -865,7 +869,7 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
     case HloOpcode::kBatchNormTraining: {
       optional<float> epsilon;
       attrs["epsilon"] = {/*required=*/true, AttrTy::kFloat, &epsilon};
-      optional<int64> feature_index;
+      optional<tensorflow::int64> feature_index;
       attrs["feature_index"] = {/*required=*/true, AttrTy::kInt64,
                                 &feature_index};
       if (!ParseOperands(&operands, /*expected_size=*/3) ||
@@ -881,7 +885,7 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
     case HloOpcode::kBatchNormInference: {
       optional<float> epsilon;
       attrs["epsilon"] = {/*required=*/true, AttrTy::kFloat, &epsilon};
-      optional<int64> feature_index;
+      optional<tensorflow::int64> feature_index;
       attrs["feature_index"] = {/*required=*/true, AttrTy::kInt64,
                                 &feature_index};
       if (!ParseOperands(&operands, /*expected_size=*/5) ||
@@ -898,7 +902,7 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
     case HloOpcode::kBatchNormGrad: {
       optional<float> epsilon;
       attrs["epsilon"] = {/*required=*/true, AttrTy::kFloat, &epsilon};
-      optional<int64> feature_index;
+      optional<tensorflow::int64> feature_index;
       attrs["feature_index"] = {/*required=*/true, AttrTy::kInt64,
                                 &feature_index};
       if (!ParseOperands(&operands, /*expected_size=*/5) ||
@@ -969,8 +973,8 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
       break;
     }
     case HloOpcode::kReducePrecision: {
-      optional<int64> exponent_bits;
-      optional<int64> mantissa_bits;
+      optional<tensorflow::int64> exponent_bits;
+      optional<tensorflow::int64> mantissa_bits;
       attrs["exponent_bits"] = {/*required=*/true, AttrTy::kInt64,
                                 &exponent_bits};
       attrs["mantissa_bits"] = {/*required=*/true, AttrTy::kInt64,
@@ -1015,7 +1019,7 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
     }
     case HloOpcode::kHostCompute: {
       optional<string> channel_name;
-      optional<int64> cost_estimate_ns;
+      optional<tensorflow::int64> cost_estimate_ns;
       attrs["channel_name"] = {/*required=*/true, AttrTy::kString,
                                &channel_name};
       attrs["cost_estimate_ns"] = {/*required=*/true, AttrTy::kInt64,
@@ -1028,16 +1032,16 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
       break;
     }
     case HloOpcode::kDot: {
-      optional<std::vector<int64>> lhs_contracting_dims;
+      optional<std::vector<tensorflow::int64>> lhs_contracting_dims;
       attrs["lhs_contracting_dims"] = {
           /*required=*/false, AttrTy::kBracedInt64List, &lhs_contracting_dims};
-      optional<std::vector<int64>> rhs_contracting_dims;
+      optional<std::vector<tensorflow::int64>> rhs_contracting_dims;
       attrs["rhs_contracting_dims"] = {
           /*required=*/false, AttrTy::kBracedInt64List, &rhs_contracting_dims};
-      optional<std::vector<int64>> lhs_batch_dims;
+      optional<std::vector<tensorflow::int64>> lhs_batch_dims;
       attrs["lhs_batch_dims"] = {/*required=*/false, AttrTy::kBracedInt64List,
                                  &lhs_batch_dims};
-      optional<std::vector<int64>> rhs_batch_dims;
+      optional<std::vector<tensorflow::int64>> rhs_batch_dims;
       attrs["rhs_batch_dims"] = {/*required=*/false, AttrTy::kBracedInt64List,
                                  &rhs_batch_dims};
 
@@ -1069,20 +1073,20 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
       break;
     }
     case HloOpcode::kGather: {
-      optional<std::vector<int64>> output_window_dims;
+      optional<std::vector<tensorflow::int64>> output_window_dims;
       attrs["output_window_dims"] = {
           /*required=*/true, AttrTy::kBracedInt64List, &output_window_dims};
-      optional<std::vector<int64>> elided_window_dims;
+      optional<std::vector<tensorflow::int64>> elided_window_dims;
       attrs["elided_window_dims"] = {
           /*required=*/true, AttrTy::kBracedInt64List, &elided_window_dims};
-      optional<std::vector<int64>> gather_dims_to_operand_dims;
+      optional<std::vector<tensorflow::int64>> gather_dims_to_operand_dims;
       attrs["gather_dims_to_operand_dims"] = {/*required=*/true,
                                               AttrTy::kBracedInt64List,
                                               &gather_dims_to_operand_dims};
-      optional<int64> index_vector_dim;
+      optional<tensorflow::int64> index_vector_dim;
       attrs["index_vector_dim"] = {/*required=*/true, AttrTy::kInt64,
                                    &index_vector_dim};
-      optional<std::vector<int64>> window_bounds;
+      optional<std::vector<tensorflow::int64>> window_bounds;
       attrs["window_bounds"] = {/*required=*/true, AttrTy::kBracedInt64List,
                                 &window_bounds};
 
@@ -1178,8 +1182,8 @@ bool HloParser::ParseSingleSharding(OpSharding* sharding,
   LocTy loc = lexer_.GetLoc();
   bool maximal = false;
   bool replicated = false;
-  std::vector<int64> devices;
-  std::vector<int64> tile_assignment_dimensions;
+  std::vector<tensorflow::int64> devices;
+  std::vector<tensorflow::int64> tile_assignment_dimensions;
   Shape tile_shape;
   while (lexer_.GetKind() != TokKind::kRbrace) {
     switch (lexer_.GetKind()) {
@@ -1206,7 +1210,7 @@ bool HloParser::ParseSingleSharding(OpSharding* sharding,
           }
 
           do {
-            int64 dim;
+            tensorflow::int64 dim;
             if (!ParseInt64(&dim)) {
               return false;
             }
@@ -1218,7 +1222,7 @@ bool HloParser::ParseSingleSharding(OpSharding* sharding,
             return false;
           }
           do {
-            int64 device;
+            tensorflow::int64 device;
             if (!ParseInt64(&device)) {
               return false;
             }
@@ -1277,10 +1281,10 @@ bool HloParser::ParseSingleSharding(OpSharding* sharding,
     }
     sharding->set_type(OpSharding::Type::OpSharding_Type_OTHER);
     *sharding->mutable_tile_shape() = tile_shape;
-    for (int64 dim : tile_assignment_dimensions) {
+    for (tensorflow::int64 dim : tile_assignment_dimensions) {
       sharding->add_tile_assignment_dimensions(dim);
     }
-    for (int64 device : devices) {
+    for (tensorflow::int64 device : devices) {
       sharding->add_tile_assignment_devices(device);
     }
   }
@@ -1315,40 +1319,50 @@ bool HloParser::ParseInstructionNames(
                     "expects '}' at the end of instruction name list");
 }
 
-bool HloParser::SetValueInLiteral(int64 value, int64 linear_index,
+bool HloParser::SetValueInLiteral(tensorflow::int64 value,
+                                  tensorflow::int64 linear_index,
                                   Literal* literal) {
   const Shape& shape = literal->shape();
   switch (shape.element_type()) {
     case S8:
-      return SetValueInLiteralHelper<int8>(value, linear_index, literal);
+      return SetValueInLiteralHelper<tensorflow::int8>(value, linear_index,
+                                                       literal);
     case S16:
-      return SetValueInLiteralHelper<int16>(value, linear_index, literal);
+      return SetValueInLiteralHelper<tensorflow::int16>(value, linear_index,
+                                                        literal);
     case S32:
-      return SetValueInLiteralHelper<int32>(value, linear_index, literal);
+      return SetValueInLiteralHelper<tensorflow::int32>(value, linear_index,
+                                                        literal);
     case S64:
-      return SetValueInLiteralHelper<int64>(value, linear_index, literal);
+      return SetValueInLiteralHelper<tensorflow::int64>(value, linear_index,
+                                                        literal);
     case U8:
-      return SetValueInLiteralHelper<uint8>(value, linear_index, literal);
+      return SetValueInLiteralHelper<tensorflow::uint8>(value, linear_index,
+                                                        literal);
     case U16:
-      return SetValueInLiteralHelper<uint8>(value, linear_index, literal);
+      return SetValueInLiteralHelper<tensorflow::uint16>(value, linear_index,
+                                                         literal);
     case U32:
-      return SetValueInLiteralHelper<uint32>(value, linear_index, literal);
+      return SetValueInLiteralHelper<tensorflow::uint32>(value, linear_index,
+                                                         literal);
     case U64:
-      return SetValueInLiteralHelper<uint64>(value, linear_index, literal);
+      return SetValueInLiteralHelper<tensorflow::uint64>(value, linear_index,
+                                                         literal);
     default:
       LOG(FATAL) << "unknown integral primitive type "
                  << PrimitiveType_Name(shape.element_type());
   }
 }
 
-bool HloParser::SetValueInLiteral(double value, int64 linear_index,
+bool HloParser::SetValueInLiteral(double value, tensorflow::int64 linear_index,
                                   Literal* literal) {
   const Shape& shape = literal->shape();
   switch (shape.element_type()) {
     case F16:
-      return SetValueInLiteralHelper<half>(value, linear_index, literal);
+      return SetValueInLiteralHelper<Eigen::half>(value, linear_index, literal);
     case BF16:
-      return SetValueInLiteralHelper<bfloat16>(value, linear_index, literal);
+      return SetValueInLiteralHelper<tensorflow::bfloat16>(value, linear_index,
+                                                           literal);
     case F32:
       return SetValueInLiteralHelper<float>(value, linear_index, literal);
     case F64:
@@ -1359,7 +1373,7 @@ bool HloParser::SetValueInLiteral(double value, int64 linear_index,
   }
 }
 
-bool HloParser::SetValueInLiteral(bool value, int64 linear_index,
+bool HloParser::SetValueInLiteral(bool value, tensorflow::int64 linear_index,
                                   Literal* literal) {
   const Shape& shape = literal->shape();
   switch (shape.element_type()) {
@@ -1372,7 +1386,8 @@ bool HloParser::SetValueInLiteral(bool value, int64 linear_index,
 }
 
 template <typename LiteralNativeT, typename ParsedElemT>
-bool HloParser::SetValueInLiteralHelper(ParsedElemT value, int64 linear_index,
+bool HloParser::SetValueInLiteralHelper(ParsedElemT value,
+                                        tensorflow::int64 linear_index,
                                         Literal* literal) {
   // Check that linear_index is in range.
   if (linear_index >= ShapeUtil::ElementsIn(literal->shape())) {
@@ -1484,7 +1499,7 @@ bool HloParser::ParseNonTupleLiteral(std::unique_ptr<Literal>* literal,
 
 bool HloParser::ParseDenseLiteral(std::unique_ptr<Literal>* literal,
                                   const Shape& shape) {
-  const int64 rank = ShapeUtil::Rank(shape);
+  const tensorflow::int64 rank = ShapeUtil::Rank(shape);
   if (rank > 1 && !EatShapeAndCheckCompatible(shape)) {
     return false;
   }
@@ -1492,8 +1507,8 @@ bool HloParser::ParseDenseLiteral(std::unique_ptr<Literal>* literal,
   // Create a literal with the given shape in default layout.
   *literal = Literal::CreateFromDimensions(shape.element_type(),
                                            AsInt64Slice(shape.dimensions()));
-  int64 nest_level = 0;
-  int64 linear_index = 0;
+  tensorflow::int64 nest_level = 0;
+  tensorflow::int64 linear_index = 0;
   // elems_seen_per_dim[i] is how many elements or sub-arrays we have seen for
   // the dimension i. For example, to parse f32[2,3] {{1, 2, 3}, {4, 5, 6}},
   // when we are parsing the 2nd '{' (right before '1'), we are seeing a
@@ -1501,14 +1516,14 @@ bool HloParser::ParseDenseLiteral(std::unique_ptr<Literal>* literal,
   // the first '}' (right after '3'), it means the sub-array ends, and the
   // sub-array is supposed to contain exactly 3 elements, so check if
   // elems_seen_per_dim[1] is 3.
-  std::vector<int64> elems_seen_per_dim(rank);
+  std::vector<tensorflow::int64> elems_seen_per_dim(rank);
   auto get_index_str = [&elems_seen_per_dim](int dim) -> string {
-    std::vector<int64> elems_seen_until_dim(elems_seen_per_dim.begin(),
-                                            elems_seen_per_dim.begin() + dim);
+    std::vector<tensorflow::int64> elems_seen_until_dim(
+        elems_seen_per_dim.begin(), elems_seen_per_dim.begin() + dim);
     return StrCat("[",
                   Join(elems_seen_until_dim, ",",
-                       [](string* out, const int64& num_elems) {
-                         tensorflow::strings::StrAppend(out, num_elems - 1);
+                       [](string* out, const tensorflow::int64& num_elems) {
+                         StrAppend(out, num_elems - 1);
                        }),
                   "]");
   };
@@ -1584,7 +1599,7 @@ bool HloParser::ParseDenseLiteral(std::unique_ptr<Literal>* literal,
           lexer_.Lex();
         } else if (primitive_util::IsIntegralType(shape.element_type())) {
           LocTy loc = lexer_.GetLoc();
-          int64 value;
+          tensorflow::int64 value;
           if (!ParseInt64(&value)) {
             return Error(loc, StrCat("expects integer for primitive type: ",
                                      PrimitiveType_Name(shape.element_type())));
@@ -1624,29 +1639,29 @@ bool HloParser::ParseSparseLiteral(std::unique_ptr<Literal>* literal,
 
   switch (shape.element_type()) {
     case PRED:
-      return ParseSparseLiteralHelper<uint8>(literal, shape);
+      return ParseSparseLiteralHelper<tensorflow::uint8>(literal, shape);
     case S8:
-      return ParseSparseLiteralHelper<int8>(literal, shape);
+      return ParseSparseLiteralHelper<tensorflow::int8>(literal, shape);
     case S16:
-      return ParseSparseLiteralHelper<int16>(literal, shape);
+      return ParseSparseLiteralHelper<tensorflow::int16>(literal, shape);
     case S32:
-      return ParseSparseLiteralHelper<int32>(literal, shape);
+      return ParseSparseLiteralHelper<tensorflow::int32>(literal, shape);
     case S64:
-      return ParseSparseLiteralHelper<int64>(literal, shape);
+      return ParseSparseLiteralHelper<tensorflow::int64>(literal, shape);
     case U8:
-      return ParseSparseLiteralHelper<uint8>(literal, shape);
+      return ParseSparseLiteralHelper<tensorflow::uint8>(literal, shape);
     case U16:
-      return ParseSparseLiteralHelper<uint16>(literal, shape);
+      return ParseSparseLiteralHelper<tensorflow::uint16>(literal, shape);
     case U32:
-      return ParseSparseLiteralHelper<uint32>(literal, shape);
+      return ParseSparseLiteralHelper<tensorflow::uint32>(literal, shape);
     case U64:
-      return ParseSparseLiteralHelper<uint64>(literal, shape);
+      return ParseSparseLiteralHelper<tensorflow::uint64>(literal, shape);
     case F16:
-      return ParseSparseLiteralHelper<half>(literal, shape);
+      return ParseSparseLiteralHelper<Eigen::half>(literal, shape);
     case F32:
       return ParseSparseLiteralHelper<float>(literal, shape);
     case BF16:
-      return ParseSparseLiteralHelper<bfloat16>(literal, shape);
+      return ParseSparseLiteralHelper<tensorflow::bfloat16>(literal, shape);
     case F64:
       return ParseSparseLiteralHelper<double>(literal, shape);
     default:
@@ -1659,9 +1674,9 @@ bool HloParser::ParseSparseLiteral(std::unique_ptr<Literal>* literal,
 template <typename LiteralNativeT>
 bool HloParser::ParseSparseLiteralHelper(std::unique_ptr<Literal>* literal,
                                          const Shape& shape) {
-  std::vector<int64> index;
+  std::vector<tensorflow::int64> index;
 
-  int64 rank = ShapeUtil::Rank(shape);
+  tensorflow::int64 rank = ShapeUtil::Rank(shape);
 
   *literal = MakeUnique<Literal>(shape);
 
@@ -1679,7 +1694,7 @@ bool HloParser::ParseSparseLiteralHelper(std::unique_ptr<Literal>* literal,
     LocTy index_loc = lexer_.GetLoc();
     index.clear();
     if (lexer_.GetKind() == TokKind::kInt) {
-      int64 single_index = lexer_.GetInt64Val();
+      tensorflow::int64 single_index = lexer_.GetInt64Val();
       lexer_.Lex();
       if (rank != 1) {
         return Error(
@@ -1712,7 +1727,7 @@ bool HloParser::ParseSparseLiteralHelper(std::unique_ptr<Literal>* literal,
       value = static_cast<LiteralNativeT>(lexer_.GetKind() == TokKind::kw_true);
       lexer_.Lex();
     } else if (primitive_util::IsIntegralType(shape.element_type())) {
-      int64 value_s64;
+      tensorflow::int64 value_s64;
       if (!ParseInt64(&value_s64)) {
         return Error(value_loc,
                      StrCat("expects integer for primitive type: ",
@@ -1885,23 +1900,24 @@ bool HloParser::ParseAttributeHelper(
     LocTy attr_loc = lexer_.GetLoc();
     switch (attr_type) {
       case AttrTy::kInt64: {
-        int64 result;
+        tensorflow::int64 result;
         if (!ParseInt64(&result)) {
           return false;
         }
-        static_cast<optional<int64>*>(attr_out_ptr)->emplace(result);
+        static_cast<optional<tensorflow::int64>*>(attr_out_ptr)
+            ->emplace(result);
         return true;
       }
       case AttrTy::kInt32: {
-        int64 result;
+        tensorflow::int64 result;
         if (!ParseInt64(&result)) {
           return false;
         }
-        if (result != static_cast<int32>(result)) {
+        if (result != static_cast<tensorflow::int32>(result)) {
           return Error(attr_loc, "value out of range for int32");
         }
-        static_cast<optional<int32>*>(attr_out_ptr)
-            ->emplace(static_cast<int32>(result));
+        static_cast<optional<tensorflow::int32>*>(attr_out_ptr)
+            ->emplace(static_cast<tensorflow::int32>(result));
         return true;
       }
       case AttrTy::kFloat: {
@@ -1977,12 +1993,12 @@ bool HloParser::ParseAttributeHelper(
         return true;
       }
       case AttrTy::kBracedInt64List: {
-        std::vector<int64> result;
+        std::vector<tensorflow::int64> result;
         if (!ParseInt64List(TokKind::kLbrace, TokKind::kRbrace, TokKind::kComma,
                             &result)) {
           return false;
         }
-        static_cast<optional<std::vector<int64>>*>(attr_out_ptr)
+        static_cast<optional<std::vector<tensorflow::int64>>*>(attr_out_ptr)
             ->emplace(result);
         return true;
       }
@@ -2157,7 +2173,7 @@ bool HloParser::ParseConvolutionDimensionNumbers(
                << str;
   }
 
-  const int64 rank = lhs_rhs_out[0].length();
+  const tensorflow::int64 rank = lhs_rhs_out[0].length();
   if (rank != lhs_rhs_out[1].length() || rank != lhs_rhs_out[2].length()) {
     return TokenError(
         "convolution lhs, rhs, and output must have the same rank");
@@ -2271,7 +2287,7 @@ bool HloParser::ParseSliceRanges(SliceRanges* result) {
   if (!ParseToken(TokKind::kLbrace, "expects '{' to start ranges")) {
     return false;
   }
-  std::vector<std::vector<int64>> ranges;
+  std::vector<std::vector<tensorflow::int64>> ranges;
   if (lexer_.GetKind() == TokKind::kRbrace) {
     // empty
     return ParseToken(TokKind::kRbrace, "expects '}' to end ranges");
@@ -2305,7 +2321,7 @@ bool HloParser::ParseSliceRanges(SliceRanges* result) {
 //   ::= int64_val (delim int64_val)*
 bool HloParser::ParseInt64List(const TokKind start, const TokKind end,
                                const TokKind delim,
-                               std::vector<int64>* result) {
+                               std::vector<tensorflow::int64>* result) {
   if (!ParseToken(start, StrCat("expects an int64 list starting with ",
                                 TokKindToString(start)))) {
     return false;
@@ -2314,7 +2330,7 @@ bool HloParser::ParseInt64List(const TokKind start, const TokKind end,
     // empty
   } else {
     do {
-      int64 i;
+      tensorflow::int64 i;
       if (!ParseInt64(&i)) {
         return false;
       }
@@ -2431,7 +2447,8 @@ bool HloParser::ParseString(string* result) {
   return true;
 }
 
-bool HloParser::ParseDxD(const string& name, std::vector<int64>* result) {
+bool HloParser::ParseDxD(const string& name,
+                         std::vector<tensorflow::int64>* result) {
   LocTy loc = lexer_.GetLoc();
   if (!result->empty()) {
     return Error(loc,
@@ -2439,7 +2456,7 @@ bool HloParser::ParseDxD(const string& name, std::vector<int64>* result) {
   }
   // 1D
   if (lexer_.GetKind() == TokKind::kInt) {
-    int64 number;
+    tensorflow::int64 number;
     if (!ParseInt64(&number)) {
       return Error(loc, Printf("expects sub-attribute '%s=i'", name.c_str()));
     }
@@ -2459,7 +2476,8 @@ bool HloParser::ParseDxD(const string& name, std::vector<int64>* result) {
   return TokenError("expects token type kInt or kDxD");
 }
 
-bool HloParser::ParseWindowPad(std::vector<std::vector<int64>>* pad) {
+bool HloParser::ParseWindowPad(
+    std::vector<std::vector<tensorflow::int64>>* pad) {
   LocTy loc = lexer_.GetLoc();
   if (!pad->empty()) {
     return Error(loc, "sub-attribute 'pad=' already exists");
@@ -2470,7 +2488,7 @@ bool HloParser::ParseWindowPad(std::vector<std::vector<int64>>* pad) {
   string str = lexer_.GetStrVal();
   std::vector<string> padding_str = Split(str, 'x');
   for (int i = 0; i < padding_str.size(); i++) {
-    std::vector<int64> low_high;
+    std::vector<tensorflow::int64> low_high;
     if (!SplitAndParseAsInts(padding_str[i], '_', &low_high) ||
         low_high.size() != 2) {
       return Error(loc,
@@ -2494,7 +2512,7 @@ bool HloParser::ParsePaddingConfig(PaddingConfig* padding) {
   string str = lexer_.GetStrVal();
   std::vector<string> padding_str = Split(str, 'x');
   for (const auto& padding_dim_str : padding_str) {
-    std::vector<int64> padding_dim;
+    std::vector<tensorflow::int64> padding_dim;
     if (!SplitAndParseAsInts(padding_dim_str, '_', &padding_dim) ||
         (padding_dim.size() != 2 && padding_dim.size() != 3)) {
       return Error(loc,
@@ -2516,7 +2534,7 @@ bool HloParser::ParseMetadata(OpMetadata* metadata) {
   optional<string> op_type;
   optional<string> op_name;
   optional<string> source_file;
-  optional<int32> source_line;
+  optional<tensorflow::int32> source_line;
   attrs["op_type"] = {/*required=*/false, AttrTy::kString, &op_type};
   attrs["op_name"] = {/*required=*/false, AttrTy::kString, &op_name};
   attrs["source_file"] = {/*required=*/false, AttrTy::kString, &source_file};
@@ -2603,7 +2621,7 @@ bool HloParser::ParseRandomDistribution(RandomDistribution* result) {
   return true;
 }
 
-bool HloParser::ParseInt64(int64* result) {
+bool HloParser::ParseInt64(tensorflow::int64* result) {
   VLOG(1) << "ParseInt64";
   if (lexer_.GetKind() != TokKind::kInt) {
     return TokenError("expects integer");
@@ -2726,8 +2744,8 @@ HloParser::ParseConvolutionDimensionNumbersOnly() {
 
 }  // namespace
 
-StatusOr<std::unique_ptr<HloModule>> Parse(StringPiece str,
-                                           const HloModuleConfig& config) {
+StatusOr<std::unique_ptr<HloModule>> ParseHloString(
+    tensorflow::StringPiece str, const HloModuleConfig& config) {
   HloParser parser(str, config);
   if (!parser.Run()) {
     return InvalidArgument("Syntax error:\n%s", parser.GetError().c_str());
@@ -2735,9 +2753,10 @@ StatusOr<std::unique_ptr<HloModule>> Parse(StringPiece str,
   return parser.ConsumeHloModule();
 }
 
-StatusOr<std::unique_ptr<HloModule>> Parse(StringPiece str) {
+StatusOr<std::unique_ptr<HloModule>> ParseHloString(
+    tensorflow::StringPiece str) {
   HloModuleConfig config;
-  return Parse(str, config);
+  return ParseHloString(str, config);
 }
 
 StatusOr<HloSharding> ParseSharding(tensorflow::StringPiece str) {
@@ -2759,5 +2778,4 @@ StatusOr<ConvolutionDimensionNumbers> ParseConvolutionDimensionNumbers(
   return parser.ParseConvolutionDimensionNumbersOnly();
 }
 
-}  // namespace tools
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tools/parser/hlo_parser.h b/tensorflow/compiler/xla/service/hlo_parser.h
similarity index 70%
rename from tensorflow/compiler/xla/tools/parser/hlo_parser.h
rename to tensorflow/compiler/xla/service/hlo_parser.h
index 902c45cebc..3f3a51215e 100644
--- a/tensorflow/compiler/xla/tools/parser/hlo_parser.h
+++ b/tensorflow/compiler/xla/service/hlo_parser.h
@@ -13,28 +13,31 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMPILER_XLA_TOOLS_PARSER_HLO_PARSER_H_
-#define TENSORFLOW_COMPILER_XLA_TOOLS_PARSER_HLO_PARSER_H_
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_PARSER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_PARSER_H_
 
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_lexer.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/statusor.h"
-#include "tensorflow/compiler/xla/tools/parser/hlo_lexer.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 
 namespace xla {
-namespace tools {
+
+// For details about the syntax accepted by this parser, see
+// g3doc/hlo_parser.md.
 
 // The api of the hlo parser. Given a string in the HloModule::ToString()
 // format, parses the string and creates a HloModule with the given config.
-StatusOr<std::unique_ptr<HloModule>> Parse(tensorflow::StringPiece str,
-                                           const HloModuleConfig& config);
+StatusOr<std::unique_ptr<HloModule>> ParseHloString(
+    tensorflow::StringPiece str, const HloModuleConfig& config);
 
 // The api of the hlo parser. Given a string in the HloModule::ToString()
 // format, parses the string and creates a HloModule with default config.
-StatusOr<std::unique_ptr<HloModule>> Parse(tensorflow::StringPiece str);
+StatusOr<std::unique_ptr<HloModule>> ParseHloString(
+    tensorflow::StringPiece str);
 
 // Parses the result of HloSharding::ToString(), e.g. "{replicated}".
 StatusOr<HloSharding> ParseSharding(tensorflow::StringPiece str);
@@ -47,7 +50,10 @@ StatusOr<Window> ParseWindow(tensorflow::StringPiece str);
 StatusOr<ConvolutionDimensionNumbers> ParseConvolutionDimensionNumbers(
     tensorflow::StringPiece str);
 
-}  // namespace tools
+// Parses sharding from str. str is supposed to contain the body of the
+// sharding, i.e. just the rhs of the "sharding={...}" attribute string.
+StatusOr<HloSharding> ParseSharding(tensorflow::StringPiece str);
+
 }  // namespace xla
 
-#endif  // TENSORFLOW_COMPILER_XLA_TOOLS_PARSER_HLO_PARSER_H_
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_PARSER_H_
diff --git a/tensorflow/compiler/xla/tools/parser/hlo_parser_test.cc b/tensorflow/compiler/xla/service/hlo_parser_test.cc
similarity index 94%
rename from tensorflow/compiler/xla/tools/parser/hlo_parser_test.cc
rename to tensorflow/compiler/xla/service/hlo_parser_test.cc
index 3c5957b96a..9a18b4f845 100644
--- a/tensorflow/compiler/xla/tools/parser/hlo_parser_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_parser_test.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
 
 #include <string>
 #include "tensorflow/compiler/xla/window_util.h"
@@ -23,10 +23,10 @@ limitations under the License.
 #include "tensorflow/core/platform/test.h"
 
 namespace xla {
-namespace tools {
+
 namespace {
 
-using tensorflow::StringPiece;
+using ::tensorflow::StringPiece;
 
 struct TestData {
   string test_name;
@@ -901,12 +901,12 @@ class HloParserTest : public ::testing::Test,
         << "'" << s << "' does not contain '" << expected << "'";
   }
 
-  // Expects "ToString(Parse(string)) == string", that is, parses the string,
-  // asserts that it succeeded, stringifies the parsed module, and checks that
-  // the it equals the original string.
+  // Expects "ToString(ParseHloString(string)) == string", that is, parses the
+  // string, asserts that it succeeded, stringifies the parsed module, and
+  // checks that it equals the original string.
   void ExpectEqual() {
     const string& original = GetParam().module_string;
-    auto result = Parse(original);
+    auto result = ParseHloString(original);
     TF_ASSERT_OK(result.status());
     EXPECT_EQ(original, result.ValueOrDie()->ToString(
                             HloPrintOptions().set_print_large_constants(true)));
@@ -917,7 +917,7 @@ class HloParserShortTest : public HloParserTest {
  protected:
   void ExpectEqualShort() {
     const string& original = GetParam().module_string;
-    auto result = Parse(original);
+    auto result = ParseHloString(original);
     TF_ASSERT_OK(result.status());
     EXPECT_EQ(original,
               result.ValueOrDie()->ToString(HloPrintOptions::ShortParsable()));
@@ -938,13 +938,13 @@ INSTANTIATE_TEST_CASE_P(HloParserTestSuccessInstantiation, HloParserShortTest,
 
 TEST_F(HloParserTest, Empty) {
   const string original = "";
-  auto result = Parse(original);
+  auto result = ParseHloString(original);
   EXPECT_NE(Status::OK(), result.status());
 }
 
 TEST_F(HloParserTest, Garbage) {
   const string original = "HloModule thi$ str1ng makes# N0 sen$e @all!*&^%$";
-  auto result = Parse(original);
+  auto result = ParseHloString(original);
   EXPECT_NE(Status::OK(), result.status());
 }
 
@@ -958,7 +958,7 @@ ENTRY %blabla (x: f32[], y: f32[]) -> f32[] {
 }
 
 )";
-  auto result = Parse(original);
+  auto result = ParseHloString(original);
   EXPECT_NE(Status::OK(), result.status());
 }
 
@@ -970,7 +970,7 @@ ENTRY %blabla (x: g32[]) -> g32[] {
 }
 
 )";
-  auto result = Parse(original);
+  auto result = ParseHloString(original);
   EXPECT_NE(Status::OK(), result.status());
 }
 
@@ -983,7 +983,7 @@ ENTRY %blabla (x: f32[]) -> pred[] {
 }
 
 )";
-  auto result = Parse(original);
+  auto result = ParseHloString(original);
   EXPECT_NE(Status::OK(), result.status());
 }
 
@@ -994,7 +994,7 @@ ENTRY %blabla (x: f32[]) -> pred[] {
   %eq = pred[]{} equal-to(f32[]{} %x, f32[]{} %y)
 }
 )";
-  auto result = Parse(original);
+  auto result = ParseHloString(original);
   EXPECT_NE(Status::OK(), result.status());
 }
 
@@ -1009,7 +1009,7 @@ ENTRY %SelectScalarS32True.v4 () -> s32[] {
 }
 
 )";
-  auto result = Parse(original);
+  auto result = ParseHloString(original);
   TF_EXPECT_OK(result.status());
   // Constant instructions have no name. The string will be parsed successfully
   // but the constant names will not be exactly the same.
@@ -1020,7 +1020,7 @@ TEST_F(HloParserTest, ConfigurationField) {
 ENTRY %configuration_test() -> s32[] {
   %constant = s32[] constant(42), backend_config="foo bar"
 })";
-  auto result = Parse(original);
+  auto result = ParseHloString(original);
   TF_ASSERT_OK(result.status());
   EXPECT_EQ("foo bar", result.ValueOrDie()
                            ->entry_computation()
@@ -1036,7 +1036,7 @@ ENTRY %some_2 () -> f32[2] {
 }
 
 )";
-  auto result = Parse(original);
+  auto result = ParseHloString(original);
   EXPECT_NE(Status::OK(), result.status());
   ExpectHasSubstr(result.status().error_message(),
                   "expects nested array in rank 1, but sees larger");
@@ -1050,7 +1050,7 @@ ENTRY %some_2x3 () -> f32[2,3] {
 }
 
 )";
-  auto result = Parse(original);
+  auto result = ParseHloString(original);
   EXPECT_NE(Status::OK(), result.status());
   ExpectHasSubstr(result.status().error_message(),
                   "expects nested array in rank 2, but sees 1");
@@ -1064,7 +1064,7 @@ ENTRY %some_2x3x2 () -> f32[2,3,2] {
 }
 
 )";
-  auto result = Parse(original);
+  auto result = ParseHloString(original);
   EXPECT_NE(Status::OK(), result.status());
   ExpectHasSubstr(result.status().error_message(),
                   "expects 3 elements in the [0]th element");
@@ -1079,7 +1079,7 @@ ENTRY %ConstantF16Overflow.v4 () -> f16[] {
 }
 
 )";
-  auto result = Parse(original);
+  auto result = ParseHloString(original);
   EXPECT_NE(Status::OK(), result.status());
   ExpectHasSubstr(result.status().error_message(),
                   "is out of range for literal's primitive type F16");
@@ -1093,7 +1093,7 @@ ENTRY %ConstantWithExp.v4 () -> f32[] {
 }
 
 )";
-  auto result = Parse(original);
+  auto result = ParseHloString(original);
   TF_EXPECT_OK(result.status());
   // The string will be parsed successfully but the output strings are not
   // exactly the same, because "3e2" is parsed into value 300 and will be
@@ -1111,7 +1111,7 @@ ENTRY %Convolve1D1Window_0.v3 (input: f32[1,2,1], filter: f32[1,1,1]) -> f32[1,2
 }
 
 )";
-  TF_EXPECT_OK(Parse(original).status());
+  TF_EXPECT_OK(ParseHloString(original).status());
 }
 
 TEST_F(HloParserTest, InvalidDimLabels) {
@@ -1127,17 +1127,18 @@ ENTRY %Convolve1D1Window_0.v3 (input: f32[1,2,1], filter: f32[1,1,1]) -> f32[1,2
 
 )";
 
+  ExpectHasSubstr(ParseHloString(tensorflow::strings::StrCat(
+                                     prefix, ",dim_labels=00_01_10", suffix))
+                      .status()
+                      .error_message(),
+                  "expects dim labels pattern");
+
   ExpectHasSubstr(
-      Parse(tensorflow::strings::StrCat(prefix, ",dim_labels=00_01_10", suffix))
+      ParseHloString(tensorflow::strings::StrCat(
+                         prefix, ",dim_labels=010_1100->010", suffix))
           .status()
           .error_message(),
-      "expects dim labels pattern");
-
-  ExpectHasSubstr(Parse(tensorflow::strings::StrCat(
-                            prefix, ",dim_labels=010_1100->010", suffix))
-                      .status()
-                      .error_message(),
-                  "must have the same rank");
+      "must have the same rank");
 }
 
 TEST_F(HloParserTest, UnexpectedAttribute) {
@@ -1152,7 +1153,7 @@ ENTRY %TwoSendRecvBothWayRecvFist.v3 () -> f32[] {
 }
 
 )";
-  ExpectHasSubstr(Parse(original).status().error_message(),
+  ExpectHasSubstr(ParseHloString(original).status().error_message(),
                   "unexpected attribute \"calls\"");
 }
 
@@ -1168,7 +1169,7 @@ ENTRY %TwoSendRecvBothWayRecvFist.v3 () -> f32[] {
 }
 
 )";
-  ExpectHasSubstr(Parse(original).status().error_message(),
+  ExpectHasSubstr(ParseHloString(original).status().error_message(),
                   "attribute channel_id is expected but not seen");
 }
 
@@ -1184,7 +1185,7 @@ ENTRY %TwoSendRecvBothWayRecvFist.v3 () -> f32[] {
 }
 
 )";
-  ExpectHasSubstr(Parse(original).status().error_message(),
+  ExpectHasSubstr(ParseHloString(original).status().error_message(),
                   "'done' is not defined");
 }
 
@@ -1197,7 +1198,7 @@ ENTRY %slice.v2 (p0: f32[3,3,4,4]) -> f32[3,3,2,4] {
 }
 
 )";
-  TF_EXPECT_OK(Parse(original).status());
+  TF_EXPECT_OK(ParseHloString(original).status());
 }
 
 TEST_F(HloParserTest, PaddingConfigIsNotWindowPad) {
@@ -1211,7 +1212,7 @@ ENTRY %Convolve1D1Window_0.v3 (input: f32[1,2,1], filter: f32[1,1,1]) -> f32[1,2
 }
 
 )";
-  ExpectHasSubstr(Parse(original).status().error_message(),
+  ExpectHasSubstr(ParseHloString(original).status().error_message(),
                   "expects padding_low and padding_high separated by '_'");
 }
 
@@ -1223,7 +1224,7 @@ ENTRY %test_comma.v4 () -> f32[] {
 }
 
 )";
-  TF_EXPECT_OK(Parse(original).status());
+  TF_EXPECT_OK(ParseHloString(original).status());
 }
 
 TEST_F(HloParserTest, ComputationShapeDoesNotMatchRootShape) {
@@ -1233,7 +1234,7 @@ ENTRY %CustomCall () -> f32[1] {
   %constant = f32[1]{0} constant({12345})
   ROOT %foo = f32[1,2,3]{0,2,1} custom-call(f32[1]{0} %constant), custom_call_target="foo\"bar"
 })";
-  ExpectHasSubstr(Parse(original).status().error_message(),
+  ExpectHasSubstr(ParseHloString(original).status().error_message(),
                   "Shape of computation CustomCall, f32[1], is not compatible "
                   "with that of its root instruction foo, f32[1,2,3]");
 }
@@ -1252,7 +1253,7 @@ ENTRY %Reduce (input: f32[8,16,256]) -> f32[8,16] {
   ROOT reduce = f32[8,16]{0,1} reduce(input, constant), dimensions={2}, to_apply=add_F32.v3
 })";
 
-  auto module = Parse(original);
+  auto module = ParseHloString(original);
   TF_ASSERT_OK(module.status());
   auto program_layout = module.ValueOrDie()->host_entry_computation_layout();
   ASSERT_EQ(program_layout.parameter_count(), 1);
@@ -1275,7 +1276,7 @@ c1 {
 c2 {
   const2 = f32[1]{0} constant({67890})
 })";
-  auto module = Parse(original);
+  auto module = ParseHloString(original);
   TF_ASSERT_OK(module.status());
   EXPECT_EQ(module.ValueOrDie()->entry_computation()->name(), "c2");
 }
@@ -1286,7 +1287,7 @@ ENTRY consts {
   first = f32[1]{0} constant({12345})
   last = f32[1]{0} constant({67890})
 })";
-  auto module = Parse(original);
+  auto module = ParseHloString(original);
   TF_ASSERT_OK(module.status());
   EXPECT_EQ(
       module.ValueOrDie()->entry_computation()->root_instruction()->name(),
@@ -1301,7 +1302,7 @@ ENTRY c1 {
 ENTRY c2 {
   const2 = f32[1]{0} constant({67890})
 })";
-  ExpectHasSubstr(Parse(original).status().error_message(),
+  ExpectHasSubstr(ParseHloString(original).status().error_message(),
                   "expects only one ENTRY");
 }
 
@@ -1311,7 +1312,7 @@ ENTRY consts {
   ROOT const1 = f32[1]{0} constant({12345})
   ROOT const2 = f32[1]{0} constant({12345})
 })";
-  ExpectHasSubstr(Parse(original).status().error_message(),
+  ExpectHasSubstr(ParseHloString(original).status().error_message(),
                   "one computation should have only one ROOT");
 }
 
@@ -1323,7 +1324,7 @@ comp {
 comp {
   const2 = f32[1]{0} constant({67890})
 })";
-  ExpectHasSubstr(Parse(original).status().error_message(),
+  ExpectHasSubstr(ParseHloString(original).status().error_message(),
                   R"(was parsing 2:1: error: computation previously defined here
 comp {
 ^)");
@@ -1346,7 +1347,7 @@ ENTRY entry {
   ROOT call1 = s32[] call(param), to_apply=tcallb
 })";
   ExpectHasSubstr(
-      Parse(original).status().error_message(),
+      ParseHloString(original).status().error_message(),
       "was parsing 8:39: error: instruction does not exist: aparam");
 }
 
@@ -1371,5 +1372,4 @@ TEST_F(HloParserTest, ParseConvolutionDimensionNumbers) {
 }
 
 }  // namespace
-}  // namespace tools
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_runner.cc b/tensorflow/compiler/xla/service/hlo_runner.cc
index 31e13da0c0..e1f9d8efd4 100644
--- a/tensorflow/compiler/xla/service/hlo_runner.cc
+++ b/tensorflow/compiler/xla/service/hlo_runner.cc
@@ -22,9 +22,9 @@ limitations under the License.
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
 #include "tensorflow/compiler/xla/service/transfer_manager.h"
 #include "tensorflow/compiler/xla/shape_util.h"
-#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
 #include "tensorflow/core/common_runtime/eigen_thread_pool.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
@@ -36,7 +36,7 @@ HloRunner::CreateModuleFromString(const tensorflow::StringPiece hlo_string,
                                   const DebugOptions& debug_options) {
   HloModuleConfig config;
   config.set_debug_options(debug_options);
-  return tools::Parse(hlo_string, config);
+  return ParseHloString(hlo_string, config);
 }
 
 namespace {
@@ -80,7 +80,7 @@ HloRunner::ReadModuleFromHloTextFile(const std::string& filename,
                                                   filename, &hlo_string));
   HloModuleConfig config;
   config.set_debug_options(debug_options);
-  return tools::Parse(hlo_string, config);
+  return ParseHloString(hlo_string, config);
 }
 
 HloRunner::HloRunner(se::Platform* platform) {
diff --git a/tensorflow/compiler/xla/service/hlo_scheduling_test.cc b/tensorflow/compiler/xla/service/hlo_scheduling_test.cc
index 0bc930f9ea..db7ef6f0d4 100644
--- a/tensorflow/compiler/xla/service/hlo_scheduling_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_scheduling_test.cc
@@ -22,9 +22,9 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/hlo_ordering.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
-#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 
@@ -158,7 +158,7 @@ ENTRY root {
 })";
 
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
-                          tools::Parse(module_str));
+                          ParseHloString(module_str));
 
   auto size_fn = [](const BufferValue& buffer) {
     return ShapeUtil::ByteSizeOf(buffer.shape(), /*pointer_size=*/8);
diff --git a/tensorflow/compiler/xla/service/hlo_sharding_test.cc b/tensorflow/compiler/xla/service/hlo_sharding_test.cc
index 94d1a3226b..ee7133689b 100644
--- a/tensorflow/compiler/xla/service/hlo_sharding_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_sharding_test.cc
@@ -19,11 +19,11 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
-#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
 #include "tensorflow/compiler/xla/util.h"
 
 namespace xla {
@@ -311,10 +311,10 @@ TEST_F(HloShardingTest, OstreamTest) {
   EXPECT_EQ(oss.str(), "{f32[3,5,7,11] devices=[1,1,2,2]0,1,2,3}");
 }
 
-TEST_F(HloShardingTest, Parse) {
+TEST_F(HloShardingTest, ParseHloString) {
   auto check = [](const HloSharding& sharding) {
     TF_ASSERT_OK_AND_ASSIGN(auto parsed_sharding,
-                            tools::ParseSharding(sharding.ToString()));
+                            ParseSharding(sharding.ToString()));
     EXPECT_EQ(sharding, parsed_sharding);
   };
   check(HloSharding::Replicate());
diff --git a/tensorflow/compiler/xla/tools/parser/hlo_token.h b/tensorflow/compiler/xla/service/hlo_token.h
similarity index 84%
rename from tensorflow/compiler/xla/tools/parser/hlo_token.h
rename to tensorflow/compiler/xla/service/hlo_token.h
index 7928bee5c2..533429608b 100644
--- a/tensorflow/compiler/xla/tools/parser/hlo_token.h
+++ b/tensorflow/compiler/xla/service/hlo_token.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMPILER_XLA_TOOLS_PARSER_HLO_TOKEN_H_
-#define TENSORFLOW_COMPILER_XLA_TOOLS_PARSER_HLO_TOKEN_H_
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_TOKEN_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_TOKEN_H_
 
 #include <string>
 
@@ -22,9 +22,11 @@ limitations under the License.
 #include "tensorflow/core/platform/types.h"
 
 namespace xla {
-namespace tools {
 
 // Defines different kinds of tokens in a hlo module string.
+//
+// You shouldn't need to use this directly unless you're using HloLexer
+// directly, and you probably don't need to do that.  Use hlo_parser instead.
 enum class TokKind {
   // Markers
   kEof,
@@ -72,7 +74,6 @@ enum class TokKind {
 
 string TokKindToString(TokKind kind);
 
-}  // namespace tools
 }  // namespace xla
 
-#endif  // TENSORFLOW_COMPILER_XLA_TOOLS_PARSER_HLO_TOKEN_H_
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_TOKEN_H_
diff --git a/tensorflow/compiler/xla/service/instruction_fusion_test.cc b/tensorflow/compiler/xla/service/instruction_fusion_test.cc
index df109df787..21db233899 100644
--- a/tensorflow/compiler/xla/service/instruction_fusion_test.cc
+++ b/tensorflow/compiler/xla/service/instruction_fusion_test.cc
@@ -16,8 +16,8 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/instruction_fusion.h"
 
 #include "tensorflow/compiler/xla/service/hlo_matchers.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
-#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
 
 namespace xla {
 
@@ -47,7 +47,7 @@ class InstructionFusionForTesting : public InstructionFusion {
 };
 
 TEST_F(InstructionFusionTest, FuseInstructions) {
-  auto module = tools::Parse(R"(
+  auto module = ParseHloString(R"(
   HloModule test_module
   ENTRY entry_computation {
     p0 = f32[4,3]{1,0} parameter(0)
@@ -67,7 +67,7 @@ TEST_F(InstructionFusionTest, FuseInstructions) {
 }
 
 TEST_F(InstructionFusionTest, FuseIntoFusionInstruction) {
-  auto module = tools::Parse(R"(
+  auto module = ParseHloString(R"(
   HloModule test_module
   fused_computation {
     p1 = f32[4,3] parameter(0)
@@ -90,7 +90,7 @@ TEST_F(InstructionFusionTest, FuseIntoFusionInstruction) {
 }
 
 TEST_F(InstructionFusionTest, FuseInstructionsIntoMultiOutput) {
-  auto module = tools::Parse(R"(
+  auto module = ParseHloString(R"(
   HloModule test_module
   ENTRY entry_computation {
     p0 = f32[4,3]{1,0} parameter(0)
@@ -195,7 +195,7 @@ static int Count(const HloModule& module, HloOpcode op) {
 }
 
 TEST_F(InstructionFusionTest, FuseCheapNonDuplicatableOps) {
-  auto module = tools::Parse(R"(
+  auto module = ParseHloString(R"(
   HloModule test_module
   ENTRY OutputFusion {
     p0 = f32[4,3]{1,0} parameter(0)
@@ -220,7 +220,7 @@ TEST_F(InstructionFusionTest, AvoidDuplicationIfNotAllFusableRecursively) {
   //
   // p0 -> add -------------------------> sub
   //           \-> abs1 -> rng -> abs2 -/
-  auto module = tools::Parse(R"(
+  auto module = ParseHloString(R"(
   HloModule test_module
   ENTRY OutputFusion {
     p0 = f32[4,3]{1,0} parameter(0)
@@ -251,7 +251,7 @@ TEST_F(InstructionFusionTest, AvoidDuplicationIfNotAllFusableRecursively) {
   // p0 -> add -------------------------> sub
   //           \-> abs1 -> log -> abs2 -/
   //                           \-> send
-  module = tools::Parse(R"(
+  module = ParseHloString(R"(
   HloModule test_module
   ENTRY OutputFusion {
     p0 = f32[4,3]{1,0} parameter(0)
@@ -282,7 +282,7 @@ TEST_F(InstructionFusionTest, AvoidDuplicationIfNotAllFusableRecursively) {
   //    \         \-> add2 -/
   //     \-> log -/
   //             \-> send
-  module = tools::Parse(R"(
+  module = ParseHloString(R"(
   HloModule test_module
   ENTRY OutputFusion {
     p0 = f32[4,3]{1,0} parameter(0)
@@ -314,7 +314,7 @@ TEST_F(InstructionFusionTest, AvoidDuplicationIfNotAllFusableRecursively) {
   //                       \------> sub1
   //                        log -/
   //                            \-> send
-  module = tools::Parse(R"(
+  module = ParseHloString(R"(
   HloModule test_module
   ENTRY OutputFusion {
     p0 = f32[4,3]{1,0} parameter(0)
@@ -390,7 +390,7 @@ TEST_F(InstructionFusionTest, AllowEffectiveUnaryDuplication) {
 
 TEST_F(InstructionFusionTest,
        WideningConvertsAreAlwaysDuplicableIntoConsumers) {
-  auto module = tools::Parse(R"(
+  auto module = ParseHloString(R"(
   HloModule test_module
   ENTRY Test {
     p0 = f16[100] parameter(0)
diff --git a/tensorflow/compiler/xla/service/layout_assignment_test.cc b/tensorflow/compiler/xla/service/layout_assignment_test.cc
index 7508013199..bf0448a676 100644
--- a/tensorflow/compiler/xla/service/layout_assignment_test.cc
+++ b/tensorflow/compiler/xla/service/layout_assignment_test.cc
@@ -29,13 +29,13 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_matchers.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
 #include "tensorflow/compiler/xla/shape_layout.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/tests/test_utils.h"
-#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/lib/core/status.h"
@@ -651,7 +651,7 @@ TEST_F(LayoutAssignmentTest, TransposeWithinFusionDoesNotCrash) {
     }
   )";
 
-  auto module = tools::Parse(module_str).ValueOrDie();
+  auto module = ParseHloString(module_str).ValueOrDie();
 
   module =
       backend()
@@ -691,7 +691,7 @@ TEST_F(LayoutAssignmentTest, GTEInheritsLayoutFromOperand) {
     }
   )";
 
-  auto module = tools::Parse(module_str).ValueOrDie();
+  auto module = ParseHloString(module_str).ValueOrDie();
   ComputationLayout computation_layout(
       module->entry_computation()->ComputeProgramShape());
   Shape param_shape = ShapeUtil::MakeTupleShape(
diff --git a/tensorflow/compiler/xla/service/pattern_matcher_test.cc b/tensorflow/compiler/xla/service/pattern_matcher_test.cc
index 204e8c9920..fef3c132b0 100644
--- a/tensorflow/compiler/xla/service/pattern_matcher_test.cc
+++ b/tensorflow/compiler/xla/service/pattern_matcher_test.cc
@@ -16,7 +16,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/pattern_matcher.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
-#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace xla {
@@ -29,7 +29,7 @@ TEST(PatternMatcherTest, AddOp) {
       ROOT %two_plus_two = f32[] add(f32[] %two, f32[] %two)
     }
   )";
-  TF_ASSERT_OK_AND_ASSIGN(auto hlo_module, tools::Parse(kModuleStr));
+  TF_ASSERT_OK_AND_ASSIGN(auto hlo_module, ParseHloString(kModuleStr));
 
   const HloInstruction* matched_inst;
   HloInstruction* matched_operand;
@@ -182,7 +182,7 @@ TEST(PatternMatcherTest, FusionKind) {
       p0 = f32[] parameter(0)
       ROOT fusion = f32[] fusion(p0), kind=kLoop, calls=fused_computation
     })";
-  TF_ASSERT_OK_AND_ASSIGN(auto hlo_module, tools::Parse(kModuleStr));
+  TF_ASSERT_OK_AND_ASSIGN(auto hlo_module, ParseHloString(kModuleStr));
 
   auto* root = hlo_module->entry_computation()->root_instruction();
   EXPECT_TRUE(Match(
diff --git a/tensorflow/compiler/xla/service/transpose_folding_test.cc b/tensorflow/compiler/xla/service/transpose_folding_test.cc
index f73f1227aa..3139801ea3 100644
--- a/tensorflow/compiler/xla/service/transpose_folding_test.cc
+++ b/tensorflow/compiler/xla/service/transpose_folding_test.cc
@@ -27,12 +27,12 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_matchers.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
 #include "tensorflow/compiler/xla/service/shape_inference.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
-#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/platform/logging.h"
 
@@ -69,7 +69,7 @@ ENTRY entry_computation {
 }
 )";
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
-                          tools::Parse(hlo_string));
+                          ParseHloString(hlo_string));
 
   FoldTranspose(module.get());
 
@@ -91,7 +91,7 @@ ENTRY entry_computation {
 )";
 
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
-                          tools::Parse(hlo_string));
+                          ParseHloString(hlo_string));
 
   TransposeFolding transpose_folding(
       [](const HloInstruction& dot,
@@ -119,7 +119,7 @@ ENTRY entry_computation {
 )";
 
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
-                          tools::Parse(hlo_string));
+                          ParseHloString(hlo_string));
 
   TransposeFolding transpose_folding(
       [](const HloInstruction& dot,
@@ -147,7 +147,7 @@ ENTRY entry_computation {
 }
 )";
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
-                          tools::Parse(hlo_string));
+                          ParseHloString(hlo_string));
 
   FoldTranspose(module.get());
 
@@ -205,7 +205,7 @@ ENTRY entry_computation {
 }
 )";
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
-                          tools::Parse(hlo_string));
+                          ParseHloString(hlo_string));
   FoldTranspose(module.get());
 
   const HloComputation* callee = module->GetComputationWithName("callee");
diff --git a/tensorflow/compiler/xla/service/tuple_util_test.cc b/tensorflow/compiler/xla/service/tuple_util_test.cc
index 754fd8ef16..d33d5bb8f3 100644
--- a/tensorflow/compiler/xla/service/tuple_util_test.cc
+++ b/tensorflow/compiler/xla/service/tuple_util_test.cc
@@ -16,8 +16,8 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/tuple_util.h"
 
 #include "tensorflow/compiler/xla/service/hlo_matchers.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
 #include "tensorflow/compiler/xla/test.h"
-#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
 
 namespace xla {
 namespace {
@@ -37,7 +37,7 @@ ENTRY entry {
 )";
 
   TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModule> module,
-                      tools::Parse(hlo_string));
+                      ParseHloString(hlo_string));
 
   *entry_computation = module->entry_computation();
   *param0 = (*entry_computation)->parameter_instruction(0);
diff --git a/tensorflow/compiler/xla/service/while_loop_constant_sinking_test.cc b/tensorflow/compiler/xla/service/while_loop_constant_sinking_test.cc
index 0d2288d8ea..393e758038 100644
--- a/tensorflow/compiler/xla/service/while_loop_constant_sinking_test.cc
+++ b/tensorflow/compiler/xla/service/while_loop_constant_sinking_test.cc
@@ -16,8 +16,8 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/while_loop_constant_sinking.h"
 
 #include "tensorflow/compiler/xla/service/hlo_matchers.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
 #include "tensorflow/compiler/xla/test.h"
-#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 
 namespace xla {
@@ -55,7 +55,7 @@ ENTRY entry {
 )";
 
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
-                          tools::Parse(hlo_string));
+                          ParseHloString(hlo_string));
 
   TF_ASSERT_OK_AND_ASSIGN(bool changed,
                           WhileLoopConstantSinking{}.Run(module.get()));
@@ -95,7 +95,7 @@ ENTRY entry {
 )";
 
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
-                          tools::Parse(hlo_string));
+                          ParseHloString(hlo_string));
 
   TF_ASSERT_OK_AND_ASSIGN(bool changed,
                           WhileLoopConstantSinking{}.Run(module.get()));
@@ -136,7 +136,7 @@ ENTRY entry {
 )";
 
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
-                          tools::Parse(hlo_string));
+                          ParseHloString(hlo_string));
 
   TF_ASSERT_OK_AND_ASSIGN(bool changed,
                           WhileLoopConstantSinking{}.Run(module.get()));
@@ -184,7 +184,7 @@ ENTRY entry {
 )";
 
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
-                          tools::Parse(hlo_string));
+                          ParseHloString(hlo_string));
 
   TF_ASSERT_OK_AND_ASSIGN(bool changed,
                           WhileLoopConstantSinking{}.Run(module.get()));
diff --git a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion_test.cc b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion_test.cc
index e1ec12192f..8831c513ee 100644
--- a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion_test.cc
+++ b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion_test.cc
@@ -16,9 +16,9 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/while_loop_invariant_code_motion.h"
 
 #include "tensorflow/compiler/xla/service/hlo_matchers.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
-#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 
 namespace xla {
diff --git a/tensorflow/compiler/xla/service/while_util_test.cc b/tensorflow/compiler/xla/service/while_util_test.cc
index bcc545c61d..d79d329721 100644
--- a/tensorflow/compiler/xla/service/while_util_test.cc
+++ b/tensorflow/compiler/xla/service/while_util_test.cc
@@ -16,8 +16,8 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/while_util.h"
 
 #include "tensorflow/compiler/xla/service/hlo_matchers.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
 #include "tensorflow/compiler/xla/test.h"
-#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
 #include "tensorflow/compiler/xla/util.h"
 
 namespace xla {
@@ -50,7 +50,7 @@ ENTRY entry {
 )";
 
   TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModule> module,
-                      tools::Parse(hlo_string));
+                      ParseHloString(hlo_string));
 
   *entry_computation = module->entry_computation();
   *param0 = (*entry_computation)->parameter_instruction(0);
@@ -151,7 +151,7 @@ ENTRY main {
 )";
 
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
-                          tools::Parse(hlo_string));
+                          ParseHloString(hlo_string));
 
   HloComputation* while_body = module->GetComputationWithName("body");
 
@@ -190,7 +190,7 @@ ENTRY main {
 )";
 
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
-                          tools::Parse(hlo_string));
+                          ParseHloString(hlo_string));
 
   HloComputation* main = module->GetComputationWithName("main");
   HloInstruction* while_instr = main->root_instruction();
diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD
index a62d49e9c7..7f6bbe6f87 100644
--- a/tensorflow/compiler/xla/tests/BUILD
+++ b/tensorflow/compiler/xla/tests/BUILD
@@ -117,11 +117,11 @@ cc_library(
         "//tensorflow/compiler/xla/service:backend",
         "//tensorflow/compiler/xla/service:computation_layout",
         "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/service:hlo_runner",
         "//tensorflow/compiler/xla/service:hlo_verifier",
         "//tensorflow/compiler/xla/service:interpreter_plugin",  # reference backend
         "//tensorflow/compiler/xla/service:platform_util",
-        "//tensorflow/compiler/xla/tools/parser:hlo_parser",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
         "//tensorflow/core:test",
@@ -138,8 +138,8 @@ cc_library(
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/service:hlo_verifier",
-        "//tensorflow/compiler/xla/tools/parser:hlo_parser",
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
     ],
@@ -697,8 +697,8 @@ xla_test(
         "//tensorflow/compiler/xla:execution_options_util",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
-        "//tensorflow/compiler/xla/tools/parser:hlo_parser",
     ],
 )
 
@@ -1195,9 +1195,9 @@ xla_test(
     ],
     deps = [
         ":client_library_test_base",
+        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
-        "//tensorflow/compiler/xla/tools/parser:hlo_parser",
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
     ],
@@ -1520,11 +1520,11 @@ xla_test(
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/xla_client:xla_builder",
         "//tensorflow/compiler/xla/client/xla_client:xla_computation",
+        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:client_library_test_base",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
-        "//tensorflow/compiler/xla/tools/parser:hlo_parser",
         "//tensorflow/core:lib",
         "//tensorflow/core:stream_executor_no_cuda",
         "//tensorflow/core:test",
diff --git a/tensorflow/compiler/xla/tests/cross_replica_sum_test.cc b/tensorflow/compiler/xla/tests/cross_replica_sum_test.cc
index b159887765..c960b3c15f 100644
--- a/tensorflow/compiler/xla/tests/cross_replica_sum_test.cc
+++ b/tensorflow/compiler/xla/tests/cross_replica_sum_test.cc
@@ -14,12 +14,12 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
-#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
 
 namespace xla {
 namespace {
@@ -36,7 +36,8 @@ XLA_TEST_F(TrivialCrossReplicaSumTest, OneOperand) {
     p = f32[3] parameter(0)
     ROOT crs = f32[3] cross-replica-sum(p)
   })";
-  auto module = tools::Parse(module_str, GetModuleConfigForTest()).ValueOrDie();
+  auto module =
+      ParseHloString(module_str, GetModuleConfigForTest()).ValueOrDie();
   auto literal = Literal::CreateR1<float>({1, 2, 3});
   EXPECT_EQ(*literal, *ExecuteAndTransfer(std::move(module), {literal.get()}));
 }
@@ -49,7 +50,8 @@ XLA_TEST_F(TrivialCrossReplicaSumTest, MultipleOperands) {
     p1 = f32[2] parameter(1)
     ROOT crs = (f32[3], f32[2]) cross-replica-sum(p0, p1)
   })";
-  auto module = tools::Parse(module_str, GetModuleConfigForTest()).ValueOrDie();
+  auto module =
+      ParseHloString(module_str, GetModuleConfigForTest()).ValueOrDie();
   auto literal0 = Literal::CreateR1<float>({1, 2, 3});
   auto literal1 = Literal::CreateR1<float>({10, 20});
   EXPECT_EQ(
@@ -68,7 +70,8 @@ XLA_TEST_F(TrivialCrossReplicaSumTest, ConstantOperand) {
     p1 = f32[2] constant({10, 20})
     ROOT crs = (f32[3], f32[2]) cross-replica-sum(p0, p1)
   })";
-  auto module = tools::Parse(module_str, GetModuleConfigForTest()).ValueOrDie();
+  auto module =
+      ParseHloString(module_str, GetModuleConfigForTest()).ValueOrDie();
   auto literal0 = Literal::CreateR1<float>({1, 2, 3});
   auto literal1 = Literal::CreateR1<float>({10, 20});
   EXPECT_EQ(*Literal::MakeTuple({literal0.get(), literal1.get()}),
diff --git a/tensorflow/compiler/xla/tests/gather_operation_test.cc b/tensorflow/compiler/xla/tests/gather_operation_test.cc
index 4854c649c1..143ffbdeb4 100644
--- a/tensorflow/compiler/xla/tests/gather_operation_test.cc
+++ b/tensorflow/compiler/xla/tests/gather_operation_test.cc
@@ -14,12 +14,12 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/xla/execution_options_util.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
-#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
 
 // NB!  TODO(b/74360564): These tests do not test out of bounds behavior since
 // that hasn't been specced yet.
@@ -41,7 +41,7 @@ class GatherOperationTest : public HloTestBase {
     HloModuleConfig config;
     config.set_debug_options(GetDebugOptionsForTest());
     TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
-                            tools::Parse(hlo_text, config));
+                            ParseHloString(hlo_text, config));
     EXPECT_TRUE(RunAndCompare(std::move(module), args, nullopt));
   }
 };
diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.cc b/tensorflow/compiler/xla/tests/hlo_test_base.cc
index 36e19e6507..08ed826c80 100644
--- a/tensorflow/compiler/xla/tests/hlo_test_base.cc
+++ b/tensorflow/compiler/xla/tests/hlo_test_base.cc
@@ -23,11 +23,11 @@ limitations under the License.
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/legacy_flags/debug_options_flags.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
 #include "tensorflow/compiler/xla/service/platform_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/test_utils.h"
-#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
diff --git a/tensorflow/compiler/xla/tests/hlo_verified_test_base.cc b/tensorflow/compiler/xla/tests/hlo_verified_test_base.cc
index da4cf4ae0c..c8a05c2e9e 100644
--- a/tensorflow/compiler/xla/tests/hlo_verified_test_base.cc
+++ b/tensorflow/compiler/xla/tests/hlo_verified_test_base.cc
@@ -15,10 +15,10 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/tests/hlo_verified_test_base.h"
 
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
 #include "tensorflow/compiler/xla/service/hlo_verifier.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
-#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -67,7 +67,7 @@ HloModule& HloVerifiedTestBase::module() {
 void HloVerifiedTestBase::ParseAndVerifyModule(
     tensorflow::StringPiece hlo_text) {
   CHECK(!module_) << "Called ParseModule when test already has a module.";
-  TF_ASSERT_OK_AND_ASSIGN(module_, tools::Parse(hlo_text));
+  TF_ASSERT_OK_AND_ASSIGN(module_, ParseHloString(hlo_text));
   VerifyModule();
 }
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/reduce_hlo_test.cc b/tensorflow/compiler/xla/tests/reduce_hlo_test.cc
index c0a2c0ca4c..9052b188ed 100644
--- a/tensorflow/compiler/xla/tests/reduce_hlo_test.cc
+++ b/tensorflow/compiler/xla/tests/reduce_hlo_test.cc
@@ -15,9 +15,9 @@ limitations under the License.
 
 #include <array>
 
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
-#include "tensorflow/compiler/xla/tools/parser/hlo_parser.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/test.h"
@@ -73,7 +73,7 @@ ENTRY reduce.1 {
 }
 )";
 
-  return tools::Parse(hlo_string);
+  return ParseHloString(hlo_string);
 }
 
 // TODO(b/72454718): XLA:GPU does not support executing code compiled without
diff --git a/tensorflow/compiler/xla/tools/parser/BUILD b/tensorflow/compiler/xla/tools/parser/BUILD
deleted file mode 100644
index 76f35afd53..0000000000
--- a/tensorflow/compiler/xla/tools/parser/BUILD
+++ /dev/null
@@ -1,73 +0,0 @@
-# Build file for the Hlo parser.
-
-licenses(["notice"])  # Apache 2.0
-
-package(
-    default_visibility = [":friends"],
-)
-
-package_group(
-    name = "friends",
-    includes = [
-        "//tensorflow/compiler/xla:friends",
-    ],
-)
-
-# Filegroup used to collect source files for dependency checking.
-filegroup(
-    name = "c_srcs",
-    data = glob([
-        "**/*.cc",
-        "**/*.h",
-    ]),
-)
-
-load("//tensorflow:tensorflow.bzl", "tf_cc_test")
-
-cc_library(
-    name = "hlo_lexer",
-    srcs = ["hlo_lexer.cc"],
-    hdrs = [
-        "hlo_lexer.h",
-        "hlo_token.h",
-    ],
-    deps = [
-        "//tensorflow/compiler/xla:shape_util",
-        "//tensorflow/compiler/xla:statusor",
-        "//tensorflow/compiler/xla:types",
-        "//tensorflow/compiler/xla:util",
-        "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:regexp_internal",
-    ],
-)
-
-cc_library(
-    name = "hlo_parser",
-    srcs = ["hlo_parser.cc"],
-    hdrs = ["hlo_parser.h"],
-    deps = [
-        ":hlo_lexer",
-        "//tensorflow/compiler/xla:literal_util",
-        "//tensorflow/compiler/xla:shape_util",
-        "//tensorflow/compiler/xla:statusor",
-        "//tensorflow/compiler/xla:util",
-        "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/service:hlo",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
-tf_cc_test(
-    name = "hlo_parser_test",
-    size = "small",
-    srcs = ["hlo_parser_test.cc"],
-    deps = [
-        ":hlo_parser",
-        "//tensorflow/compiler/xla:window_util",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:test",
-        "//tensorflow/core:test_main",
-    ],
-)
-- 
GitLab


From 2d71691dad337c4e7a6b5dbf18fd0ab0e6bd7cf6 Mon Sep 17 00:00:00 2001
From: Billy Lamberta <blamb@google.com>
Date: Fri, 1 Jun 2018 15:36:29 -0700
Subject: [PATCH 190/610] Swift for TensorFlow lives in GitHub, for now. Update
 ecosystem page and dropdown menu. Remove community/swift page and add
 redirect.

PiperOrigin-RevId: 198936463
---
 tensorflow/docs_src/community/leftnav_files |  1 -
 tensorflow/docs_src/community/swift.md      | 60 ---------------------
 2 files changed, 61 deletions(-)
 delete mode 100644 tensorflow/docs_src/community/swift.md

diff --git a/tensorflow/docs_src/community/leftnav_files b/tensorflow/docs_src/community/leftnav_files
index 2bae60d9dd..0bd1f14de9 100644
--- a/tensorflow/docs_src/community/leftnav_files
+++ b/tensorflow/docs_src/community/leftnav_files
@@ -6,4 +6,3 @@ groups.md
 documentation.md
 style_guide.md
 benchmarks.md
-swift.md
diff --git a/tensorflow/docs_src/community/swift.md b/tensorflow/docs_src/community/swift.md
deleted file mode 100644
index 070f9931e0..0000000000
--- a/tensorflow/docs_src/community/swift.md
+++ /dev/null
@@ -1,60 +0,0 @@
-<p align="center">
-  <img src="../images/swift_tensorflow_logo.png">
-</p>
-
-# Swift for TensorFlow
-
-Welcome to the Swift for TensorFlow development community!
-
-Swift for TensorFlow is a new way to develop machine learning models. It
-gives you the power of
-[TensorFlow](https://www.tensorflow.org) directly
-integrated into the [Swift programming language](https://swift.org/about).
-With Swift, you can write the following imperative code, and Swift
-automatically turns it into **a single TensorFlow Graph** and runs it
-with the full performance of TensorFlow Sessions on CPU, GPU and
-[TPU](https://cloud.google.com/tpu/docs/tpus).
-
-```swift
-import TensorFlow
-
-var x = Tensor<Float>([[1, 2], [3, 4]])
-
-for i in 1...5 {
-  x += matmul(x, x)
-}
-
-print(x)
-```
-
-Swift combines the flexibility of
-[Eager Execution](https://www.tensorflow.org/programmers_guide/eager) with the
-high performance of [Graphs and Sessions](https://www.tensorflow.org/programmers_guide/graphs).
-Behind the scenes, Swift analyzes your Tensor code and automatically builds
-graphs for you. Swift also catches type errors and shape mismatches before
-running your code, and has [Automatic Differentiation](https://en.wikipedia.org/wiki/Automatic_differentiation)
-built right in. We believe that machine learning tools are so important that
-they deserve **a first-class language and a compiler**.
-
-Note: Swift for TensorFlow is an early stage research project. It has been
-released to enable open source development and is not yet ready for general use
-by machine learning developers.
-
-## Open Source
-
-We have released Swift for TensorFlow as an open-source project on GitHub!
-
-Our [documentation repository](https://github.com/tensorflow/swift) contains a
-[project overview](https://github.com/tensorflow/swift/blob/master/docs/DesignOverview.md)
-and [technical papers](https://github.com/tensorflow/swift/tree/master/docs)
-explaining specific areas in depth. There are also instructions for [installing
-pre-built packages](https://github.com/tensorflow/swift/blob/master/Installation.md)
-(for macOS and Ubuntu) as well as a simple
-[usage tutorial](https://github.com/tensorflow/swift/blob/master/Usage.md).
-
-Moving forward, we will use an open design model and all discussions will be
-public.
-
-[Sign up here to join the community Google
-group](https://groups.google.com/a/tensorflow.org/d/forum/swift), which we will
-use for announcements and general discussion.
-- 
GitLab


From 25486ef05d59265b769684589b738636b3207cc7 Mon Sep 17 00:00:00 2001
From: Vinu Rajashekhar <vinuraja@google.com>
Date: Fri, 1 Jun 2018 15:44:29 -0700
Subject: [PATCH 191/610] Adds a batch-op implemented using TF functions.

o This has a couple of important advantages over the current implementation:
  1. The existing batch-op waits for the batch to be created and then forwards the tensors to the rest of the graph, which causes a lot of batches to be created, because there is no way for the op to know if the other batches are being queued up. A mitigation, which we have seen working in practice, is to actually wait for the graph to finish processing the batch. So there is a sort of flow-control happening, and meanwhile the batches get coalesced, which improves latency and throughput as well. Using functions makes this kind of approach easier.
  2. The existing op passes empty tensors around the graph to make the TF executor happy, which has sometimes not worked well with some Ops (like Reshape). Using functions means that we don't need to rely on this mechanism either.

PiperOrigin-RevId: 198937594
---
 .../batching/python/ops/batch_ops_test.py     |  87 ++++
 .../base_api/api_def_BatchFunction.pbtxt      | 128 ++++++
 tensorflow/core/kernels/batch_kernels.cc      | 390 +++++++++++++++---
 tensorflow/core/ops/batch_ops.cc              |  20 +
 4 files changed, 564 insertions(+), 61 deletions(-)
 create mode 100644 tensorflow/core/api_def/base_api/api_def_BatchFunction.pbtxt

diff --git a/tensorflow/contrib/batching/python/ops/batch_ops_test.py b/tensorflow/contrib/batching/python/ops/batch_ops_test.py
index e22f978dde..68e8a88ca0 100644
--- a/tensorflow/contrib/batching/python/ops/batch_ops_test.py
+++ b/tensorflow/contrib/batching/python/ops/batch_ops_test.py
@@ -23,7 +23,9 @@ import time
 
 from tensorflow.contrib.batching.python.ops import batch_ops
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import function
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_batch_ops
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import script_ops
 from tensorflow.python.platform import test
@@ -205,6 +207,91 @@ class BatchOpsTest(test.TestCase):
       self.assertEqual(thread_results[0], [2])
       self.assertEqual(main_results[0], [3])
 
+  def testBatchFunctionOp(self):
+    """Tests that the batch_func works."""
+    with self.test_session() as sess:
+
+      @function.Defun(dtypes.int32)
+      def computation(in_t):
+        return in_t + 1
+
+      inp = array_ops.placeholder(dtype=dtypes.int32, shape=[1])
+      result = gen_batch_ops.batch_function(
+          [inp],
+          num_batch_threads=1,
+          max_batch_size=10,
+          batch_timeout_micros=100000,
+          Tout=[dtypes.int32],
+          f=computation,
+          captured_tensors=computation.captured_inputs)
+      thread_results = []
+
+      def worker():
+        thread_results.extend(sess.run([result], feed_dict={inp: [1]}))
+
+      worker_thread = threading.Thread(target=worker)
+      worker_thread.start()
+      main_results = sess.run([result], feed_dict={inp: [2]})
+      worker_thread.join()
+      self.assertEqual(thread_results[0], [2])
+      self.assertEqual(main_results[0], [3])
+
+  def testBatchFunctionOpWithCapturedInput(self):
+    """Tests that batch_func works with captured inputs."""
+    with self.test_session() as sess:
+      captured_inp0 = array_ops.placeholder_with_default(2, shape=[])
+      captured_inp1 = array_ops.placeholder_with_default(1, shape=[])
+      inp = array_ops.placeholder(dtype=dtypes.int32, shape=[1])
+
+      @function.Defun(dtypes.int32)
+      def computation(inp):
+        return inp + captured_inp0 - captured_inp1
+
+      result = gen_batch_ops.batch_function(
+          num_batch_threads=1,
+          max_batch_size=10,
+          batch_timeout_micros=100000,  # 100ms
+          allowed_batch_sizes=[3, 10],
+          batching_queue="",
+          f=computation,
+          in_tensors=[inp],
+          captured_tensors=computation.captured_inputs,
+          Tout=[o.type for o in computation.definition.signature.output_arg])
+
+      thread_results = []
+
+      def worker():
+        thread_results.extend(sess.run([result], feed_dict={inp: [1]}))
+
+      worker_thread = threading.Thread(target=worker)
+      worker_thread.start()
+      main_results = sess.run([result], feed_dict={inp: [2]})
+      worker_thread.join()
+      self.assertEqual(thread_results[0], [2])
+      self.assertEqual(main_results[0], [3])
+
+  def testBasicUnbatchDecoratedWithReshape(self):
+    """Tests that the batch_function decorator works."""
+    with self.test_session() as sess:
+
+      @batch_ops.batch_function(1, 10, 100000)
+      def computation(in_t):
+        return array_ops.reshape(in_t, [-1]) + 1
+
+      inp = array_ops.placeholder(dtype=dtypes.int32, shape=[1, 1])
+      result = computation(inp)
+      thread_results = []
+
+      def worker():
+        thread_results.extend(sess.run([result], feed_dict={inp: [[1]]}))
+
+      worker_thread = threading.Thread(target=worker)
+      worker_thread.start()
+      main_results = sess.run([result], feed_dict={inp: [[2]]})
+      worker_thread.join()
+      self.assertEqual(thread_results[0], [2])
+      self.assertEqual(main_results[0], [3])
+
   def testUnbatchTimeout(self):
     """Tests that the unbatch timeout works."""
     with self.test_session() as sess:
diff --git a/tensorflow/core/api_def/base_api/api_def_BatchFunction.pbtxt b/tensorflow/core/api_def/base_api/api_def_BatchFunction.pbtxt
new file mode 100644
index 0000000000..09eff6177b
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BatchFunction.pbtxt
@@ -0,0 +1,128 @@
+op {
+  graph_op_name: "BatchFunction"
+  in_arg {
+    name: "in_tensors"
+    description: <<END
+The tensors to be batched.
+END
+  }
+  in_arg {
+    name: "captured_tensors"
+    description: <<END
+The tensors which are captured in the function, and don't need
+to be batched.
+END
+  }
+  out_arg {
+    name: "out_tensors"
+    description: <<END
+The output tensors.
+END
+  }
+  attr {
+    name: "num_batch_threads"
+    description: <<END
+Number of scheduling threads for processing batches of work.
+Determines the number of batches processed in parallel.
+END
+  }
+  attr {
+    name: "max_batch_size"
+    description: <<END
+Batch sizes will never be bigger than this.
+END
+  }
+  attr {
+    name: "batch_timeout_micros"
+    description: <<END
+Maximum number of microseconds to wait before outputting
+an incomplete batch.
+END
+  }
+  attr {
+    name: "max_enqueued_batches"
+    description: <<END
+Maximum number of batches enqueued. Default: 10.
+END
+  }
+  attr {
+    name: "allowed_batch_sizes"
+    description: <<END
+Optional list of allowed batch sizes. If left empty, does
+nothing. Otherwise, supplies a list of batch sizes, causing the op to pad
+batches up to one of those sizes. The entries must increase monotonically, and
+the final entry must equal max_batch_size.
+END
+  }
+  attr {
+    name: "container"
+    description: <<END
+Controls the scope of sharing of this batch.
+END
+  }
+  attr {
+    name: "shared_name"
+    description: <<END
+Concurrently running instances of batch in the same device with the
+same container and shared_name will batch their elements together. If left
+empty, the op name will be used as the shared name.
+END
+  }
+  attr {
+    name: "Tin"
+    description: <<END
+the types of tensors to be batched.
+END
+  }
+  attr {
+    name: "Tcaptured"
+    description: <<END
+the types of the captured tensors.
+END
+  }
+  attr {
+    name: "Tout"
+    description: <<END
+the types of the output tensors.
+END
+  }
+  summary: "Batches all the input tensors to the computation done by the function."
+  description: <<END
+So, for example, in the following code
+
+  ```python
+
+  # This input will be captured.
+  y = tf.placeholder_with_default(1.0, shape=[])
+
+  @tf.Defun(tf.float32)
+  def computation(a):
+    return tf.matmul(a, a) + y
+
+  b = gen_batch_ops.batch_function(
+          f=computation,
+          in_tensors=[a],
+          captured_tensors=computation.captured_inputs,
+          Tout=[o.type for o in computation.definition.signature.output_arg],
+          num_batch_threads=1,
+          max_batch_size=10,
+          batch_timeout_micros=100000,  # 100ms
+          allowed_batch_sizes=[3, 10],
+          batching_queue="")
+  ```
+If more than one session.run call is simultaneously trying to compute `b`
+the values of `a` will be gathered, non-deterministically concatenated
+along the first axis, and only one thread will run the computation.
+
+Assumes that all arguments of the function are Tensors which will be batched
+along their first dimension.
+
+Arguments that are captured, are not batched. The session.run call which does
+the concatenation, will use the values of the captured tensors available to it.
+Therefore, typical uses of captured tensors should involve values which remain
+unchanged across session.run calls. Inference is a good example of this.
+
+SparseTensor is not supported. The return value of the decorated function
+must be a Tensor or a list/tuple of Tensors.
+END
+}
diff --git a/tensorflow/core/kernels/batch_kernels.cc b/tensorflow/core/kernels/batch_kernels.cc
index 8c99ded0a8..c0eef229ce 100644
--- a/tensorflow/core/kernels/batch_kernels.cc
+++ b/tensorflow/core/kernels/batch_kernels.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/resource_mgr.h"
@@ -24,6 +25,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/concat_lib.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/kernels/split_lib.h"
+#include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/lib/random/random.h"
 #include "tensorflow/core/platform/macros.h"
 
@@ -41,7 +43,7 @@ typedef Eigen::SyclDevice SYCLDevice;
 // ensure proper device placement.
 template <typename T>
 Status Concat(OpKernelContext* context, const gtl::ArraySlice<Tensor>& inputs,
-              int output_index) {
+              Tensor* output) {
   const int input_dims = inputs[0].dims();
   const TensorShape& input_shape = inputs[0].shape();
 
@@ -76,9 +78,8 @@ Status Concat(OpKernelContext* context, const gtl::ArraySlice<Tensor>& inputs,
 
   TensorShape output_shape(input_shape);
   output_shape.set_dim(0, output_dim0);
-  Tensor* output = nullptr;
   TF_RETURN_IF_ERROR(
-      context->allocate_output(output_index, output_shape, &output));
+      context->allocate_temp(DataTypeToEnum<T>::value, output_shape, output));
   if (output->NumElements() > 0) {
     auto output_flat = output->shaped<T, 2>({1, output->NumElements()});
 #if GOOGLE_CUDA
@@ -209,6 +210,7 @@ class BatchResource : public ResourceBase {
   static Status Create(int32 num_batch_threads, int32 max_batch_size,
                        int32 batch_timeout_micros, int32 max_enqueued_batches,
                        const std::vector<int32>& allowed_batch_sizes,
+                       FunctionLibraryRuntime::Handle fhandle,
                        std::unique_ptr<BatchResource>* resource) {
     std::unique_ptr<BatchResource> new_resource(new BatchResource);
 
@@ -225,6 +227,8 @@ class BatchResource : public ResourceBase {
 
     new_resource->allowed_batch_sizes_ = allowed_batch_sizes;
 
+    new_resource->fhandle_ = fhandle;
+
     *resource = std::move(new_resource);
     return Status::OK();
   }
@@ -254,6 +258,14 @@ class BatchResource : public ResourceBase {
       }
       batch_components->inputs.push_back(tensor);
     }
+    OpInputList captured_tensors;
+    const auto captured_status =
+        context->input_list("captured_tensors", &captured_tensors);
+    if (captured_status.ok()) {
+      for (const Tensor& captured_tensor : captured_tensors) {
+        batch_components->captured_inputs.push_back(captured_tensor);
+      }
+    }
     batch_components->context = context;
     batch_components->done_callback = std::move(done_callback);
 
@@ -272,6 +284,7 @@ class BatchResource : public ResourceBase {
     int64 guid;
 
     std::vector<Tensor> inputs;
+    std::vector<Tensor> captured_inputs;
     OpKernelContext* context;
     AsyncOpKernel::DoneCallback done_callback;
 
@@ -314,50 +327,32 @@ class BatchResource : public ResourceBase {
     return batch_size;
   }
 
-  // Processes a batch of one or more BatchTask entries.
-  void ProcessBatch(std::unique_ptr<Batch> batch) const {
-    if (batch->empty()) {
-      return;
+  Status ConcatInputTensors(const Batch& batch, OpKernelContext* context,
+                            std::vector<Tensor>* concatenated_tensors) const {
+    if (batch.num_tasks() == 0) {
+      return errors::InvalidArgument("Empty batch.");
     }
-    const int padded_batch_size = RoundToLowestAllowedBatchSize(batch->size());
-    const int padding_amount = padded_batch_size - batch->size();
 
-    OpKernelContext* last_task_context =
-        batch->task(batch->num_tasks() - 1).context;
-    AsyncOpKernel::DoneCallback last_task_callback =
-        batch->task(batch->num_tasks() - 1).done_callback;
-
-    OP_REQUIRES_OK_ASYNC(last_task_context, ValidateBatch(*batch),
-                         last_task_callback);
+    const int padded_batch_size = RoundToLowestAllowedBatchSize(batch.size());
+    const int padding_amount = padded_batch_size - batch.size();
 
     // All tasks should have the same number of input edges.
-    const int num_input_edges = batch->task(0).inputs.size();
-
-    // Process each input edge one at a time (the typical case has just one).
-    for (int i = 0; i < num_input_edges; ++i) {
-      // Emit batch->num_tasks() - 1 empty output tensors.
-      for (int task_idx = 0; task_idx < batch->num_tasks() - 1; ++task_idx) {
-        const BatchTask& task = batch->task(task_idx);
-        TensorShape output_shape(task.inputs.at(i).shape());
-        output_shape.set_dim(0, 0);
-        Tensor* output = nullptr;
-        OP_REQUIRES_OK_ASYNC(
-            task.context,
-            task.context->allocate_output(i, output_shape, &output),
-            task.done_callback);
-      }
+    const int num_inputs = batch.task(0).inputs.size();
+    concatenated_tensors->reserve(num_inputs);
 
+    // Process each input one at a time (the typical case has just one).
+    for (int i = 0; i < num_inputs; ++i) {
       // Concatenate the tasks ith input tensors into a big output tensor.
       std::vector<Tensor> to_concatenate;
-      to_concatenate.reserve(batch->num_tasks());
-      for (int task_idx = 0; task_idx < batch->num_tasks(); ++task_idx) {
-        to_concatenate.push_back(batch->task(task_idx).inputs.at(i));
+      to_concatenate.reserve(batch.num_tasks());
+      for (int task_idx = 0; task_idx < batch.num_tasks(); ++task_idx) {
+        to_concatenate.push_back(batch.task(task_idx).inputs.at(i));
       }
 
       // Add padding as needed. Use the first row of the first task's tensor as
       // the data for padding.
       if (padding_amount > 0) {
-        const Tensor& padding_source = batch->task(0).inputs.at(i);
+        const Tensor& padding_source = batch.task(0).inputs.at(i);
         Tensor padding;
         if (padding_source.shape().dim_size(0) == 1) {
           padding = padding_source;
@@ -367,10 +362,10 @@ class BatchResource : public ResourceBase {
           Status slice_status;
           std::vector<Tensor> slices;
           switch (type) {
-#define CASE(type)                                                   \
-  case DataTypeToEnum<type>::value:                                  \
-    slice_status = SplitCPU<type>(last_task_context, padding_source, \
-                                  slice_sizes, &slices);             \
+#define CASE(type)                                                     \
+  case DataTypeToEnum<type>::value:                                    \
+    slice_status =                                                     \
+        SplitCPU<type>(context, padding_source, slice_sizes, &slices); \
     break;
             TF_CALL_ALL_TYPES(CASE);
 #undef CASE
@@ -379,8 +374,7 @@ class BatchResource : public ResourceBase {
                   errors::InvalidArgument("Unsupported data type: ", type);
               break;
           }
-          OP_REQUIRES_OK_ASYNC(last_task_context, slice_status,
-                               last_task_callback);
+          TF_RETURN_IF_ERROR(slice_status);
           padding = slices.at(0);
         }
         for (int i = 0; i < padding_amount; ++i) {
@@ -390,10 +384,12 @@ class BatchResource : public ResourceBase {
 
       const DataType type = to_concatenate[0].dtype();
       Status concat_status;
+      Tensor concatenated_tensor;
       switch (type) {
-#define CASE(type)                                                      \
-  case DataTypeToEnum<type>::value:                                     \
-    concat_status = Concat<type>(last_task_context, to_concatenate, i); \
+#define CASE(type)                                                   \
+  case DataTypeToEnum<type>::value:                                  \
+    concat_status =                                                  \
+        Concat<type>(context, to_concatenate, &concatenated_tensor); \
     break;
         TF_CALL_ALL_TYPES(CASE);
 #undef CASE
@@ -402,10 +398,190 @@ class BatchResource : public ResourceBase {
               errors::InvalidArgument("Unsupported data type: ", type);
           break;
       }
-      OP_REQUIRES_OK_ASYNC(last_task_context, concat_status,
-                           last_task_callback);
+      TF_RETURN_IF_ERROR(concat_status);
+      concatenated_tensors->push_back(concatenated_tensor);
     }
+    return Status::OK();
+  }
+
+  Status SplitOutputTensors(const std::vector<Tensor>& combined_outputs,
+                            Batch* batch) const {
+    DCHECK_GE(batch->num_tasks(), 1);
+    if (batch->num_tasks() < 1) {
+      return errors::Internal("Batch size expected to be positive; was ",
+                              batch->num_tasks());
+    }
+
+    std::vector<int64> task_sizes_plus_optional_padding;
+    task_sizes_plus_optional_padding.reserve(batch->num_tasks());
+    for (int i = 0; i < batch->num_tasks(); ++i) {
+      task_sizes_plus_optional_padding.push_back(batch->task(i).size());
+    }
+    const int padding_size =
+        RoundToLowestAllowedBatchSize(batch->size()) - batch->size();
+    if (padding_size > 0) {
+      task_sizes_plus_optional_padding.push_back(padding_size);
+    }
+
+    // For each output tensor name, a divided-up tensor with one entry per task.
+    std::map<string, std::vector<Tensor>> split_tensors;
+
+    DCHECK_EQ(batch->task(0).context->num_outputs(), combined_outputs.size());
+    if (combined_outputs.size() != batch->task(0).context->num_outputs()) {
+      return errors::Internal("Wrong number of batched output tensors");
+    }
+
+    // Generate 'split_tensors' and populate the context outputs.
+    for (int i = 0; i < combined_outputs.size(); ++i) {
+      const Tensor& output_tensor = combined_outputs[i];
+      if (output_tensor.shape().dims() == 0) {
+        return errors::FailedPrecondition(
+            "Batched output tensor has 0 dimensions");
+      }
+      if (output_tensor.shape().dim_size(0) != batch->size() + padding_size) {
+        return errors::FailedPrecondition(
+            "Batched output tensor's 0th dimension does not equal the sum of "
+            "the 0th dimension sizes of the input tensors");
+      }
+
+      std::vector<Tensor> split_tensor;
+      const Status split_status = tensor::Split(
+          output_tensor, task_sizes_plus_optional_padding, &split_tensor);
+      DCHECK(split_status.ok()) << split_status.ToString();
+      if (!split_status.ok()) {
+        return errors::Internal("Tensor split operation failed: ",
+                                split_status.ToString());
+      }
+      DCHECK_EQ(split_tensor.size(), task_sizes_plus_optional_padding.size());
+      if (split_tensor.size() != task_sizes_plus_optional_padding.size()) {
+        return errors::Internal(
+            "Tensor split operation did not work as expected; got ",
+            split_tensor.size(), " splits; expected ",
+            task_sizes_plus_optional_padding.size());
+      }
+
+      for (int j = 0; j < batch->num_tasks(); ++j) {
+        BatchTask& task = *(batch->mutable_task(j));
+        task.context->set_output(i, split_tensor.at(j));
+      }  // (Ignore a possible final split_tensors entry containing the
+         // padding.)
+    }
+
+    return Status::OK();
+  }
+
+  // Runs `fhandle_` on the batch's concatenated inputs, splits the function's
+  // outputs back across the tasks, and completes every task with the status.
+  void ProcessFuncBatch(std::unique_ptr<Batch> batch) const {
+    if (batch->empty()) {
+      return;
+    }
+    OpKernelContext* last_task_context =
+        batch->task(batch->num_tasks() - 1).context;
+
+    // Regardless of the outcome, we need to propagate the status to the
+    // individual tasks and signal that they are done. We use MakeCleanup() to
+    // ensure that this happens no matter how we exit the method below.
+    Status status;
+    bool cleanup_done = false;
+    auto cleanup_fn = [&cleanup_done, &batch](const Status& status) {
+      if (cleanup_done) {
+        return;
+      }
+      for (int i = 0; i < batch->num_tasks(); ++i) {
+        batch->mutable_task(i)->context->SetStatus(status);
+        batch->mutable_task(i)->done_callback();
+      }
+      cleanup_done = true;
+    };
+    auto finally =
+        gtl::MakeCleanup([&cleanup_fn, &status] { cleanup_fn(status); });
+
+    status = ValidateBatch(*batch);
+    if (!status.ok()) {
+      return;
+    }
+
+    std::vector<Tensor> concatenated_tensors;
+    status =
+        ConcatInputTensors(*batch, last_task_context, &concatenated_tensors);
+    if (!status.ok()) {
+      return;
+    }
+    FunctionLibraryRuntime::Options opts;
+    opts.step_id = last_task_context->step_id();
+    opts.step_container = last_task_context->step_container();
+    opts.cancellation_manager = last_task_context->cancellation_manager();
+    opts.stats_collector = last_task_context->stats_collector();
+    opts.rendezvous = last_task_context->rendezvous();
+    opts.runner = last_task_context->runner();
+    auto* flib = last_task_context->function_library();
+    std::vector<Tensor> combined_outputs;
+    Notification done;
+    std::vector<Tensor> args(concatenated_tensors.begin(),
+                             concatenated_tensors.end());
+    const auto& captured_inputs =
+        batch->task(batch->num_tasks() - 1).captured_inputs;
+    args.insert(args.end(), captured_inputs.begin(), captured_inputs.end());
+    flib->Run(opts, fhandle_, args, &combined_outputs,
+              [&](const Status& run_status) {
+                // A failed run must still finish the tasks and notify `done`.
+                const Status final_status =
+                    run_status.ok()
+                        ? SplitOutputTensors(combined_outputs, batch.get())
+                        : run_status;
+                // We do the cleanup here as an optimization, so that it runs in
+                // the underlying TF inter-op threadpool. Running it in the
+                // threadpool lets the ensuing ops be scheduled faster,
+                // because the executor will add them to the front of the
+                // threadpool's task queue rather than the end.
+                cleanup_fn(final_status);
+                done.Notify();
+              });
+    // By waiting for the notification we are ensuring that this thread isn't
+    // used for processing other batches, which gives the batches time to
+    // coalesce upstream. So overall the number of batches going through the
+    // devices goes down, improving latency and throughput in most cases.
+    done.WaitForNotification();
+  }
+
+  // Processes a batch of one or more BatchTask entries.
+  void ProcessBatch(std::unique_ptr<Batch> batch) const {
+    if (batch->empty()) {
+      return;
+    }
+
+    OpKernelContext* last_task_context =
+        batch->task(batch->num_tasks() - 1).context;
+    AsyncOpKernel::DoneCallback last_task_callback =
+        batch->task(batch->num_tasks() - 1).done_callback;
+
+    OP_REQUIRES_OK_ASYNC(last_task_context, ValidateBatch(*batch),
+                         last_task_callback);
+
+    // All tasks should have the same number of input edges.
+    const int num_input_edges = batch->task(0).inputs.size();
+    std::vector<Tensor> concatenated_tensors;
+    const Status concat_status =
+        ConcatInputTensors(*batch, last_task_context, &concatenated_tensors);
+    OP_REQUIRES_OK_ASYNC(last_task_context, concat_status, last_task_callback);
 
+    // Process each input edge one at a time (the typical case has just one).
+    for (int i = 0; i < num_input_edges; ++i) {
+      last_task_context->set_output(i, concatenated_tensors.at(i));
+
+      // Emit batch->num_tasks() - 1 empty output tensors.
+      for (int task_idx = 0; task_idx < batch->num_tasks() - 1; ++task_idx) {
+        const BatchTask& task = batch->task(task_idx);
+        TensorShape output_shape(task.inputs.at(i).shape());
+        output_shape.set_dim(0, 0);
+        Tensor* output = nullptr;
+        OP_REQUIRES_OK_ASYNC(
+            task.context,
+            task.context->allocate_output(i, output_shape, &output),
+            task.done_callback);
+      }
+    }
     // Emit batch->num_tasks() - 1 empty index tensors.
     for (int task_idx = 0; task_idx < batch->num_tasks() - 1; ++task_idx) {
       const BatchTask& task = batch->task(task_idx);
@@ -463,7 +639,7 @@ class BatchResource : public ResourceBase {
     return Status::OK();
   }
 
-  // Looks up the batcher queue for 'queue_name'. If it didn't previously exist,
+  // Looks up the batcher queue for 'queue_name'. If it didn't previously exist,
   // creates it.
   Status LookupOrCreateBatcherQueue(const string& queue_name,
                                     BatcherQueue** queue) {
@@ -477,7 +653,11 @@ class BatchResource : public ResourceBase {
 
     std::unique_ptr<BatcherQueue> new_queue;
     auto process_batch_callback = [this](std::unique_ptr<Batch> batch) {
-      ProcessBatch(std::move(batch));
+      if (fhandle_ == kInvalidHandle) {
+        ProcessBatch(std::move(batch));
+      } else {
+        ProcessFuncBatch(std::move(batch));
+      }
     };
     TF_RETURN_IF_ERROR(batcher_->AddQueue(batcher_queue_options_,
                                           process_batch_callback, &new_queue));
@@ -498,8 +678,99 @@ class BatchResource : public ResourceBase {
       GUARDED_BY(batcher_queues_mu_);
 
   std::vector<int32> allowed_batch_sizes_;
+  FunctionLibraryRuntime::Handle fhandle_;
 };
 
+class BatchFunctionKernel : public AsyncOpKernel {
+ public:
+  explicit BatchFunctionKernel(OpKernelConstruction* c) : AsyncOpKernel(c) {
+    OP_REQUIRES_OK(c, c->GetAttr("container", &container_));
+    OP_REQUIRES_OK(c, c->GetAttr("shared_name", &shared_name_));
+    // If shared_name is not supplied, use name instead (prevent collisions by
+    // default).
+    if (shared_name_.empty()) {
+      shared_name_ = name();
+    }
+    OP_REQUIRES_OK(c, c->GetAttr("batching_queue", &batcher_queue_));
+    OP_REQUIRES_OK(c, c->GetAttr("num_batch_threads", &num_batch_threads_));
+    OP_REQUIRES_OK(c, c->GetAttr("max_batch_size", &max_batch_size_));
+    OP_REQUIRES_OK(c,
+                   c->GetAttr("batch_timeout_micros", &batch_timeout_micros_));
+    OP_REQUIRES_OK(c,
+                   c->GetAttr("max_enqueued_batches", &max_enqueued_batches_));
+    OP_REQUIRES_OK(c, c->GetAttr("allowed_batch_sizes", &allowed_batch_sizes_));
+    OP_REQUIRES_OK(c, ValidateAllowedBatchSizes());
+
+    auto lib = c->function_library();
+    OP_REQUIRES(c, lib != nullptr, errors::Internal("No function library"));
+    NameAttrList func;
+    OP_REQUIRES_OK(c, c->GetAttr("f", &func));
+    OP_REQUIRES_OK(
+        c, lib->Instantiate(func.name(), AttrSlice(&func.attr()), &fhandle_));
+  }
+
+  bool IsExpensive() override { return false; }
+
+  void ComputeAsync(OpKernelContext* c, DoneCallback done) final {
+    BatchResource* br;
+    std::function<Status(BatchResource * *r)> creator = [this,
+                                                         c](BatchResource** r) {
+      std::unique_ptr<BatchResource> new_resource;
+      TF_RETURN_IF_ERROR(
+          BatchResource::Create(num_batch_threads_, max_batch_size_,
+                                batch_timeout_micros_, max_enqueued_batches_,
+                                allowed_batch_sizes_, fhandle_, &new_resource));
+      *r = new_resource.release();
+      return Status::OK();
+    };
+    OP_REQUIRES_OK_ASYNC(c,
+                         c->resource_manager()->LookupOrCreate(
+                             container_, shared_name_, &br, creator),
+                         done);
+    const Status status =
+        br->RegisterInput(random::New64(), c, batcher_queue_, done);
+    br->Unref();
+    OP_REQUIRES_OK_ASYNC(c, status, done);
+    // Assume br calls done, so nothing to do here.
+  }
+
+  // Validates 'allowed_batch_sizes_'. The entries must increase monotonically,
+  // and the last one must equal 'max_batch_size_'.
+  Status ValidateAllowedBatchSizes() const {
+    if (allowed_batch_sizes_.empty()) {
+      return Status::OK();
+    }
+    int32 last_size = 0;
+    for (size_t i = 0; i < allowed_batch_sizes_.size(); ++i) {
+      const int32 size = allowed_batch_sizes_.at(i);
+      if (i > 0 && size <= last_size) {
+        return errors::InvalidArgument(
+            "allowed_batch_sizes entries must be monotonically increasing");
+      }
+      if (i == allowed_batch_sizes_.size() - 1 && size != max_batch_size_) {
+        return errors::InvalidArgument(
+            "final entry in allowed_batch_sizes must equal max_batch_size");
+      }
+      last_size = size;
+    }
+    return Status::OK();
+  }
+
+ private:
+  string container_;
+  string shared_name_;
+  string batcher_queue_;
+  int32 num_batch_threads_;
+  int32 max_batch_size_;
+  int32 batch_timeout_micros_;
+  int32 max_enqueued_batches_;
+  std::vector<int32> allowed_batch_sizes_;
+  FunctionLibraryRuntime::Handle fhandle_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("BatchFunction").Device(DEVICE_CPU),
+                        BatchFunctionKernel);
+
 class BatchKernel : public AsyncOpKernel {
  public:
   explicit BatchKernel(OpKernelConstruction* c) : AsyncOpKernel(c) {
@@ -528,7 +799,8 @@ class BatchKernel : public AsyncOpKernel {
           std::unique_ptr<BatchResource> new_resource;
           TF_RETURN_IF_ERROR(BatchResource::Create(
               num_batch_threads_, max_batch_size_, batch_timeout_micros_,
-              max_enqueued_batches_, allowed_batch_sizes_, &new_resource));
+              max_enqueued_batches_, allowed_batch_sizes_, kInvalidHandle,
+              &new_resource));
           *r = new_resource.release();
           return Status::OK();
         };
@@ -539,9 +811,7 @@ class BatchKernel : public AsyncOpKernel {
     const Status status =
         br->RegisterInput(random::New64(), c, batcher_queue_, done);
     br->Unref();
-    if (!status.ok()) {
-      OP_REQUIRES_OK_ASYNC(c, status, done);
-    }
+    OP_REQUIRES_OK_ASYNC(c, status, done);
     // Assume br calls done, so nothing to do here.
   }
 
@@ -800,9 +1070,7 @@ class UnbatchKernel : public AsyncOpKernel {
                          done);
     auto status = ubr->Compute(c, done);
     ubr->Unref();
-    if (!status.ok()) {
-      OP_REQUIRES_OK_ASYNC(c, status, done);
-    }
+    OP_REQUIRES_OK_ASYNC(c, status, done);
     // Assume ubr calls done, so nothing to do here.
   }
 
@@ -840,10 +1108,12 @@ class UnbatchGradResource : public ResourceBase {
     }
 
     const DataType type = tensors[0].dtype();
+    Tensor concatenated_tensor;
     switch (type) {
-#define CASE(type)                                         \
-  case DataTypeToEnum<type>::value:                        \
-    TF_RETURN_IF_ERROR(Concat<type>(context, tensors, 0)); \
+#define CASE(type)                                                            \
+  case DataTypeToEnum<type>::value:                                           \
+    TF_RETURN_IF_ERROR(Concat<type>(context, tensors, &concatenated_tensor)); \
+    context->set_output(0, concatenated_tensor);                              \
     break;
       TF_CALL_ALL_TYPES(CASE);
 #undef CASE
@@ -986,9 +1256,7 @@ class UnbatchGradKernel : public AsyncOpKernel {
                          done);
     Status status = ubr->Compute(c, done);
     ubr->Unref();
-    if (!status.ok()) {
-      OP_REQUIRES_OK_ASYNC(c, status, done);
-    }
+    OP_REQUIRES_OK_ASYNC(c, status, done);
     // Assume ubr calls done, so nothing to do here.
   }
 
diff --git a/tensorflow/core/ops/batch_ops.cc b/tensorflow/core/ops/batch_ops.cc
index 0a62965eed..ba7faeb5e8 100644
--- a/tensorflow/core/ops/batch_ops.cc
+++ b/tensorflow/core/ops/batch_ops.cc
@@ -19,6 +19,26 @@ limitations under the License.
 
 namespace tensorflow {
 
+REGISTER_OP("BatchFunction")
+    .Input("in_tensors: Tin")
+    .Input("captured_tensors: Tcaptured")
+    .Output("out_tensors: Tout")
+    .Attr("f: func")
+    .Attr("num_batch_threads: int")
+    .Attr("max_batch_size: int")
+    .Attr("batch_timeout_micros: int")
+    .Attr("max_enqueued_batches: int = 10")
+    .Attr("allowed_batch_sizes: list(int) = []")
+    .Attr("container: string = ''")
+    .Attr("shared_name: string = ''")
+    .Attr("batching_queue: string = ''")
+    .Attr("Tin: list(type)")
+    .Attr("Tcaptured: list(type) >= 0")
+    .Attr("Tout: list(type)")
+    // TODO(apassos): Fix this shape inference function. It requires shape
+    // inference of function calls.
+    .SetShapeFn(shape_inference::UnknownShape);
+
 REGISTER_OP("Batch")
     .Input("in_tensors: T")
     .Output("batched_tensors: T")
-- 
GitLab


From fd9a647d0e79b562b99ab6d1ee4d28c2d9db8a95 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 1 Jun 2018 16:09:57 -0700
Subject: [PATCH 192/610] Update ops-related pbtxt files.

PiperOrigin-RevId: 198941362
---
 .../core/ops/compat/ops_history.v1.pbtxt      | 84 +++++++++++++++++++
 tensorflow/core/ops/ops.pbtxt                 | 84 +++++++++++++++++++
 2 files changed, 168 insertions(+)

diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index 1920d0a592..43dafec6f5 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -8762,6 +8762,90 @@ op {
     version: 15
   }
 }
+op {
+  name: "BatchFunction"
+  input_arg {
+    name: "in_tensors"
+    type_list_attr: "Tin"
+  }
+  input_arg {
+    name: "captured_tensors"
+    type_list_attr: "Tcaptured"
+  }
+  output_arg {
+    name: "out_tensors"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "num_batch_threads"
+    type: "int"
+  }
+  attr {
+    name: "max_batch_size"
+    type: "int"
+  }
+  attr {
+    name: "batch_timeout_micros"
+    type: "int"
+  }
+  attr {
+    name: "max_enqueued_batches"
+    type: "int"
+    default_value {
+      i: 10
+    }
+  }
+  attr {
+    name: "allowed_batch_sizes"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "batching_queue"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "Tcaptured"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "BatchIFFT"
   input_arg {
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index d929a5fc87..8c7333e7a4 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -3049,6 +3049,90 @@ op {
     explanation: "Use FFT3D"
   }
 }
+op {
+  name: "BatchFunction"
+  input_arg {
+    name: "in_tensors"
+    type_list_attr: "Tin"
+  }
+  input_arg {
+    name: "captured_tensors"
+    type_list_attr: "Tcaptured"
+  }
+  output_arg {
+    name: "out_tensors"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "num_batch_threads"
+    type: "int"
+  }
+  attr {
+    name: "max_batch_size"
+    type: "int"
+  }
+  attr {
+    name: "batch_timeout_micros"
+    type: "int"
+  }
+  attr {
+    name: "max_enqueued_batches"
+    type: "int"
+    default_value {
+      i: 10
+    }
+  }
+  attr {
+    name: "allowed_batch_sizes"
+    type: "list(int)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "batching_queue"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "Tcaptured"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "BatchIFFT"
   input_arg {
-- 
GitLab


From 73ec24e8b75ba4f73a06756502d8bf86b2a6828b Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 1 Jun 2018 16:22:47 -0700
Subject: [PATCH 193/610] Go: Update generated wrapper functions for TensorFlow
 ops. PiperOrigin-RevId: 198942995

---
 tensorflow/go/op/wrappers.go | 94 ++++++++++++++++++------------------
 1 file changed, 47 insertions(+), 47 deletions(-)

diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go
index 9b66850a6c..c9817e4d61 100644
--- a/tensorflow/go/op/wrappers.go
+++ b/tensorflow/go/op/wrappers.go
@@ -2724,6 +2724,53 @@ func MatrixDiagPart(scope *Scope, input tf.Output) (diagonal tf.Output) {
 	return op.Output(0)
 }
 
+// Returns a batched diagonal tensor with a given batched diagonal values.
+//
+// Given a `diagonal`, this operation returns a tensor with the `diagonal` and
+// everything else padded with zeros. The diagonal is computed as follows:
+//
+// Assume `diagonal` has `k` dimensions `[I, J, K, ..., N]`, then the output is a
+// tensor of rank `k+1` with dimensions [I, J, K, ..., N, N]` where:
+//
+// `output[i, j, k, ..., m, n] = 1{m=n} * diagonal[i, j, k, ..., n]`.
+//
+// For example:
+//
+// ```
+// # 'diagonal' is [[1, 2, 3, 4], [5, 6, 7, 8]]
+//
+// and diagonal.shape = (2, 4)
+//
+// tf.matrix_diag(diagonal) ==> [[[1, 0, 0, 0]
+//                                      [0, 2, 0, 0]
+//                                      [0, 0, 3, 0]
+//                                      [0, 0, 0, 4]],
+//                                     [[5, 0, 0, 0]
+//                                      [0, 6, 0, 0]
+//                                      [0, 0, 7, 0]
+//                                      [0, 0, 0, 8]]]
+//
+// which has shape (2, 4, 4)
+// ```
+//
+// Arguments:
+//	diagonal: Rank `k`, where `k >= 1`.
+//
+// Returns Rank `k+1`, with `output.shape = diagonal.shape + [diagonal.shape[-1]]`.
+func MatrixDiag(scope *Scope, diagonal tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "MatrixDiag",
+		Input: []tf.Input{
+			diagonal,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Creates a sequence of numbers.
 //
 // This operation creates a sequence of numbers that begins at `start` and
@@ -5198,53 +5245,6 @@ func FloorDiv(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	return op.Output(0)
 }
 
-// Returns a batched diagonal tensor with a given batched diagonal values.
-//
-// Given a `diagonal`, this operation returns a tensor with the `diagonal` and
-// everything else padded with zeros. The diagonal is computed as follows:
-//
-// Assume `diagonal` has `k` dimensions `[I, J, K, ..., N]`, then the output is a
-// tensor of rank `k+1` with dimensions [I, J, K, ..., N, N]` where:
-//
-// `output[i, j, k, ..., m, n] = 1{m=n} * diagonal[i, j, k, ..., n]`.
-//
-// For example:
-//
-// ```
-// # 'diagonal' is [[1, 2, 3, 4], [5, 6, 7, 8]]
-//
-// and diagonal.shape = (2, 4)
-//
-// tf.matrix_diag(diagonal) ==> [[[1, 0, 0, 0]
-//                                      [0, 2, 0, 0]
-//                                      [0, 0, 3, 0]
-//                                      [0, 0, 0, 4]],
-//                                     [[5, 0, 0, 0]
-//                                      [0, 6, 0, 0]
-//                                      [0, 0, 7, 0]
-//                                      [0, 0, 0, 8]]]
-//
-// which has shape (2, 4, 4)
-// ```
-//
-// Arguments:
-//	diagonal: Rank `k`, where `k >= 1`.
-//
-// Returns Rank `k+1`, with `output.shape = diagonal.shape + [diagonal.shape[-1]]`.
-func MatrixDiag(scope *Scope, diagonal tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "MatrixDiag",
-		Input: []tf.Input{
-			diagonal,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Computes the inverse permutation of a tensor.
 //
 // This operation computes the inverse of an index permutation. It takes a 1-D
-- 
GitLab


From b31498a054d55ce328a2820fd403af764c482500 Mon Sep 17 00:00:00 2001
From: Yu-Cheng Ling <ycling@google.com>
Date: Fri, 1 Jun 2018 16:27:45 -0700
Subject: [PATCH 194/610] Support 5-inputs LSTM kernel in TFLite (float only).

PiperOrigin-RevId: 198943559
---
 tensorflow/contrib/lite/builtin_op_data.h     |  10 +
 tensorflow/contrib/lite/kernels/lstm.cc       | 190 +++++++++++++++++-
 tensorflow/contrib/lite/kernels/register.cc   |   3 +-
 tensorflow/contrib/lite/model.cc              |   8 +
 tensorflow/contrib/lite/schema/schema.fbs     |  12 ++
 .../contrib/lite/schema/schema_generated.h    |  52 ++++-
 tensorflow/contrib/lite/testing/BUILD         |   1 +
 .../contrib/lite/testing/generate_examples.py |  13 ++
 .../contrib/lite/testing/tflite_driver.cc     |  25 ++-
 tensorflow/contrib/lite/toco/args.h           |   1 +
 .../identify_lstm_merge_inputs.cc             |   8 +-
 .../identify_lstm_split_inputs.cc             |   8 +-
 tensorflow/contrib/lite/toco/model.h          |  10 +-
 .../contrib/lite/toco/tflite/operator.cc      |  31 ++-
 .../contrib/lite/toco/toco_cmdline_flags.cc   |   6 +
 tensorflow/contrib/lite/toco/toco_flags.proto |   6 +-
 tensorflow/contrib/lite/toco/toco_tooling.cc  |   2 +-
 17 files changed, 355 insertions(+), 31 deletions(-)

diff --git a/tensorflow/contrib/lite/builtin_op_data.h b/tensorflow/contrib/lite/builtin_op_data.h
index 52ab9ee640..c1cc4476fb 100644
--- a/tensorflow/contrib/lite/builtin_op_data.h
+++ b/tensorflow/contrib/lite/builtin_op_data.h
@@ -148,10 +148,20 @@ typedef struct {
   float beta;
 } TfLiteLocalResponseNormParams;
 
+typedef enum {
+  kTfLiteLSTMFullKernel = 0,
+  kTfLiteLSTMBasicKernel
+} TfLiteLSTMKernelType;
+
 typedef struct {
+  // Parameters for LSTM version 1.
   TfLiteFusedActivation activation;
   float cell_clip;
   float proj_clip;
+
+  // Parameters for LSTM version 2.
+  // kTfLiteLSTMBasicKernel is only supported in version 2 or above.
+  TfLiteLSTMKernelType kernel_type;
 } TfLiteLSTMParams;
 
 typedef struct {
diff --git a/tensorflow/contrib/lite/kernels/lstm.cc b/tensorflow/contrib/lite/kernels/lstm.cc
index 990b3da055..9aae3e571b 100644
--- a/tensorflow/contrib/lite/kernels/lstm.cc
+++ b/tensorflow/contrib/lite/kernels/lstm.cc
@@ -25,6 +25,8 @@ limitations under the License.
 #include "tensorflow/contrib/lite/context.h"
 #include "tensorflow/contrib/lite/kernels/activation_functor.h"
 #include "tensorflow/contrib/lite/kernels/internal/kernel_utils.h"
+#include "tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/contrib/lite/kernels/internal/tensor.h"
 #include "tensorflow/contrib/lite/kernels/internal/tensor_utils.h"
 #include "tensorflow/contrib/lite/kernels/kernel_util.h"
 #include "tensorflow/contrib/lite/kernels/op_macros.h"
@@ -34,6 +36,17 @@ namespace ops {
 namespace builtin {
 namespace lstm {
 
+struct OpData {
+  // Which kernel type to use. Full kernel (18-inputs) or basic kernel
+  // (5-inputs).
+  TfLiteLSTMKernelType kernel_type;
+  // Only used by full kernel.
+  int scratch_tensor_index;
+};
+
+// For full inputs kernel (18-inputs).
+namespace full {
+
 // Input Tensors of size {n_batch, n_input}
 constexpr int kInputTensor = 0;
 
@@ -71,13 +84,10 @@ constexpr int kCellStateTensor = 1;
 constexpr int kOutputTensor = 2;
 
 void* Init(TfLiteContext* context, const char* buffer, size_t length) {
-  auto* scratch_tensor_index = new int;
-  context->AddTensors(context, 1, scratch_tensor_index);
-  return scratch_tensor_index;
-}
-
-void Free(TfLiteContext* context, void* buffer) {
-  delete reinterpret_cast<int*>(buffer);
+  auto* op_data = new OpData;
+  op_data->kernel_type = kTfLiteLSTMFullKernel;
+  context->AddTensors(context, 1, &op_data->scratch_tensor_index);
+  return op_data;
 }
 
 // Check that input tensor dimensions matches with each other.
@@ -233,7 +243,7 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context,
 // Allocate a temporary scratch tensor. Also check that the sizes of the input
 // tensors match each other.
 TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
-  int* scratch_tensor_index = reinterpret_cast<int*>(node->user_data);
+  OpData* op_data = reinterpret_cast<OpData*>(node->user_data);
 
   // Check we have all the inputs and outputs we need.
   TF_LITE_ENSURE_EQ(context, node->inputs->size, 18);
@@ -289,7 +299,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   // Create a scratch buffer tensor.
   TfLiteIntArrayFree(node->temporaries);
   node->temporaries = TfLiteIntArrayCreate(1);
-  node->temporaries->data[0] = *scratch_tensor_index;
+  node->temporaries->data[0] = op_data->scratch_tensor_index;
   TfLiteTensor* scratch_buffer = GetTemporary(context, node, /*index=*/0);
   scratch_buffer->type = input->type;
   scratch_buffer->allocation_type = kTfLiteArenaRw;
@@ -447,6 +457,168 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   return kTfLiteOk;
 }
 
+}  // namespace full
+
+// For basic kernel (5-inputs).
+namespace basic {
+
+enum InputTensor {
+  kInputData = 0,
+  kInputPrevActivation = 1,
+  kInputWeights = 2,
+  kInputBiases = 3,
+  kInputPrevState = 4,
+  kInputNum = 5,
+};
+
+enum OutputTensor {
+  kOutputActivation = 0,
+  kOutputState = 1,
+  kOutputConcatTemp = 2,
+  kOutputActivationTemp = 3,
+  kOutputNum = 4,
+};
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  auto* op_data = new OpData;
+  op_data->kernel_type = kTfLiteLSTMBasicKernel;
+  // `scratch_tensor_index` is unused in this kernel.
+  op_data->scratch_tensor_index = -1;
+  return op_data;
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE(context, node->inputs->size == kInputNum);
+  TF_LITE_ENSURE(context, node->outputs->size == kOutputNum);
+
+  // Only Float32 is supported currently.
+  // TODO(ycling): Implement quantized uint8 support.
+  for (int index = 0; index < node->inputs->size; ++index) {
+    TfLiteTensor* tensor = &context->tensors[node->inputs->data[index]];
+    TF_LITE_ENSURE_EQ(context, tensor->type, kTfLiteFloat32);
+  }
+
+  const TfLiteTensor* input = GetInput(context, node, kInputData);
+  const TfLiteTensor* prev_activation =
+      GetInput(context, node, kInputPrevActivation);
+  const TfLiteTensor* weights = GetInput(context, node, kInputWeights);
+  const TfLiteTensor* bias = GetInput(context, node, kInputBiases);
+  const TfLiteTensor* prev_state = GetInput(context, node, kInputPrevState);
+
+  TF_LITE_ENSURE_EQ(context, input->dims->size, 2);
+  const int num_batches = input->dims->data[0];
+
+  TF_LITE_ENSURE_EQ(context, prev_activation->dims->size, 2);
+  TF_LITE_ENSURE_EQ(context, prev_activation->dims->data[0], num_batches);
+
+  TF_LITE_ENSURE_EQ(context, weights->dims->size, 2);
+  TF_LITE_ENSURE_EQ(context, bias->dims->size, 1);
+
+  TF_LITE_ENSURE_EQ(context, prev_state->dims->size, 2);
+  TF_LITE_ENSURE_EQ(context, prev_state->dims->data[0], num_batches);
+
+  TfLiteTensor* activation_out = GetOutput(context, node, kOutputActivation);
+  TfLiteTensor* state_out = GetOutput(context, node, kOutputState);
+  TfLiteTensor* concat_temp = GetOutput(context, node, kOutputConcatTemp);
+  TfLiteTensor* activation_temp =
+      GetOutput(context, node, kOutputActivationTemp);
+
+  TF_LITE_ENSURE_OK(context, context->ResizeTensor(
+                                 context, activation_out,
+                                 TfLiteIntArrayCopy(prev_activation->dims)));
+  TF_LITE_ENSURE_OK(
+      context, context->ResizeTensor(context, state_out,
+                                     TfLiteIntArrayCopy(prev_state->dims)));
+  TfLiteIntArray* concat_temp_size = TfLiteIntArrayCreate(2);
+  concat_temp_size->data[0] = num_batches;
+  concat_temp_size->data[1] = weights->dims->data[1];
+  TF_LITE_ENSURE_OK(
+      context, context->ResizeTensor(context, concat_temp, concat_temp_size));
+  TfLiteIntArray* activation_temp_size = TfLiteIntArrayCreate(2);
+  activation_temp_size->data[0] = num_batches;
+  activation_temp_size->data[1] = weights->dims->data[0];
+  TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, activation_temp,
+                                                   activation_temp_size));
+
+  // Set the state tensors as persistent.
+  for (auto index : {kInputPrevActivation, kInputPrevState}) {
+    TfLiteTensor* tensor = &context->tensors[node->inputs->data[index]];
+    tensor->allocation_type = kTfLiteArenaRwPersistent;
+  }
+  return kTfLiteOk;
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  const TfLiteTensor* input = GetInput(context, node, kInputData);
+  const TfLiteTensor* prev_activation =
+      GetInput(context, node, kInputPrevActivation);
+  const TfLiteTensor* weights = GetInput(context, node, kInputWeights);
+  const TfLiteTensor* bias = GetInput(context, node, kInputBiases);
+  const TfLiteTensor* prev_state = GetInput(context, node, kInputPrevState);
+
+  TfLiteTensor* activation_out = GetOutput(context, node, kOutputActivation);
+  TfLiteTensor* state_out = GetOutput(context, node, kOutputState);
+  TfLiteTensor* concat_temp = GetOutput(context, node, kOutputConcatTemp);
+  TfLiteTensor* activation_temp =
+      GetOutput(context, node, kOutputActivationTemp);
+
+  optimized_ops::LstmCell(
+      // Inputs.
+      GetTensorData<float>(input), GetTensorDims(input),
+      GetTensorData<float>(prev_activation), GetTensorDims(prev_activation),
+      GetTensorData<float>(weights), GetTensorDims(weights),
+      GetTensorData<float>(bias), GetTensorDims(bias),
+      GetTensorData<float>(prev_state), GetTensorDims(prev_state),
+      // Outputs.
+      GetTensorData<float>(state_out), GetTensorDims(state_out),
+      GetTensorData<float>(activation_out), GetTensorDims(activation_out),
+      GetTensorData<float>(concat_temp), GetTensorDims(concat_temp),
+      GetTensorData<float>(activation_temp), GetTensorDims(activation_temp));
+
+  // TODO(ycling): Investigate if this copy can be avoided with the 5-inputs
+  // LSTM kernel.
+  memcpy(prev_activation->data.raw, activation_out->data.raw,
+         activation_out->bytes);
+  memcpy(prev_state->data.raw, state_out->data.raw, state_out->bytes);
+
+  return kTfLiteOk;
+}
+
+}  // namespace basic
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  const auto* params = reinterpret_cast<const TfLiteLSTMParams*>(buffer);
+  switch (params->kernel_type) {
+    case kTfLiteLSTMFullKernel:
+      return full::Init(context, buffer, length);
+    case kTfLiteLSTMBasicKernel:
+      return basic::Init(context, buffer, length);
+  }
+}
+void Free(TfLiteContext* context, void* buffer) {
+  delete reinterpret_cast<OpData*>(buffer);
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  const auto* op_data = reinterpret_cast<const OpData*>(node->user_data);
+  switch (op_data->kernel_type) {
+    case kTfLiteLSTMFullKernel:
+      return full::Prepare(context, node);
+    case kTfLiteLSTMBasicKernel:
+      return basic::Prepare(context, node);
+  }
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  const auto* op_data = reinterpret_cast<const OpData*>(node->user_data);
+  switch (op_data->kernel_type) {
+    case kTfLiteLSTMFullKernel:
+      return full::Eval(context, node);
+    case kTfLiteLSTMBasicKernel:
+      return basic::Eval(context, node);
+  }
+}
+
 }  // namespace lstm
 
 TfLiteRegistration* Register_LSTM() {
diff --git a/tensorflow/contrib/lite/kernels/register.cc b/tensorflow/contrib/lite/kernels/register.cc
index c7d72738d6..184b02dcec 100644
--- a/tensorflow/contrib/lite/kernels/register.cc
+++ b/tensorflow/contrib/lite/kernels/register.cc
@@ -126,7 +126,8 @@ BuiltinOpResolver::BuiltinOpResolver() {
   AddBuiltin(BuiltinOperator_L2_NORMALIZATION, Register_L2_NORMALIZATION());
   AddBuiltin(BuiltinOperator_LOCAL_RESPONSE_NORMALIZATION,
              Register_LOCAL_RESPONSE_NORMALIZATION());
-  AddBuiltin(BuiltinOperator_LSTM, Register_LSTM());
+  AddBuiltin(BuiltinOperator_LSTM, Register_LSTM(), /* min_version */ 1,
+             /* max_version */ 2);
   AddBuiltin(BuiltinOperator_BIDIRECTIONAL_SEQUENCE_LSTM,
              Register_BIDIRECTIONAL_SEQUENCE_LSTM());
   AddBuiltin(BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM,
diff --git a/tensorflow/contrib/lite/model.cc b/tensorflow/contrib/lite/model.cc
index ca115a1c59..8d8d74adfb 100644
--- a/tensorflow/contrib/lite/model.cc
+++ b/tensorflow/contrib/lite/model.cc
@@ -558,6 +558,14 @@ TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type,
             parse_activation(lstm_params->fused_activation_function());
         params->cell_clip = lstm_params->cell_clip();
         params->proj_clip = lstm_params->proj_clip();
+        switch (lstm_params->kernel_type()) {
+          case LSTMKernelType_FULL:
+            params->kernel_type = kTfLiteLSTMFullKernel;
+            break;
+          case LSTMKernelType_BASIC:
+            params->kernel_type = kTfLiteLSTMBasicKernel;
+            break;
+        }
       }
       *builtin_data = reinterpret_cast<void*>(params);
       break;
diff --git a/tensorflow/contrib/lite/schema/schema.fbs b/tensorflow/contrib/lite/schema/schema.fbs
index 7d76134e3d..7dbb36c864 100644
--- a/tensorflow/contrib/lite/schema/schema.fbs
+++ b/tensorflow/contrib/lite/schema/schema.fbs
@@ -315,11 +315,23 @@ table LocalResponseNormalizationOptions {
   beta:float;
 }
 
+enum LSTMKernelType : byte {
+  // Full LSTM kernel which supports peephole and projection.
+  FULL = 0,
+  // Basic LSTM kernels. Equivalent to TensorFlow BasicLSTMCell.
+  BASIC = 1,
+}
+
 // An implementation of TensorFlow LSTMCell and CoupledInputForgetGateLSTMCell
 table LSTMOptions {
+  // Parameters for LSTM version 1 or above.
   fused_activation_function:ActivationFunctionType;
   cell_clip: float; // Optional, 0.0 means no clipping
   proj_clip: float; // Optional, 0.0 means no clipping
+
+  // Parameters for LSTM version 2 or above.
+  // Basic kernel is only supported in version 2 or above.
+  kernel_type: LSTMKernelType = FULL;
 }
 
 table ResizeBilinearOptions {
diff --git a/tensorflow/contrib/lite/schema/schema_generated.h b/tensorflow/contrib/lite/schema/schema_generated.h
index 0a60fcd3d0..b1beb39b28 100755
--- a/tensorflow/contrib/lite/schema/schema_generated.h
+++ b/tensorflow/contrib/lite/schema/schema_generated.h
@@ -1428,6 +1428,35 @@ inline const char *EnumNameLSHProjectionType(LSHProjectionType e) {
   return EnumNamesLSHProjectionType()[index];
 }
 
+enum LSTMKernelType {
+  LSTMKernelType_FULL = 0,
+  LSTMKernelType_BASIC = 1,
+  LSTMKernelType_MIN = LSTMKernelType_FULL,
+  LSTMKernelType_MAX = LSTMKernelType_BASIC
+};
+
+inline LSTMKernelType (&EnumValuesLSTMKernelType())[2] {
+  static LSTMKernelType values[] = {
+    LSTMKernelType_FULL,
+    LSTMKernelType_BASIC
+  };
+  return values;
+}
+
+inline const char **EnumNamesLSTMKernelType() {
+  static const char *names[] = {
+    "FULL",
+    "BASIC",
+    nullptr
+  };
+  return names;
+}
+
+inline const char *EnumNameLSTMKernelType(LSTMKernelType e) {
+  const size_t index = static_cast<int>(e);
+  return EnumNamesLSTMKernelType()[index];
+}
+
 enum CombinerType {
   CombinerType_SUM = 0,
   CombinerType_MEAN = 1,
@@ -2865,10 +2894,12 @@ struct LSTMOptionsT : public flatbuffers::NativeTable {
   ActivationFunctionType fused_activation_function;
   float cell_clip;
   float proj_clip;
+  LSTMKernelType kernel_type;
   LSTMOptionsT()
       : fused_activation_function(ActivationFunctionType_NONE),
         cell_clip(0.0f),
-        proj_clip(0.0f) {
+        proj_clip(0.0f),
+        kernel_type(LSTMKernelType_FULL) {
   }
 };
 
@@ -2877,7 +2908,8 @@ struct LSTMOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
   enum {
     VT_FUSED_ACTIVATION_FUNCTION = 4,
     VT_CELL_CLIP = 6,
-    VT_PROJ_CLIP = 8
+    VT_PROJ_CLIP = 8,
+    VT_KERNEL_TYPE = 10
   };
   ActivationFunctionType fused_activation_function() const {
     return static_cast<ActivationFunctionType>(GetField<int8_t>(VT_FUSED_ACTIVATION_FUNCTION, 0));
@@ -2888,11 +2920,15 @@ struct LSTMOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
   float proj_clip() const {
     return GetField<float>(VT_PROJ_CLIP, 0.0f);
   }
+  LSTMKernelType kernel_type() const {
+    return static_cast<LSTMKernelType>(GetField<int8_t>(VT_KERNEL_TYPE, 0));
+  }
   bool Verify(flatbuffers::Verifier &verifier) const {
     return VerifyTableStart(verifier) &&
            VerifyField<int8_t>(verifier, VT_FUSED_ACTIVATION_FUNCTION) &&
            VerifyField<float>(verifier, VT_CELL_CLIP) &&
            VerifyField<float>(verifier, VT_PROJ_CLIP) &&
+           VerifyField<int8_t>(verifier, VT_KERNEL_TYPE) &&
            verifier.EndTable();
   }
   LSTMOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
@@ -2912,6 +2948,9 @@ struct LSTMOptionsBuilder {
   void add_proj_clip(float proj_clip) {
     fbb_.AddElement<float>(LSTMOptions::VT_PROJ_CLIP, proj_clip, 0.0f);
   }
+  void add_kernel_type(LSTMKernelType kernel_type) {
+    fbb_.AddElement<int8_t>(LSTMOptions::VT_KERNEL_TYPE, static_cast<int8_t>(kernel_type), 0);
+  }
   explicit LSTMOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
         : fbb_(_fbb) {
     start_ = fbb_.StartTable();
@@ -2928,10 +2967,12 @@ inline flatbuffers::Offset<LSTMOptions> CreateLSTMOptions(
     flatbuffers::FlatBufferBuilder &_fbb,
     ActivationFunctionType fused_activation_function = ActivationFunctionType_NONE,
     float cell_clip = 0.0f,
-    float proj_clip = 0.0f) {
+    float proj_clip = 0.0f,
+    LSTMKernelType kernel_type = LSTMKernelType_FULL) {
   LSTMOptionsBuilder builder_(_fbb);
   builder_.add_proj_clip(proj_clip);
   builder_.add_cell_clip(cell_clip);
+  builder_.add_kernel_type(kernel_type);
   builder_.add_fused_activation_function(fused_activation_function);
   return builder_.Finish();
 }
@@ -6226,6 +6267,7 @@ inline void LSTMOptions::UnPackTo(LSTMOptionsT *_o, const flatbuffers::resolver_
   { auto _e = fused_activation_function(); _o->fused_activation_function = _e; };
   { auto _e = cell_clip(); _o->cell_clip = _e; };
   { auto _e = proj_clip(); _o->proj_clip = _e; };
+  { auto _e = kernel_type(); _o->kernel_type = _e; };
 }
 
 inline flatbuffers::Offset<LSTMOptions> LSTMOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const LSTMOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
@@ -6239,11 +6281,13 @@ inline flatbuffers::Offset<LSTMOptions> CreateLSTMOptions(flatbuffers::FlatBuffe
   auto _fused_activation_function = _o->fused_activation_function;
   auto _cell_clip = _o->cell_clip;
   auto _proj_clip = _o->proj_clip;
+  auto _kernel_type = _o->kernel_type;
   return tflite::CreateLSTMOptions(
       _fbb,
       _fused_activation_function,
       _cell_clip,
-      _proj_clip);
+      _proj_clip,
+      _kernel_type);
 }
 
 inline ResizeBilinearOptionsT *ResizeBilinearOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
diff --git a/tensorflow/contrib/lite/testing/BUILD b/tensorflow/contrib/lite/testing/BUILD
index 74fc32a12b..80e4c5a4dd 100644
--- a/tensorflow/contrib/lite/testing/BUILD
+++ b/tensorflow/contrib/lite/testing/BUILD
@@ -155,6 +155,7 @@ cc_library(
     deps = [
         ":split",
         ":test_runner",
+        "//tensorflow/contrib/lite:builtin_op_data",
         "//tensorflow/contrib/lite:framework",
         "//tensorflow/contrib/lite/kernels:builtin_ops",
     ],
diff --git a/tensorflow/contrib/lite/testing/generate_examples.py b/tensorflow/contrib/lite/testing/generate_examples.py
index f07e36fc7d..9bb7a4600d 100644
--- a/tensorflow/contrib/lite/testing/generate_examples.py
+++ b/tensorflow/contrib/lite/testing/generate_examples.py
@@ -118,6 +118,8 @@ class ExtraTocoOptions(object):
     self.allow_custom_ops = False
     # Rnn states that are used to support rnn / lstm cells.
     self.rnn_states = None
+    # Split the LSTM inputs from 5 inputs to 18 inputs for TFLite.
+    self.split_tflite_lstm_inputs = None
 
 
 def toco_options(data_types,
@@ -155,6 +157,11 @@ def toco_options(data_types,
     s += " --allow_custom_ops"
   if extra_toco_options.rnn_states:
     s += (" --rnn_states='" + extra_toco_options.rnn_states + "'")
+  if extra_toco_options.split_tflite_lstm_inputs is not None:
+    if extra_toco_options.split_tflite_lstm_inputs:
+      s += " --split_tflite_lstm_inputs=true"
+    else:
+      s += " --split_tflite_lstm_inputs=false"
   return s
 
 
@@ -461,6 +468,11 @@ def make_zip_of_tests(zip_path,
             sess,
             tf.global_variables() + inputs +
             outputs) if use_frozen_graph else sess.graph_def
+
+        if "split_tflite_lstm_inputs" in param_dict_real:
+          extra_toco_options.split_tflite_lstm_inputs = param_dict_real[
+              "split_tflite_lstm_inputs"]
+
         tflite_model_binary, toco_log = toco_convert(
             graph_def.SerializeToString(), input_tensors, output_tensors,
             extra_toco_options)
@@ -2019,6 +2031,7 @@ def make_lstm_tests(zip_path):
           "time_step_size": [1],
           "input_vec_size": [3],
           "num_cells": [4],
+          "split_tflite_lstm_inputs": [True, False],
       },
   ]
 
diff --git a/tensorflow/contrib/lite/testing/tflite_driver.cc b/tensorflow/contrib/lite/testing/tflite_driver.cc
index 8cab6cd8cd..fc28faf524 100644
--- a/tensorflow/contrib/lite/testing/tflite_driver.cc
+++ b/tensorflow/contrib/lite/testing/tflite_driver.cc
@@ -16,6 +16,7 @@ limitations under the License.
 
 #include <iostream>
 
+#include "tensorflow/contrib/lite/builtin_op_data.h"
 #include "tensorflow/contrib/lite/testing/split.h"
 
 namespace tflite {
@@ -290,12 +291,24 @@ void TfLiteDriver::ResetLSTMStateTensors() {
     const auto& node_and_reg = interpreter_->node_and_registration(node_index);
     const auto& node = node_and_reg->first;
     const auto& registration = node_and_reg->second;
-    if (registration.builtin_code == tflite::BuiltinOperator_LSTM &&
-        node.outputs->size >= 2) {
-      // The first 2 outputs of LSTM are state tensors.
-      for (int i = 0; i < 2; ++i) {
-        int node_index = node.outputs->data[i];
-        ResetTensor(node_index);
+
+    if (registration.builtin_code == tflite::BuiltinOperator_LSTM) {
+      const auto* params =
+          reinterpret_cast<const TfLiteLSTMParams*>(node.builtin_data);
+      if (params->kernel_type == kTfLiteLSTMFullKernel &&
+          node.outputs->size >= 2) {
+        // The first 2 outputs of LSTM are state tensors.
+        for (int i = 0; i < 2; ++i) {
+          int node_index = node.outputs->data[i];
+          ResetTensor(node_index);
+        }
+      } else if (params->kernel_type == kTfLiteLSTMBasicKernel &&
+                 node.inputs->size == 5) {
+        // The 2nd and 5th inputs are state tensors.
+        for (int i : {1, 4}) {
+          int node_index = node.inputs->data[i];
+          ResetTensor(node_index);
+        }
       }
     }
   }
diff --git a/tensorflow/contrib/lite/toco/args.h b/tensorflow/contrib/lite/toco/args.h
index 6c0311af0a..77bc54f191 100644
--- a/tensorflow/contrib/lite/toco/args.h
+++ b/tensorflow/contrib/lite/toco/args.h
@@ -242,6 +242,7 @@ struct ParsedTocoFlags {
   Arg<bool> propagate_fake_quant_num_bits = Arg<bool>(false);
   Arg<bool> allow_nudging_weights_to_use_fast_gemm_kernel = Arg<bool>(false);
   Arg<int64> dedupe_array_min_size_bytes = Arg<int64>(64);
+  Arg<bool> split_tflite_lstm_inputs = Arg<bool>(true);
 };
 
 }  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/identify_lstm_merge_inputs.cc b/tensorflow/contrib/lite/toco/graph_transformations/identify_lstm_merge_inputs.cc
index 3f768bfee1..5b6a984ee1 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/identify_lstm_merge_inputs.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/identify_lstm_merge_inputs.cc
@@ -33,9 +33,10 @@ bool MergeLstmCellInputs::Run(Model* model, std::size_t op_index) {
     return false;
   }
 
-  // Already a compact LstmCell with LstmCellOperator::NUM_INPUTS of inputs,
-  // do not need to merge cell inputs.
-  if (src_op->inputs.size() == LstmCellOperator::NUM_INPUTS) {
+  // Already a compact LstmCell. Do not need to merge cell inputs.
+  const auto* src_lstm_op = static_cast<LstmCellOperator*>(src_op);
+  if (src_lstm_op->kernel_type != LstmCellOperator::KERNEL_FULL ||
+      src_lstm_op->inputs.size() != kExtendedLstmInputCount) {
     return false;
   }
 
@@ -136,6 +137,7 @@ bool MergeLstmCellInputs::Run(Model* model, std::size_t op_index) {
 
   // Emplace a new LSTM cell operator (use basic 5 inputs kernel).
   auto lstm_cell_op = absl::make_unique<LstmCellOperator>();
+  lstm_cell_op->kernel_type = LstmCellOperator::KERNEL_BASIC;
 
   // Compact LstmCell's 5 inputs.
   lstm_cell_op->inputs.resize(LstmCellOperator::NUM_INPUTS);
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/identify_lstm_split_inputs.cc b/tensorflow/contrib/lite/toco/graph_transformations/identify_lstm_split_inputs.cc
index 8e66323bd7..e6e3dfa1de 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/identify_lstm_split_inputs.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/identify_lstm_split_inputs.cc
@@ -33,9 +33,10 @@ bool SplitLstmCellInputs::Run(Model* model, std::size_t op_index) {
     return false;
   }
 
-  // Already an extended LstmCell with kExtendedLstmInputCount of inputs,
-  // do not need to split cell inputs.
-  if (curr_op->inputs.size() == kExtendedLstmInputCount) {
+  const auto* curr_lstm_op = static_cast<LstmCellOperator*>(curr_op);
+  // Already an extended LstmCell. Do not need to split cell inputs.
+  if (curr_lstm_op->kernel_type != LstmCellOperator::KERNEL_BASIC ||
+      curr_lstm_op->inputs.size() != LstmCellOperator::NUM_INPUTS) {
     return false;
   }
 
@@ -56,6 +57,7 @@ bool SplitLstmCellInputs::Run(Model* model, std::size_t op_index) {
 
   // Emplace a new LstmCell operator with extended inputs (kernel/lstm.cc).
   auto lstm_cell_op = absl::make_unique<LstmCellOperator>();
+  lstm_cell_op->kernel_type = LstmCellOperator::KERNEL_FULL;
   lstm_cell_op->inputs.resize(kExtendedLstmInputCount);
   int num_input = model->GetArray(curr_op->inputs[LstmCellOperator::DATA_INPUT])
                       .shape()
diff --git a/tensorflow/contrib/lite/toco/model.h b/tensorflow/contrib/lite/toco/model.h
index 9062c03c73..1a4f87e363 100644
--- a/tensorflow/contrib/lite/toco/model.h
+++ b/tensorflow/contrib/lite/toco/model.h
@@ -527,7 +527,15 @@ struct LstmCellOperator : Operator {
     ACTIV_TEMP = 3,
     NUM_OUTPUTS = 4
   };
-  LstmCellOperator() : Operator(OperatorType::kLstmCell) {}
+  enum KernelType {
+    KERNEL_BASIC = 0,
+    KERNEL_FULL = 1,
+  };
+
+  LstmCellOperator()
+      : Operator(OperatorType::kLstmCell), kernel_type(KERNEL_BASIC) {}
+
+  KernelType kernel_type;
 };
 
 // Element-wise multiplication operator.
diff --git a/tensorflow/contrib/lite/toco/tflite/operator.cc b/tensorflow/contrib/lite/toco/tflite/operator.cc
index 84a5410839..a8518adefc 100644
--- a/tensorflow/contrib/lite/toco/tflite/operator.cc
+++ b/tensorflow/contrib/lite/toco/tflite/operator.cc
@@ -626,11 +626,21 @@ class Lstm : public BuiltinOperator<LstmCellOperator, ::tflite::LSTMOptions,
   flatbuffers::Offset<TfLiteOptions> WriteOptions(
       const TocoOperator& op,
       flatbuffers::FlatBufferBuilder* builder) const override {
+    ::tflite::LSTMKernelType kernel_type;
+    switch (op.kernel_type) {
+      case LstmCellOperator::KERNEL_BASIC:
+        kernel_type = ::tflite::LSTMKernelType_BASIC;
+        break;
+      case LstmCellOperator::KERNEL_FULL:
+        kernel_type = ::tflite::LSTMKernelType_FULL;
+        break;
+    }
+
     // Current toco converter only supports tanh, no clip.
     return ::tflite::CreateLSTMOptions(*builder, /*fused_activation_function=*/
                                        ::tflite::ActivationFunctionType_TANH,
                                        /*cell_clip=*/0.0,
-                                       /*proj_clip=*/0.0);
+                                       /*proj_clip=*/0.0, kernel_type);
   }
 
   void ReadOptions(const TfLiteOptions& options,
@@ -638,9 +648,26 @@ class Lstm : public BuiltinOperator<LstmCellOperator, ::tflite::LSTMOptions,
     // Only support tanh activation, so check that tflite type is tanh.
     CHECK(options.fused_activation_function() ==
           ::tflite::ActivationFunctionType_TANH);
+
+    switch (options.kernel_type()) {
+      case ::tflite::LSTMKernelType_BASIC:
+        op->kernel_type = LstmCellOperator::KERNEL_BASIC;
+        break;
+      case ::tflite::LSTMKernelType_FULL:
+        op->kernel_type = LstmCellOperator::KERNEL_FULL;
+        break;
+    }
   }
 
-  int GetVersion(const Operator& op) const override { return 1; }
+  int GetVersion(const Operator& op) const override {
+    const auto& lstm_op = static_cast<const LstmCellOperator&>(op);
+    switch (lstm_op.kernel_type) {
+      case LstmCellOperator::KERNEL_FULL:
+        return 1;
+      case LstmCellOperator::KERNEL_BASIC:
+        return 2;
+    }
+  }
 };
 
 class Mean : public BuiltinOperator<MeanOperator, ::tflite::MeanOptions,
diff --git a/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc b/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc
index 7786a4ada3..9c6ad673ab 100644
--- a/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc
+++ b/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc
@@ -153,6 +153,11 @@ bool ParseTocoFlagsFromCommandLineFlags(
            parsed_flags.dedupe_array_min_size_bytes.default_value(),
            "Minimum size of constant arrays to deduplicate; arrays smaller "
            "will not be deduplicated."),
+      Flag("split_tflite_lstm_inputs",
+           parsed_flags.split_tflite_lstm_inputs.bind(),
+           parsed_flags.split_tflite_lstm_inputs.default_value(),
+           "Split the LSTM inputs from 5 tensors to 18 tensors for TFLite. "
+           "Ignored if the output format is not TFLite."),
   };
   bool asked_for_help =
       *argc == 2 && (!strcmp(argv[1], "--help") || !strcmp(argv[1], "-help"));
@@ -245,6 +250,7 @@ void ReadTocoFlagsFromCommandLineFlags(const ParsedTocoFlags& parsed_toco_flags,
   READ_TOCO_FLAG(allow_nudging_weights_to_use_fast_gemm_kernel,
                  FlagRequirement::kNone);
   READ_TOCO_FLAG(dedupe_array_min_size_bytes, FlagRequirement::kNone);
+  READ_TOCO_FLAG(split_tflite_lstm_inputs, FlagRequirement::kNone);
 
   // Deprecated flag handling.
   if (parsed_toco_flags.input_type.specified()) {
diff --git a/tensorflow/contrib/lite/toco/toco_flags.proto b/tensorflow/contrib/lite/toco/toco_flags.proto
index 8589ca361d..15f755c104 100644
--- a/tensorflow/contrib/lite/toco/toco_flags.proto
+++ b/tensorflow/contrib/lite/toco/toco_flags.proto
@@ -37,7 +37,7 @@ enum FileFormat {
 // of as properties of models, instead describing how models are to be
 // processed in the context of the present tooling job.
 //
-// Next ID to use: 19.
+// Next ID to use: 20.
 message TocoFlags {
   // Input file format
   optional FileFormat input_format = 1;
@@ -165,4 +165,8 @@ message TocoFlags {
   // Minimum size of constant arrays to deduplicate; arrays smaller will not be
   // deduplicated.
   optional int64 dedupe_array_min_size_bytes = 18 [default = 64];
+
+  // Split the LSTM inputs from 5 tensors to 18 tensors for TFLite.
+  // Ignored if the output format is not TFLite.
+  optional bool split_tflite_lstm_inputs = 19 [default = true];
 }
diff --git a/tensorflow/contrib/lite/toco/toco_tooling.cc b/tensorflow/contrib/lite/toco/toco_tooling.cc
index b5531ca2f4..a648883d1f 100644
--- a/tensorflow/contrib/lite/toco/toco_tooling.cc
+++ b/tensorflow/contrib/lite/toco/toco_tooling.cc
@@ -263,7 +263,7 @@ void Transform(const TocoFlags& toco_flags, Model* model) {
     if (!toco_flags.debug_disable_recurrent_cell_fusion()) {
       transformations.Add(new IdentifyLstmCell);
     }
-    if (output_format == TFLITE) {
+    if (output_format == TFLITE && toco_flags.split_tflite_lstm_inputs()) {
       transformations.Add(new toco::SplitLstmCellInputs);
     } else {
       transformations.Add(new toco::MergeLstmCellInputs);
-- 
GitLab


From cd368924989284864e3df2fcbae72a3892bb7afb Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 1 Jun 2018 16:32:20 -0700
Subject: [PATCH 195/610] Allow user to opt out of saving metagraph for TPU
 with TPUEstimator.export_output().

PiperOrigin-RevId: 198944144
---
 .../contrib/tpu/python/tpu/tpu_estimator.py    | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
index 4465833f88..a155de3844 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
@@ -1830,6 +1830,7 @@ class TPUEstimator(estimator_lib.Estimator):
                predict_batch_size=None,
                batch_axis=None,
                eval_on_tpu=True,
+               export_to_tpu=True,
                warm_start_from=None):
     """Constructs an `TPUEstimator` instance.
 
@@ -1872,6 +1873,8 @@ class TPUEstimator(estimator_lib.Estimator):
         False or `PER_HOST_V2`, batch_axis is ignored.
       eval_on_tpu: If False, evaluation runs on CPU or GPU. In this case, the
         model_fn must return `EstimatorSpec` when called with `mode` as `EVAL`.
+      export_to_tpu: If True, `export_savedmodel()` exports a metagraph for
+        serving on TPU besides the one on CPU.
       warm_start_from: Optional string filepath to a checkpoint or SavedModel to
                        warm-start from, or a `tf.estimator.WarmStartSettings`
                        object to fully configure warm-starting.  If the string
@@ -1943,6 +1946,8 @@ class TPUEstimator(estimator_lib.Estimator):
         use_tpu,
         eval_on_tpu)
 
+    self._export_to_tpu = export_to_tpu
+
     self._is_input_fn_invoked = None
 
   def _add_meta_graph_for_mode(self,
@@ -1965,11 +1970,11 @@ class TPUEstimator(estimator_lib.Estimator):
                                                        save_variables,
                                                        mode=mode)
 
-    input_receiver_fn_map = {_REWRITE_FOR_INFERENCE_MODE:
-                             input_receiver_fn_map[mode]}
-    export_tags = [tag_constants.SERVING, tag_constants.TPU]
-    mode = _REWRITE_FOR_INFERENCE_MODE
-    try:
+    if self._export_to_tpu:
+      input_receiver_fn_map = {_REWRITE_FOR_INFERENCE_MODE:
+                               input_receiver_fn_map[mode]}
+      export_tags = [tag_constants.SERVING, tag_constants.TPU]
+      mode = _REWRITE_FOR_INFERENCE_MODE
       (super(TPUEstimator, self).
        _add_meta_graph_for_mode(builder,
                                 input_receiver_fn_map,
@@ -1978,9 +1983,6 @@ class TPUEstimator(estimator_lib.Estimator):
                                 save_variables=False,
                                 mode=mode,
                                 export_tags=export_tags))
-    except Exception as error:  # pylint: disable=broad-except
-      logging.warning('Saving meta graph for TPU failed: {}.'
-                      .format(str(error)))
 
   def _call_model_fn(self, features, labels, mode, config):
     if mode == _REWRITE_FOR_INFERENCE_MODE:
-- 
GitLab


From f84e8257aa88fa45cc7a15835ad386565cd60237 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 1 Jun 2018 16:48:10 -0700
Subject: [PATCH 196/610] Change the Eigen reduction code to use a tree to
 improve numerical stability. This changes the InnerMostDimReducer to use a
 summation tree, which is more numerically stable than the previous approach
 of sequential addition into an accumulator. This solves the issue for
 reduction over all or a trailing subset of dimensions. This change does not
 improve the numerical accuracy for MeanReducer, which maintains state.

Benchmarks show a 40% (AVX) to 50% (SSE) slowdown for small row reductions (sum, float). column- and full reductions are unchanged.

Cleaned up TensorFunctors.h a bit by moving the traits to reducer_traits and updating the code that uses the reducers accordingly.

Introduced a new trait "IsExactlyAssociative" and new template specializations of InnerMostDimReducer to ensure that we only invoke the new and slightly more expensive codepath when it is needed, i.e. for sum reduction of non-integer types.

PiperOrigin-RevId: 198946075
---
 tensorflow/core/kernels/eigen_pooling.h | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/tensorflow/core/kernels/eigen_pooling.h b/tensorflow/core/kernels/eigen_pooling.h
index 2f83780525..56de6b1d43 100644
--- a/tensorflow/core/kernels/eigen_pooling.h
+++ b/tensorflow/core/kernels/eigen_pooling.h
@@ -372,16 +372,23 @@ struct reducer_traits<AvgPoolMeanReducer<float>, Device> {
     Cost = 1,
 #if (EIGEN_ARCH_i386 || EIGEN_ARCH_x86_64) && !defined(__CUDACC__)
     // We only support packet access for floats.
-    PacketAccess = true
+    PacketAccess = true,
 #else
-    PacketAccess = false
+    PacketAccess = false,
 #endif
+    IsStateful = true,
+    IsExactlyAssociative = false
   };
 };
 
 template <>
 struct reducer_traits<AvgPoolMeanReducer<float>, GpuDevice> {
-  enum { Cost = 1, PacketAccess = false };
+  enum {
+    Cost = 1,
+    PacketAccess = false,
+    IsStateful = true,
+    IsExactlyAssociative = false
+  };
 };
 
 }  // namespace internal
-- 
GitLab


From da63752d84b65b238dfcdacb550b41661d0cf211 Mon Sep 17 00:00:00 2001
From: Anna R <annarev@google.com>
Date: Fri, 1 Jun 2018 17:07:29 -0700
Subject: [PATCH 197/610] Internal change.

PiperOrigin-RevId: 198948296
---
 tensorflow/workspace.bzl | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index e4b7f9a695..c072f89965 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -167,8 +167,7 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   tf_http_archive(
       name = "gemmlowp",
       urls = [
-          # TODO (yongtang): uncomment once mirror.bazel.build is propagated.
-          # "https://mirror.bazel.build/github.com/google/gemmlowp/archive/38ebac7b059e84692f53e5938f97a9943c120d98.zip",
+          "https://mirror.bazel.build/github.com/google/gemmlowp/archive/38ebac7b059e84692f53e5938f97a9943c120d98.zip",
           "https://github.com/google/gemmlowp/archive/38ebac7b059e84692f53e5938f97a9943c120d98.zip",
       ],
       sha256 = "b87faa7294dfcc5d678f22a59d2c01ca94ea1e2a3b488c38a95a67889ed0a658",
-- 
GitLab


From 3dd460bb419776e6a4804843eec98e4bf14fdcdd Mon Sep 17 00:00:00 2001
From: Sanjoy Das <sanjoy@google.com>
Date: Fri, 1 Jun 2018 17:21:55 -0700
Subject: [PATCH 198/610] Add an explanatory comment.

PiperOrigin-RevId: 198949796
---
 tensorflow/compiler/aot/tests/BUILD | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tensorflow/compiler/aot/tests/BUILD b/tensorflow/compiler/aot/tests/BUILD
index fd2cf2b67d..0ecc3feeb6 100644
--- a/tensorflow/compiler/aot/tests/BUILD
+++ b/tensorflow/compiler/aot/tests/BUILD
@@ -7,6 +7,10 @@ package(
 load("//tensorflow/compiler/aot:tfcompile.bzl", "tf_library")
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
 
+# We disable some tfcompile tests in the open source build with the
+# "manual" tag to avoid making our OSS users build LLVM twice
+# (once for host and once for target).
+
 test_suite(
     name = "all_tests",
     tags = ["manual"],
-- 
GitLab


From b33ba9a8e7e20e4b2378937204fe74af69982906 Mon Sep 17 00:00:00 2001
From: Brennan Saeta <saeta@google.com>
Date: Fri, 1 Jun 2018 18:00:43 -0700
Subject: [PATCH 199/610] Remove use of absl::make_unique

absl is not yet ready for use by open source TensorFlow. :-(

PiperOrigin-RevId: 198952953
---
 tensorflow/contrib/cloud/kernels/gcs_config_ops.cc | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/cloud/kernels/gcs_config_ops.cc b/tensorflow/contrib/cloud/kernels/gcs_config_ops.cc
index ef4998212e..648a219fb8 100644
--- a/tensorflow/contrib/cloud/kernels/gcs_config_ops.cc
+++ b/tensorflow/contrib/cloud/kernels/gcs_config_ops.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/core/platform/cloud/curl_http_request.h"
 #include "tensorflow/core/platform/cloud/gcs_file_system.h"
 #include "tensorflow/core/platform/cloud/oauth_client.h"
+#include "tensorflow/core/util/ptr_util.h"
 
 namespace tensorflow {
 namespace {
@@ -96,7 +97,8 @@ class GcsCredentialsOpKernel : public OpKernel {
         errors::InvalidArgument("JSON format incompatible; did not find fields "
                                 "`refresh_token` or `private_key`."));
 
-    auto provider = absl::make_unique<ConstantAuthProvider>(json, ctx->env());
+    auto provider =
+        tensorflow::MakeUnique<ConstantAuthProvider>(json, ctx->env());
 
     // Test getting a token
     string dummy_token;
@@ -121,7 +123,7 @@ class GcsCredentialsOpKernel : public OpKernel {
           initial_retry_delay_usec_(initial_retry_delay_usec) {}
 
     ConstantAuthProvider(const Json::Value& json, Env* env)
-        : ConstantAuthProvider(json, absl::make_unique<OAuthClient>(), env,
+        : ConstantAuthProvider(json, tensorflow::MakeUnique<OAuthClient>(), env,
                                kInitialRetryDelayUsec) {}
 
     ~ConstantAuthProvider() override {}
-- 
GitLab


From 6e5606fce0e4615880e2685a3674c498756b9cfb Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 1 Jun 2018 18:01:58 -0700
Subject: [PATCH 200/610] Extract FoldMultiplyIntoConv optimization stage.

PiperOrigin-RevId: 198953044
---
 .../optimizers/arithmetic_optimizer.cc        | 214 ++++++++++--------
 .../optimizers/arithmetic_optimizer.h         |   1 +
 .../optimizers/arithmetic_optimizer_test.cc   |  76 ++++---
 3 files changed, 172 insertions(+), 119 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
index ca3f84a81d..400af82627 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
@@ -1958,6 +1958,127 @@ class ReorderCastAndTranspose : public ArithmeticOptimizerStage {
   bool IsNumberType(DataType dtype) { return kNumberTypes.Contains(dtype); }
 };
 
+// Fold a multiply of a scalar into the following convolution. This folding
+// can jump across nodes that merely reorder data (such as reshape and
+// transpose). For example, we can optimize
+//
+//
+//         Conv2D                             Conv2D
+//        /      \                           /      \
+//    Transpose  weights*       ->     Transpose    Mul
+//       |                                |        /   \
+//      Mul                               |    weights  scale
+//     /   \                              |
+//   input  scale**                     input
+//
+//  *) weights must be a const
+// **) scale must be a const scalar
+//
+// When `weights` and `scale` are constant, `Mul` in the optimized graph can be
+// constant-folded, also weights tend to be smaller than the activations.
+//
+// TODO(jingyue): Fold scalar multiplies to Conv?DBackpropFilter and
+// Conv?DBackpropInput.
+class FoldMultiplyIntoConv : public ArithmeticOptimizerStage {
+ public:
+  explicit FoldMultiplyIntoConv(const GraphOptimizerContext& ctx,
+                                const ArithmeticOptimizerContext& ctx_ext)
+      : ArithmeticOptimizerStage("FoldMultiplyIntoConv", ctx, ctx_ext) {}
+  ~FoldMultiplyIntoConv() override = default;
+
+  bool IsSupported(const NodeDef* node) const override {
+    return IsConv2D(*node) || IsConv3D(*node);
+  }
+
+  Status TrySimplify(NodeDef* node, string* simplified_node_name) override {
+#define TF_RETURN_IF_TRUE(...) \
+  if ((__VA_ARGS__)) return Status::OK()
+
+    NodeDef* conv = node;
+
+    NodeDef* weights;
+    TF_RETURN_IF_ERROR(GetInputNode(conv->input(1), &weights));
+
+    // Fold the multiply to conv only when the weights are constant, so the
+    // multiply can be constant-folded.
+    //
+    // TODO(jingyue): When the weights aren't constant, this should also help
+    // performance a bit and memory usage a lot, since the weights tend to be
+    // smaller than the activations.
+    TF_RETURN_IF_TRUE(!IsConstant(*weights));
+
+    // Verify that this node was not already optimized.
+    const string scaled_weights_node_name =
+        OptimizedNodeName(ParseNodeScopeAndName(weights->name()),
+                          strings::StrCat("scaled", "_", conv->name()));
+
+    TF_RETURN_IF_TRUE(ctx().node_map->NodeExists(scaled_weights_node_name));
+
+    // Find the tail of value preserving chain entering the Conv node.
+    NodeDef* tail = GetTailOfValuePreservingChain(*conv, *ctx().node_map,
+                                                  *ctx().nodes_to_preserve);
+
+    NodeDef* source;
+    TF_RETURN_IF_ERROR(GetInputNode(tail->input(0), &source));
+
+    // Check that value preserving chain is the only consumer of the Mul output.
+    TF_RETURN_IF_TRUE(!IsMul(*source));
+    TF_RETURN_IF_TRUE(NumNonControlOutputs(*source, *ctx().node_map) != 1);
+
+    const NodeDef* mul = source;
+
+    // TODO(jingyue): handle the case where `scale` is 0-th operand.
+    NodeDef* scale;  // scalar multiplier for the input tensor
+    NodeDef* input;
+    TF_RETURN_IF_ERROR(GetInputNode(mul->input(1), &scale));
+    TF_RETURN_IF_ERROR(GetInputNode(mul->input(0), &input));
+
+    // Check that 'scale * weight' can be const folded.
+    TF_RETURN_IF_TRUE(!IsConstant(*scale));
+    TF_RETURN_IF_TRUE(scale->attr().at("dtype").type() !=
+                      weights->attr().at("dtype").type());
+
+    // Check that `scale` is a scalar.
+    const TensorProto& scale_tensor = scale->attr().at("value").tensor();
+    bool scale_is_a_scalar = scale_tensor.has_tensor_shape() &&
+                             scale_tensor.tensor_shape().dim_size() == 0;
+    TF_RETURN_IF_TRUE(!scale_is_a_scalar);
+
+    // At this point all preconditions are met, and we safely do the rewrite.
+    VLOG(3) << "Fold multiply into conv: conv=" << conv->name()
+            << " mul=" << mul->name() << " weights=" << weights->name();
+
+    // Create new node `scaled_weights`.
+    NodeDef* scaled_weights = AddEmptyNode(scaled_weights_node_name);
+    scaled_weights->set_op("Mul");
+    scaled_weights->set_device(weights->device());
+    (*scaled_weights->mutable_attr())["T"] = weights->attr().at("dtype");
+    AddToOptimizationQueue(scaled_weights);
+
+    // Link in its inputs.
+    scaled_weights->add_input(conv->input(1));
+    ctx().node_map->AddOutput(weights->name(), scaled_weights->name());
+    scaled_weights->add_input(mul->input(1));
+    ctx().node_map->AddOutput(scale->name(), scaled_weights->name());
+    ForwardControlDependencies(scaled_weights, {source});
+
+    // Update `conv`'s weights to `scaled_weights`.
+    conv->set_input(1, scaled_weights->name());
+    ctx().node_map->UpdateInput(conv->name(), weights->name(),
+                                scaled_weights->name());
+    AddToOptimizationQueue(conv);
+
+    // Update `tail` node to bypass `mul` because it's folded to the weights.
+    tail->set_input(0, mul->input(0));
+    ctx().node_map->UpdateInput(tail->name(), mul->name(), input->name());
+    AddToOptimizationQueue(tail);
+    *simplified_node_name = conv->name();
+
+    return Status::OK();
+#undef TF_RETURN_IF_TRUE
+  }
+};
+
 }  // namespace
 
 class UniqueNodes {
@@ -2210,97 +2331,6 @@ void ArithmeticOptimizer::ForwardControlDependencies(
 // ArithmeticOptimizerStage
 string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
     const NodeDef* node, SetVector<NodeDef*>* nodes_to_simplify) {
-  // Fold a multiply of a scalar into the following convolution. This folding
-  // can jump across nodes that merely reorders data (such as reshape and
-  // transpose). For example, we can optimize
-  //
-  //
-  //         Conv2D
-  //        /      \
-  //    Transpose  weights
-  //       |
-  //      Mul
-  //     /   \
-  //   inputs 255.0
-  //
-  // to
-  //
-  //         Conv2D
-  //        /      \
-  //    Transpose   Mul
-  //       |       /   \
-  //       |   weights  255.0
-  //       |
-  //     inputs
-  //
-  // when `weights` are constant. `Mul` in the optimized graph can be
-  // constant-folded.
-  //
-  // TODO(jingyue): Fold scalar multiplies to Conv?DBackpropFilter and
-  // Conv?DBackpropInput.
-  if (node->op() == "Conv2D" || node->op() == "Conv3D") {
-    NodeDef* conv = const_cast<NodeDef*>(node);
-    const NodeDef* weights = node_map_->GetNode(NodeName(conv->input(1)));
-    // Fold the multiply to conv only when the weights are constant, so the
-    // multiply can be constant-folded. TODO(jingyue): When the weights aren't
-    // constant, this should also help performance a bit and memory usage a lot,
-    // since the weights tend to be smaller than the activations.
-    if (weights->op() == "Const" &&
-        !OptimizedNodeExists(*weights, StrCat("scaled_", conv->name()))) {
-      const NodeDef* source = node_map_->GetNode(
-          GetTailOfValuePreservingChain(*node, *node_map_, nodes_to_preserve_)
-              ->input(0));
-      if (source->op() == "Mul" &&
-          node_map_->GetOutputs(source->name()).size() == 1) {
-        const NodeDef* mul = source;
-        // `scale` is the scalar multiplier, and `other` is the other operand.
-        // TODO(jingyue): handle the case where `scale` is 0-th operand.
-        const NodeDef* scale = node_map_->GetNode(mul->input(1));
-        const NodeDef* other = node_map_->GetNode(mul->input(0));
-        if (scale->op() == "Const" && scale->attr().at("dtype").type() ==
-                                          weights->attr().at("dtype").type()) {
-          const TensorProto& scale_tensor = scale->attr().at("value").tensor();
-          // Test whether `scale` is a scalar.
-          if (scale_tensor.has_tensor_shape() &&
-              scale_tensor.tensor_shape().dim_size() == 0) {
-            // Create new node `scaled_weights`.
-            NodeDef* scaled_weights = AddNode(
-                *weights, StrCat("scaled_", conv->name()), /*copy_node=*/false);
-            scaled_weights->set_op("Mul");
-            scaled_weights->set_device(weights->device());
-            (*scaled_weights->mutable_attr())["T"] =
-                weights->attr().at("dtype");
-            nodes_to_simplify->PushBack(scaled_weights);
-
-            // Link in its inputs.
-            scaled_weights->add_input(conv->input(1));
-            node_map_->AddOutput(weights->name(), scaled_weights->name());
-            scaled_weights->add_input(mul->input(1));
-            node_map_->AddOutput(scale->name(), scaled_weights->name());
-            ForwardControlDependencies(scaled_weights, {source});
-
-            // Update `conv`'s weights to `scaled_weights`.
-            conv->set_input(1, scaled_weights->name());
-            node_map_->UpdateInput(conv->name(), weights->name(),
-                                   scaled_weights->name());
-            nodes_to_simplify->PushBack(conv);
-
-            // Update `mul`'s consumer to bypass `mul` because it's folded to
-            // the weights.
-            CHECK_EQ(node_map_->GetOutputs(mul->name()).size(), 1);
-            NodeDef* consumer_of_mul =
-                *node_map_->GetOutputs(mul->name()).begin();
-            consumer_of_mul->set_input(0, mul->input(0));
-            node_map_->UpdateInput(consumer_of_mul->name(), mul->name(),
-                                   other->name());
-            nodes_to_simplify->PushBack(consumer_of_mul);
-            return conv->name();
-          }
-        }
-      }
-    }
-  }
-
   if (node->op() == "Mul" && node->input(0) == node->input(1) &&
       !OptimizedNodeExists(*node, "square")) {
     const DataType type = GetDataTypeFromAttr(*node, "T");
@@ -2480,6 +2510,8 @@ Status ArithmeticOptimizer::SimplifyArithmeticOps(bool can_use_shapes) {
 
   if (options_.combine_add_to_addn && can_use_shapes)
     pipeline.AddStage<AddOpsRewriteStage>(ctx, ctx_ext);
+  if (options_.fold_multiply_into_conv)
+    pipeline.AddStage<FoldMultiplyIntoConv>(ctx, ctx_ext);
   if (options_.hoist_common_factor_out_of_aggregation && can_use_shapes)
     pipeline.AddStage<HoistCommonFactorOutOfAggregation>(ctx, ctx_ext);
   if (options_.minimize_broadcasts && can_use_shapes)
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
index 0fce23a40a..ce3c633baf 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
@@ -61,6 +61,7 @@ class ArithmeticOptimizer : public GraphOptimizer {
     bool combine_add_to_addn = true;
     bool convert_sqrt_div_to_rsqrt_mul = false;
     bool dedup_computations = true;
+    bool fold_multiply_into_conv = true;
     bool hoist_common_factor_out_of_aggregation = true;
     bool hoist_cwise_unary_chains = false;
     bool minimize_broadcasts = true;
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
index 02f76df025..b9fec0f860 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
@@ -126,6 +126,7 @@ class ArithmeticOptimizerTest : public GrapplerTest {
     options.enable_try_simplify_and_replace = false;
     options.combine_add_to_addn = false;
     options.convert_sqrt_div_to_rsqrt_mul = false;
+    options.fold_multiply_into_conv = false;
     options.hoist_common_factor_out_of_aggregation = false;
     options.hoist_cwise_unary_chains = false;
     options.minimize_broadcasts = false;
@@ -150,6 +151,11 @@ class ArithmeticOptimizerTest : public GrapplerTest {
     optimizer->options_.combine_add_to_addn = true;
   }
 
+  void EnableOnlyFoldMultipleIntoConv(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);  // Expected to switch all stages off.
+    optimizer->options_.fold_multiply_into_conv = true;  // Re-enable just one.
+  }
+
   void EnableOnlyHoistCommonFactor(ArithmeticOptimizer* optimizer) {
     DisableAllStages(optimizer);
     optimizer->options_.hoist_common_factor_out_of_aggregation = true;
@@ -1462,18 +1468,24 @@ TEST_F(ArithmeticOptimizerTest, FoldMulToTransposeConv) {
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
   GraphDef output;
-  TF_EXPECT_OK(ArithmeticOptimizer().Optimize(nullptr, item, &output));
-
-  item.graph.Swap(&output);
-  TF_EXPECT_OK(ModelPruner().Optimize(nullptr, item, &output));
+  ArithmeticOptimizer optimizer;
+  EnableOnlyFoldMultipleIntoConv(&optimizer);
+  OptimizeTwiceAndPrune(&optimizer, &item, &output);
 
   NodeMap node_map(&output);
+
   // `conv` is now a folded convolution with scaled weights.
   const NodeDef* folded_conv = node_map.GetNode(conv.node()->name());
-  CHECK_EQ(node_map.GetNode(NodeName(folded_conv->input(1)))->op(), "Mul");
+  ASSERT_NE(folded_conv, nullptr);
+
+  const NodeDef* folded_conv_weights = node_map.GetNode(folded_conv->input(1));
+  ASSERT_NE(folded_conv_weights, nullptr);
+  EXPECT_EQ("Mul", folded_conv_weights->op());
+
   // Its input should be a transpose of `inputs`.
   const NodeDef* transpose = node_map.GetNode(NodeName(folded_conv->input(0)));
-  CHECK_EQ(NodeName(transpose->input(0)), inputs.node()->name());
+  ASSERT_NE(transpose, nullptr);
+  EXPECT_EQ("inputs", transpose->input(0));
 }
 
 TEST_F(ArithmeticOptimizerTest, NotFoldMulAcrossPreservedTranspose) {
@@ -1574,28 +1586,32 @@ TEST_F(ArithmeticOptimizerTest, OptimizeCastMulTransposeConv) {
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
   GraphDef output;
-  ArithmeticOptimizer optimizer;
+  ArithmeticOptimizer optimizer;  // all optimization stages are on
   OptimizeTwiceAndPrune(&optimizer, &item, &output, /*const_folding=*/true);
 
   NodeMap node_map(&output);
 
-  // Expected names for the optimized nodes.
+  // Expected names for reordered cast and transpose.
   const string p = "ArithmeticOptimizer/ReorderCastAndTranspose_";
   const string optimized_cast_name = strings::StrCat(p, "float_Cast");
   const string optimized_transpose_name = strings::StrCat(p, "uint8_Transpose");
 
+  // Expected names for folded multiply and conv.
+  const string optimized_weights =
+      "ArithmeticOptimizer/FoldMultiplyIntoConv_scaled_Conv2D_weights";
+
   const NodeDef* inputs_node = node_map.GetNode("Placeholder");
   const NodeDef* transpose_node = node_map.GetNode(optimized_transpose_name);
   const NodeDef* cast_node = node_map.GetNode(optimized_cast_name);
-  const NodeDef* weights_node =
-      node_map.GetNode(OptimizedName("weights_scaled_Conv2D"));
+
+  const NodeDef* weights_node = node_map.GetNode(optimized_weights);
   const NodeDef* conv_node = node_map.GetNode("Conv2D");
 
-  ASSERT_TRUE(inputs_node != nullptr);
-  ASSERT_TRUE(transpose_node != nullptr);
-  ASSERT_TRUE(cast_node != nullptr);
-  ASSERT_TRUE(weights_node != nullptr);
-  ASSERT_TRUE(conv_node != nullptr);
+  ASSERT_NE(inputs_node, nullptr);
+  ASSERT_NE(transpose_node, nullptr);
+  ASSERT_NE(cast_node, nullptr);
+  ASSERT_NE(weights_node, nullptr);
+  ASSERT_NE(conv_node, nullptr);
 
   EXPECT_EQ(output.node_size(), 7);
   EXPECT_EQ(transpose_node->input(0), inputs_node->name());
@@ -1627,23 +1643,27 @@ TEST_F(ArithmeticOptimizerTest, OptimizeMultipleMulTransposeConv) {
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
   GraphDef output;
-  TF_EXPECT_OK(ArithmeticOptimizer().Optimize(nullptr, item, &output));
+  ArithmeticOptimizer optimizer;
+  EnableOnlyFoldMultipleIntoConv(&optimizer);
+  OptimizeTwiceAndPrune(&optimizer, &item, &output, /*const_folding=*/true);
 
-  item.graph.Swap(&output);
-  TF_EXPECT_OK(
-      ConstantFolding(/*cpu_device=*/nullptr).Optimize(nullptr, item, &output));
+  NodeMap node_map(&output);
 
-  item.graph.Swap(&output);
-  TF_EXPECT_OK(ModelPruner().Optimize(nullptr, item, &output));
+  using strings::StrCat;
+  const string p = "ArithmeticOptimizer/FoldMultiplyIntoConv_";
+  const string optimized_weights = StrCat(p, "scaled_Conv2D_weights");
+  const string optimized_weights_1 = StrCat(p, "scaled_Conv2D_1_weights_1");
 
-  NodeMap node_map(&output);
-  const NodeDef* weights_node =
-      CHECK_NOTNULL(node_map.GetNode(OptimizedName("weights_scaled_Conv2D")));
-  const NodeDef* conv_node = CHECK_NOTNULL(node_map.GetNode("Conv2D"));
+  const NodeDef* weights_node = node_map.GetNode(optimized_weights);
+  const NodeDef* weights_node_1 = node_map.GetNode(optimized_weights_1);
+  const NodeDef* conv_node = node_map.GetNode("Conv2D");
+  const NodeDef* conv_node_1 = node_map.GetNode("Conv2D_1");
+
+  ASSERT_NE(weights_node, nullptr);
+  ASSERT_NE(weights_node_1, nullptr);
+  ASSERT_NE(conv_node, nullptr);
+  ASSERT_NE(conv_node_1, nullptr);
 
-  const NodeDef* weights_node_1 =
-      CHECK_NOTNULL(node_map.GetNode(OptimizedName("weights_scaled_Conv2D_1")));
-  const NodeDef* conv_node_1 = CHECK_NOTNULL(node_map.GetNode("Conv2D_1"));
   EXPECT_EQ(conv_node->input(1), weights_node->name());
   EXPECT_EQ(conv_node_1->input(1), weights_node_1->name());
 }
-- 
GitLab


From d81328115bd10de70570c46dbfc683cd0238d779 Mon Sep 17 00:00:00 2001
From: Kay Zhu <kayzhu@google.com>
Date: Fri, 1 Jun 2018 18:09:31 -0700
Subject: [PATCH 201/610] [XLA] Add comments for the Reduce->Reshape simplifier
 pass.

Also forcing reduction order for init to be on lhs for ReduceWindow->Map pass.

PiperOrigin-RevId: 198953817
---
 tensorflow/compiler/xla/service/algebraic_simplifier.cc | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc
index e1a45e453e..dc5f1b31bf 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc
@@ -1774,6 +1774,10 @@ Status AlgebraicSimplifierVisitor::HandleReduce(HloInstruction* reduce) {
                     new_reduce_dimensions, function));
   }
 
+  // If the reduction results in the same number of elements, then the only
+  // possible side effect would be a reshape. Since the init_value is an
+  // identity of the reduction function, we can therefore replace the reduce
+  // with a simple reshape, ignoring the reduction function completely.
   if (ShapeUtil::ElementsIn(reduce->shape()) ==
       ShapeUtil::ElementsIn(arg->shape())) {
     return ReplaceWithNewInstruction(
@@ -1842,7 +1846,7 @@ Status AlgebraicSimplifierVisitor::HandleReduceWindow(
     return ReplaceWithNewInstruction(
         reduce_window,
         HloInstruction::CreateMap(reduce_window->shape(),
-                                  {operand, reduce_window->mutable_operand(1)},
+                                  {reduce_window->mutable_operand(1), operand},
                                   function));
   }
 
-- 
GitLab


From dbdd276a05c417963b3f06f71e801540bde9ab7c Mon Sep 17 00:00:00 2001
From: Suharsh Sivakumar <suharshs@google.com>
Date: Fri, 1 Jun 2018 18:30:32 -0700
Subject: [PATCH 202/610] Quantize weights transformation for toco.

Finds float weight tensors, quantizes them to 8 bits, and adds Dequantize operations after them.

PiperOrigin-RevId: 198955123
---
 tensorflow/contrib/lite/toco/BUILD            |   1 +
 tensorflow/contrib/lite/toco/args.h           |   1 +
 .../lite/toco/g3doc/cmdline_reference.md      |   4 +
 .../graph_transformations.h                   |   1 +
 .../graph_transformations/quantize_weights.cc | 108 +++++++++++
 .../toco/graph_transformations/tests/BUILD    |  20 ++-
 .../tests/quantize_weights_test.cc            | 167 ++++++++++++++++++
 .../resolve_constant_concatenation_test.cc    |   4 +-
 .../contrib/lite/toco/toco_cmdline_flags.cc   |  11 ++
 tensorflow/contrib/lite/toco/toco_flags.proto |   7 +-
 tensorflow/contrib/lite/toco/toco_tooling.cc  |   3 +
 11 files changed, 319 insertions(+), 8 deletions(-)
 create mode 100644 tensorflow/contrib/lite/toco/graph_transformations/quantize_weights.cc
 create mode 100644 tensorflow/contrib/lite/toco/graph_transformations/tests/quantize_weights_test.cc

diff --git a/tensorflow/contrib/lite/toco/BUILD b/tensorflow/contrib/lite/toco/BUILD
index b8acc9a8e0..7ea4f32ef6 100644
--- a/tensorflow/contrib/lite/toco/BUILD
+++ b/tensorflow/contrib/lite/toco/BUILD
@@ -245,6 +245,7 @@ cc_library(
         "graph_transformations/quantization_util.cc",
         "graph_transformations/quantization_util.h",
         "graph_transformations/quantize.cc",
+        "graph_transformations/quantize_weights.cc",
         "graph_transformations/read_fake_quant_min_max.cc",
         "graph_transformations/remove_final_dequantize_op.cc",
         "graph_transformations/remove_tensorflow_assert.cc",
diff --git a/tensorflow/contrib/lite/toco/args.h b/tensorflow/contrib/lite/toco/args.h
index 77bc54f191..9f5ca66d05 100644
--- a/tensorflow/contrib/lite/toco/args.h
+++ b/tensorflow/contrib/lite/toco/args.h
@@ -234,6 +234,7 @@ struct ParsedTocoFlags {
   Arg<bool> drop_fake_quant = Arg<bool>(false);
   Arg<bool> reorder_across_fake_quant = Arg<bool>(false);
   Arg<bool> allow_custom_ops = Arg<bool>(false);
+  Arg<bool> quantize_weights = Arg<bool>(false);
   // Deprecated flags
   Arg<string> input_type;
   Arg<string> input_types;
diff --git a/tensorflow/contrib/lite/toco/g3doc/cmdline_reference.md b/tensorflow/contrib/lite/toco/g3doc/cmdline_reference.md
index 9e99287f82..a8381169b8 100644
--- a/tensorflow/contrib/lite/toco/g3doc/cmdline_reference.md
+++ b/tensorflow/contrib/lite/toco/g3doc/cmdline_reference.md
@@ -203,6 +203,10 @@ have.
     graph transformations on them, at the cost of no longer faithfully matching
     inference and training arithmetic.
 
+*   `--quantize_weights`. Type: boolean. Default: false. Store weights as
+    quantized weights followed by dequantize operations. Computation is still
+    done in float, but model size shrinks (at the cost of accuracy and latency).
+
 ## Logging flags
 
 The following are standard Google logging flags:
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h b/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
index 8da242aa9c..1bc7557d46 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
+++ b/tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h
@@ -139,6 +139,7 @@ DECLARE_GRAPH_TRANSFORMATION(PropagateFakeQuantNumBits);
 DECLARE_GRAPH_TRANSFORMATION(PropagateFixedSizes)
 DECLARE_GRAPH_TRANSFORMATION(HardcodeMinMax)
 DECLARE_GRAPH_TRANSFORMATION(Quantize)
+DECLARE_GRAPH_TRANSFORMATION(QuantizeWeights)
 DECLARE_GRAPH_TRANSFORMATION(RemoveFinalDequantizeOp)
 DECLARE_GRAPH_TRANSFORMATION(RemoveTensorFlowAssert)
 DECLARE_GRAPH_TRANSFORMATION(RemoveTensorFlowIdentity)
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/quantize_weights.cc b/tensorflow/contrib/lite/toco/graph_transformations/quantize_weights.cc
new file mode 100644
index 0000000000..88ea0945e7
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/quantize_weights.cc
@@ -0,0 +1,108 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <iterator>
+#include <string>
+#include <vector>
+
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/graph_transformations/quantization_util.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+
+namespace toco {
+
+namespace {
+
+// The minimum number of elements (not bytes) a weights array must have to be
+// quantized by this transformation; arrays below it are left as float.
+// TODO(suharshs): Make this minimum size configurable.
+const int kWeightsMinSize = 1024;
+
+// Derives uint8 quantization params from the [min, max] of float `array`.
+void GetQuantizationParamsFromArray(const Array& array,
+                                    QuantizationParams* params) {
+  const auto& values = array.GetBuffer<ArrayDataType::kFloat>().data;
+  CHECK(!values.empty()) << "Cannot derive a value range from an empty array.";
+  const auto extremes = std::minmax_element(values.begin(), values.end());
+  MinMax toco_minmax;
+  toco_minmax.min = *extremes.first;
+  toco_minmax.max = *extremes.second;
+  GetQuantizationParams(ArrayDataType::kUint8, toco_minmax, params);
+}
+
+}  // namespace
+
+// Stores an operator's large constant float weights as uint8 + Dequantize.
+bool QuantizeWeights::Run(Model* model, std::size_t op_index) {
+  const auto target_it = model->operators.begin() + op_index;
+  Operator* const target = target_it->get();
+
+  // Locate the weights input; operators without weights are left untouched.
+  int weights_pos = 0;
+  switch (target->type) {
+    case OperatorType::kConv:
+    case OperatorType::kDepthwiseConv:
+    case OperatorType::kFullyConnected:
+      weights_pos = 1;
+      break;
+    case OperatorType::kLstmCell:
+      weights_pos = LstmCellOperator::WEIGHTS_INPUT;
+      break;
+    default:
+      return false;
+  }
+
+  // Return early if the array isn't a constant param; this can happen in early
+  // transformation passes until transpose operations following the weight array
+  // are resolved.
+  const string weights_name = target->inputs[weights_pos];
+  if (!IsConstantParameterArray(*model, weights_name)) return false;
+
+  // Only float weights are quantized.
+  Array& weights_array = model->GetArray(weights_name);
+  if (weights_array.data_type != ArrayDataType::kFloat) return false;
+
+  // Small tensors don't take up too much space and can result in bad
+  // quantization results, so they are skipped as well.
+  const auto& values = weights_array.GetBuffer<ArrayDataType::kFloat>().data;
+  if (values.size() < kWeightsMinSize) return false;
+
+  // Quantize the weights to kUint8 with params derived from their values.
+  QuantizationParams qparams;
+  GetQuantizationParamsFromArray(weights_array, &qparams);
+  QuantizeArray(this, model, weights_name, ArrayDataType::kUint8, qparams);
+
+  // Insert a Dequantize operator immediately before the consuming operator.
+  // NOTE: `target_it` is not used past this point, since inserting into the
+  // operator list may invalidate it; `target` is the raw pointer taken from
+  // the list element above and is what the rest of the function uses.
+  auto* dequantize = new DequantizeOperator;
+  model->operators.emplace(target_it, dequantize);
+
+  // Create the float array that carries the dequantized weights between the
+  // Dequantize operator and the original operator.
+  const string dequantized_name =
+      AvailableArrayName(*model, weights_name + "_dequantized");
+  model->GetOrCreateArray(dequantized_name).data_type = ArrayDataType::kFloat;
+
+  // Wire everything up: weights -> Dequantize -> dequantized array -> target.
+  dequantize->inputs = {weights_name};
+  dequantize->outputs = {dequantized_name};
+  target->inputs[weights_pos] = dequantized_name;
+
+  return true;
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/tests/BUILD b/tensorflow/contrib/lite/toco/graph_transformations/tests/BUILD
index 8dcd4adc90..95e8433be2 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/tests/BUILD
+++ b/tensorflow/contrib/lite/toco/graph_transformations/tests/BUILD
@@ -8,8 +8,8 @@ load(
 )
 
 tf_cc_test(
-    name = "resolve_constant_concatenation_test",
-    srcs = ["resolve_constant_concatenation_test.cc"],
+    name = "lstm_utils_test",
+    srcs = ["lstm_utils_test.cc"],
     deps = [
         "//tensorflow/contrib/lite/toco:graph_transformations",
         "//tensorflow/contrib/lite/toco:model",
@@ -19,8 +19,20 @@ tf_cc_test(
 )
 
 tf_cc_test(
-    name = "lstm_utils_test",
-    srcs = ["lstm_utils_test.cc"],
+    name = "quantize_weights_test",
+    srcs = ["quantize_weights_test.cc"],
+    deps = [
+        "//tensorflow/contrib/lite/toco:graph_transformations",
+        "//tensorflow/contrib/lite/toco:model",
+        "//tensorflow/contrib/lite/toco:tooling_util",
+        "@com_google_absl//absl/memory",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
+
+tf_cc_test(
+    name = "resolve_constant_concatenation_test",
+    srcs = ["resolve_constant_concatenation_test.cc"],
     deps = [
         "//tensorflow/contrib/lite/toco:graph_transformations",
         "//tensorflow/contrib/lite/toco:model",
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/tests/quantize_weights_test.cc b/tensorflow/contrib/lite/toco/graph_transformations/tests/quantize_weights_test.cc
new file mode 100644
index 0000000000..c05eb0929f
--- /dev/null
+++ b/tensorflow/contrib/lite/toco/graph_transformations/tests/quantize_weights_test.cc
@@ -0,0 +1,167 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <math.h>
+#include <string>
+#include <vector>
+
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+#include "absl/memory/memory.h"
+#include "tensorflow/contrib/lite/toco/graph_transformations/graph_transformations.h"
+#include "tensorflow/contrib/lite/toco/model.h"
+#include "tensorflow/contrib/lite/toco/tooling_util.h"
+
+namespace toco {
+
+class QuantizeWeightsTest : public ::testing::Test {
+ protected:
+  QuantizeWeightsTest() {}
+
+  // The name of the weights input array.
+  const string kWeightsName = "weights";
+  // The zero_point of the values in the input array.
+  const int kZeroPoint = 128;
+
+  // Prepare a hypothetical TOCO model of a quantizable fully connected float
+  // layer whose arrays all have `kDim` dimensions of size `elements_per_dim`.
+  void PrepareModel(Model* model, int elements_per_dim) {
+    std::vector<string> fc_input_names = {"inputs", kWeightsName};
+
+    const int kDim = 4;
+    // Compute elements_per_dim^kDim with integer arithmetic; truncating the
+    // floating-point result of std::pow to int could be off by one.
+    int buf_size = 1;
+    for (int dim = 0; dim < kDim; dim++) {
+      buf_size *= elements_per_dim;
+    }
+    // Initialize the array with values from -128.0 to 127.0, since these values
+    // should be exactly representable by quantization.
+    std::vector<float> in_buf(buf_size);
+    for (int i = 0; i < buf_size; i++) {
+      in_buf[i] = static_cast<float>(i % 256 - kZeroPoint);
+    }
+
+    for (const string& fc_input_name : fc_input_names) {
+      Array& in_array = model->GetOrCreateArray(fc_input_name);
+      in_array.data_type = ArrayDataType::kFloat;
+
+      // Initialize the shape and the contents of the input array.
+      Shape* in_array_shape = in_array.mutable_shape();
+      in_array_shape->mutable_dims()->resize(kDim, elements_per_dim);
+      in_array.GetMutableBuffer<ArrayDataType::kFloat>().data = in_buf;
+    }
+
+    // Add the FullyConnected operator and declare its float output array.
+    auto* fc_op = new FullyConnectedOperator;
+    fc_op->inputs = fc_input_names;
+    fc_op->outputs = {"fc_op_outputs"};
+    Array& out_array = model->GetOrCreateArray(fc_op->outputs[0]);
+    out_array.data_type = ArrayDataType::kFloat;
+    Shape* out_array_shape = out_array.mutable_shape();
+    std::vector<int>* out_array_shape_dim = out_array_shape->mutable_dims();
+    out_array_shape_dim->resize(kDim, elements_per_dim);
+    model->operators.push_back(std::unique_ptr<Operator>(fc_op));
+  }
+};
+
+TEST_F(QuantizeWeightsTest, QuantizedFullyConnected) {
+  // Test that weight arrays that are large enough are quantized.
+  Model model;
+  // 6 elements per dim gives us 1296 elements, enough to be quantized.
+  PrepareModel(&model, 6);
+
+  // Check the state of the graph before the transformation.
+  const auto& float_array_map = model.GetArrayMap();
+  EXPECT_EQ(float_array_map.size(), 3);
+  // Before the transformation, all arrays should be type float.
+  for (const auto& element : float_array_map) {
+    EXPECT_EQ(element.second->data_type, ArrayDataType::kFloat);
+  }
+  const std::vector<float> float_weight_vals =
+      model.GetArray(kWeightsName).GetBuffer<ArrayDataType::kFloat>().data;
+
+  // Invoke the transformation; it returns true when it quantizes weights.
+  GraphTransformationsSet graph_transformation_set;
+  graph_transformation_set.Add(new toco::QuantizeWeights);
+  EXPECT_TRUE((*graph_transformation_set.begin())->Run(&model, /*op_index=*/0));
+
+  // After the transformation, three arrays should be type float and one array
+  // should be uint8.
+  const auto& quantized_array_map = model.GetArrayMap();
+  EXPECT_EQ(quantized_array_map.size(), 4);
+  int num_float = 0;
+  int num_uint8 = 0;
+  for (const auto& element : quantized_array_map) {
+    if (element.second->data_type == ArrayDataType::kFloat) {
+      num_float++;
+    } else if (element.second->data_type == ArrayDataType::kUint8) {
+      num_uint8++;
+    } else {
+      FAIL() << "Unexpected array type.";
+    }
+  }
+  EXPECT_EQ(num_float, 3);
+  EXPECT_EQ(num_uint8, 1);
+  // Ensure that the values were quantized correctly.
+  const std::vector<uint8>& quantized_weight_vals =
+      model.GetArray(kWeightsName).GetBuffer<ArrayDataType::kUint8>().data;
+  ASSERT_EQ(quantized_weight_vals.size(), float_weight_vals.size());
+  for (size_t i = 0; i < quantized_weight_vals.size(); i++) {
+    EXPECT_EQ(quantized_weight_vals[i], float_weight_vals[i] + kZeroPoint);
+  }
+
+  // Ensure a Dequantize op was inserted before, and feeds, the FC operator.
+  ASSERT_EQ(model.operators.size(), 2);
+  EXPECT_EQ(model.operators[0]->type, OperatorType::kDequantize);
+  EXPECT_EQ(model.operators[1]->inputs[1], model.operators[0]->outputs[0]);
+}
+
+TEST_F(QuantizeWeightsTest, NotQuantizedFullyConnected) {
+  // Test that weight arrays that are too small are left untouched.
+  Model model;
+  // 5 elements per dim gives us 625 elements, NOT enough to be quantized.
+  PrepareModel(&model, 5);
+
+  // Check the state of the graph before the transformation.
+  const auto& float_array_map = model.GetArrayMap();
+  EXPECT_EQ(float_array_map.size(), 3);
+  // Before the transformation, all arrays should be type float.
+  for (const auto& element : float_array_map) {
+    EXPECT_EQ(element.second->data_type, ArrayDataType::kFloat);
+  }
+  const std::vector<float> float_weight_vals =
+      model.GetArray(kWeightsName).GetBuffer<ArrayDataType::kFloat>().data;
+
+  // Invoke the transformation; it returns false when it skips the weights.
+  GraphTransformationsSet graph_transformation_set;
+  graph_transformation_set.Add(new toco::QuantizeWeights);
+  EXPECT_FALSE(
+      (*graph_transformation_set.begin())->Run(&model, /*op_index=*/0));
+
+  // Check the state of the graph after the transformation: no array or
+  // operator was added and every array is still float.
+  const auto& post_array_map = model.GetArrayMap();
+  EXPECT_EQ(post_array_map.size(), 3);
+  EXPECT_EQ(model.operators.size(), 1);
+  for (const auto& element : post_array_map) {
+    EXPECT_EQ(element.second->data_type, ArrayDataType::kFloat);
+  }
+  // Ensure that the values remain unchanged.
+  const std::vector<float>& post_weight_vals =
+      model.GetArray(kWeightsName).GetBuffer<ArrayDataType::kFloat>().data;
+  EXPECT_EQ(post_weight_vals, float_weight_vals);
+}
+
+}  // namespace toco
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/tests/resolve_constant_concatenation_test.cc b/tensorflow/contrib/lite/toco/graph_transformations/tests/resolve_constant_concatenation_test.cc
index 3a1d175b98..66cfed4ac2 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/tests/resolve_constant_concatenation_test.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/tests/resolve_constant_concatenation_test.cc
@@ -12,9 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include <memory>
 #include <string>
-#include <unordered_map>
 #include <vector>
 
 #include <gmock/gmock.h>
@@ -126,7 +124,7 @@ class ResolveConstantConcatenationTest : public ::testing::Test {
       Array& in_array = model->GetOrCreateArray(concat_input_name);
       in_array.data_type = ArrayDataType::kFloat;
 
-      // Initialize shape for the input  array.
+      // Initialize shape for the input array.
       Shape* in_array_shape = in_array.mutable_shape();
       std::vector<int>* in_array_shape_dim = in_array_shape->mutable_dims();
       for (int i = 0; i < kDim; i++) {
diff --git a/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc b/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc
index 9c6ad673ab..87a1e429b9 100644
--- a/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc
+++ b/tensorflow/contrib/lite/toco/toco_cmdline_flags.cc
@@ -158,6 +158,11 @@ bool ParseTocoFlagsFromCommandLineFlags(
            parsed_flags.split_tflite_lstm_inputs.default_value(),
            "Split the LSTM inputs from 5 tensors to 18 tensors for TFLite. "
            "Ignored if the output format is not TFLite."),
+      Flag("quantize_weights", parsed_flags.quantize_weights.bind(),
+           parsed_flags.quantize_weights.default_value(),
+           "Store weights as quantized weights followed by dequantize "
+           "operations. Computation is still done in float, but reduces model "
+           "size (at the cost of accuracy and latency)."),
   };
   bool asked_for_help =
       *argc == 2 && (!strcmp(argv[1], "--help") || !strcmp(argv[1], "-help"));
@@ -251,6 +256,7 @@ void ReadTocoFlagsFromCommandLineFlags(const ParsedTocoFlags& parsed_toco_flags,
                  FlagRequirement::kNone);
   READ_TOCO_FLAG(dedupe_array_min_size_bytes, FlagRequirement::kNone);
   READ_TOCO_FLAG(split_tflite_lstm_inputs, FlagRequirement::kNone);
+  READ_TOCO_FLAG(quantize_weights, FlagRequirement::kNone);
 
   // Deprecated flag handling.
   if (parsed_toco_flags.input_type.specified()) {
@@ -284,6 +290,11 @@ void ReadTocoFlagsFromCommandLineFlags(const ParsedTocoFlags& parsed_toco_flags,
     QCHECK(toco::IODataType_Parse(input_types[0], &input_type));
     toco_flags->set_inference_input_type(input_type);
   }
+  if (parsed_toco_flags.quantize_weights.value()) {
+    QCHECK_NE(toco_flags->inference_type(), IODataType::QUANTIZED_UINT8)
+        << "quantize_weights is not supported with inference_type "
+           "QUANTIZED_UINT8.";
+  }
 
 #undef READ_TOCO_FLAG
 #undef PARSE_TOCO_FLAG
diff --git a/tensorflow/contrib/lite/toco/toco_flags.proto b/tensorflow/contrib/lite/toco/toco_flags.proto
index 15f755c104..4fe57879fb 100644
--- a/tensorflow/contrib/lite/toco/toco_flags.proto
+++ b/tensorflow/contrib/lite/toco/toco_flags.proto
@@ -37,7 +37,7 @@ enum FileFormat {
 // of as properties of models, instead describing how models are to be
 // processed in the context of the present tooling job.
 //
-// Next ID to use: 20.
+// Next ID to use: 21.
 message TocoFlags {
   // Input file format
   optional FileFormat input_format = 1;
@@ -169,4 +169,9 @@ message TocoFlags {
   // Split the LSTM inputs from 5 tensors to 18 tensors for TFLite.
   // Ignored if the output format is not TFLite.
   optional bool split_tflite_lstm_inputs = 19 [default = true];
+
+  // Store weights as quantized weights followed by dequantize operations.
+  // Computation is still done in float, but the model size is reduced (at the
+  // cost of accuracy and latency).
+  optional bool quantize_weights = 20 [default = false];
 }
diff --git a/tensorflow/contrib/lite/toco/toco_tooling.cc b/tensorflow/contrib/lite/toco/toco_tooling.cc
index a648883d1f..1fe76f8163 100644
--- a/tensorflow/contrib/lite/toco/toco_tooling.cc
+++ b/tensorflow/contrib/lite/toco/toco_tooling.cc
@@ -269,6 +269,9 @@ void Transform(const TocoFlags& toco_flags, Model* model) {
       transformations.Add(new toco::MergeLstmCellInputs);
     }
   }
+  if (toco_flags.quantize_weights()) {
+    transformations.Add(new QuantizeWeights);
+  }
   transformations.Add(new ResolveConstantConcatenation);
   RunGraphTransformations(model, "general graph transformations",
                           transformations);
-- 
GitLab


From d077fb3bcc0483f6326714161bb4b3f51a078332 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 1 Jun 2018 21:20:58 -0700
Subject: [PATCH 203/610] Replace boilerplate code with function template.

PiperOrigin-RevId: 198963930
---
 .../contrib/lite/toco/import_tensorflow.cc    | 561 ++----------------
 1 file changed, 64 insertions(+), 497 deletions(-)

diff --git a/tensorflow/contrib/lite/toco/import_tensorflow.cc b/tensorflow/contrib/lite/toco/import_tensorflow.cc
index 94ec7c24d4..0a57015d29 100644
--- a/tensorflow/contrib/lite/toco/import_tensorflow.cc
+++ b/tensorflow/contrib/lite/toco/import_tensorflow.cc
@@ -656,81 +656,6 @@ void ConvertRandomUniform(const NodeDef& node,
   model->operators.emplace_back(std::move(op));
 }
 
-void ConvertReluOperator(const NodeDef& node,
-                         const TensorFlowImportFlags& tf_import_flags,
-                         Model* model) {
-  CHECK_EQ(node.op(), "Relu");
-  CheckInputsCount(node, tf_import_flags, 1);
-  const auto& input_name = node.input(0);
-  auto* relu = new ReluOperator;
-  relu->inputs.push_back(input_name);
-  relu->outputs.push_back(node.name());
-  model->operators.emplace_back(relu);
-}
-
-void ConvertRelu6Operator(const NodeDef& node,
-                          const TensorFlowImportFlags& tf_import_flags,
-                          Model* model) {
-  CHECK_EQ(node.op(), "Relu6");
-  CheckInputsCount(node, tf_import_flags, 1);
-
-  const auto& input_name = node.input(0);
-  auto* op = new Relu6Operator;
-  op->inputs.push_back(input_name);
-  op->outputs.push_back(node.name());
-  model->operators.emplace_back(op);
-}
-
-void ConvertLogOperator(const NodeDef& node,
-                        const TensorFlowImportFlags& tf_import_flags,
-                        Model* model) {
-  CHECK_EQ(node.op(), "Log");
-  CheckInputsCount(node, tf_import_flags, 1);
-
-  auto op = absl::make_unique<LogOperator>();
-  op->inputs.push_back(node.input(0));
-  op->outputs.push_back(node.name());
-  model->operators.emplace_back(std::move(op));
-}
-
-void ConvertLogisticOperator(const NodeDef& node,
-                             const TensorFlowImportFlags& tf_import_flags,
-                             Model* model) {
-  CHECK_EQ(node.op(), "Sigmoid");
-  CheckInputsCount(node, tf_import_flags, 1);
-
-  const auto& input_name = node.input(0);
-  auto* op = new LogisticOperator;
-  op->inputs.push_back(input_name);
-  op->outputs.push_back(node.name());
-  model->operators.emplace_back(op);
-}
-
-void ConvertTanhOperator(const NodeDef& node,
-                         const TensorFlowImportFlags& tf_import_flags,
-                         Model* model) {
-  CHECK_EQ(node.op(), "Tanh");
-  CheckInputsCount(node, tf_import_flags, 1);
-
-  const auto& input_name = node.input(0);
-  auto* op = new TanhOperator;
-  op->inputs.push_back(input_name);
-  op->outputs.push_back(node.name());
-  model->operators.emplace_back(op);
-}
-
-void ConvertDivOperator(const NodeDef& node,
-                        const TensorFlowImportFlags& tf_import_flags,
-                        Model* model) {
-  CHECK(node.op() == "Div" || node.op() == "RealDiv");
-  CheckInputsCount(node, tf_import_flags, 2);
-  auto* op = new DivOperator;
-  op->inputs.push_back(node.input(0));
-  op->inputs.push_back(node.input(1));
-  op->outputs.push_back(node.name());
-  model->operators.emplace_back(op);
-}
-
 void ConvertIdentityOperator(const NodeDef& node,
                              const TensorFlowImportFlags& tf_import_flags,
                              Model* model) {
@@ -787,38 +712,6 @@ void ConvertFakeQuantWithMinMaxVars(
   model->operators.emplace_back(op);
 }
 
-void ConvertNegOperator(const NodeDef& node,
-                        const TensorFlowImportFlags& tf_import_flags,
-                        Model* model) {
-  CHECK_EQ(node.op(), "Neg");
-  CheckInputsCount(node, tf_import_flags, 1);
-  auto* op = new NegOperator;
-  op->inputs.push_back(node.input(0));
-  op->outputs.push_back(node.name());
-  model->operators.emplace_back(op);
-}
-
-void ConvertRsqrtOperator(const NodeDef& node,
-                          const TensorFlowImportFlags& tf_import_flags,
-                          Model* model) {
-  CHECK_EQ(node.op(), "Rsqrt");
-  CheckInputsCount(node, tf_import_flags, 1);
-  auto* op = new TensorFlowRsqrtOperator;
-  op->inputs.push_back(node.input(0));
-  op->outputs.push_back(node.name());
-  model->operators.emplace_back(op);
-}
-
-void ConvertSqrtOperator(const NodeDef& node,
-                         const TensorFlowImportFlags& tf_import_flags,
-                         Model* model) {
-  CHECK_EQ(node.op(), "Sqrt");
-  CheckInputsCount(node, tf_import_flags, 1);
-  auto* op = new TensorFlowSqrtOperator;
-  op->inputs.push_back(node.input(0));
-  op->outputs.push_back(node.name());
-  model->operators.emplace_back(op);
-}
 
 void ConvertSqueezeOperator(const NodeDef& node,
                             const TensorFlowImportFlags& tf_import_flags,
@@ -840,66 +733,6 @@ void ConvertSqueezeOperator(const NodeDef& node,
   model->operators.emplace_back(op);
 }
 
-void ConvertSquareOperator(const NodeDef& node,
-                           const TensorFlowImportFlags& tf_import_flags,
-                           Model* model) {
-  CHECK_EQ(node.op(), "Square");
-  CheckInputsCount(node, tf_import_flags, 1);
-  auto* op = new TensorFlowSquareOperator;
-  op->inputs.push_back(node.input(0));
-  op->outputs.push_back(node.name());
-  model->operators.emplace_back(op);
-}
-
-void ConvertAddOperator(const NodeDef& node,
-                        const TensorFlowImportFlags& tf_import_flags,
-                        Model* model) {
-  CHECK_EQ(node.op(), "Add");
-  CheckInputsCount(node, tf_import_flags, 2);
-  auto* op = new AddOperator;
-  op->inputs.push_back(node.input(0));
-  op->inputs.push_back(node.input(1));
-  op->outputs.push_back(node.name());
-  model->operators.emplace_back(op);
-}
-
-void ConvertAddNOperator(const NodeDef& node,
-                         const TensorFlowImportFlags& tf_import_flags,
-                         Model* model) {
-  CHECK_EQ(node.op(), "AddN");
-  const int num_inputs = GetInputsCount(node, tf_import_flags);
-  auto* op = new AddNOperator;
-  for (int i = 0; i < num_inputs; ++i) {
-    op->inputs.push_back(node.input(i));
-  }
-  op->outputs.push_back(node.name());
-  model->operators.emplace_back(op);
-}
-
-void ConvertMulOperator(const NodeDef& node,
-                        const TensorFlowImportFlags& tf_import_flags,
-                        Model* model) {
-  CHECK_EQ(node.op(), "Mul");
-  CheckInputsCount(node, tf_import_flags, 2);
-  auto* op = new MulOperator;
-  op->inputs.push_back(node.input(0));
-  op->inputs.push_back(node.input(1));
-  op->outputs.push_back(node.name());
-  model->operators.emplace_back(op);
-}
-
-void ConvertSubOperator(const NodeDef& node,
-                        const TensorFlowImportFlags& tf_import_flags,
-                        Model* model) {
-  CHECK_EQ(node.op(), "Sub");
-  CheckInputsCount(node, tf_import_flags, 2);
-  auto* op = new SubOperator;
-  op->inputs.push_back(node.input(0));
-  op->inputs.push_back(node.input(1));
-  op->outputs.push_back(node.name());
-  model->operators.emplace_back(op);
-}
-
 void ConvertSumOperator(const NodeDef& node,
                         const TensorFlowImportFlags& tf_import_flags,
                         Model* model) {
@@ -915,67 +748,6 @@ void ConvertSumOperator(const NodeDef& node,
   }
 }
 
-void ConvertTileOperator(const NodeDef& node,
-                         const TensorFlowImportFlags& tf_import_flags,
-                         Model* model) {
-  CHECK_EQ(node.op(), "Tile");
-  CheckInputsCount(node, tf_import_flags, 2);
-  auto* op = new TensorFlowTileOperator;
-  op->inputs.push_back(node.input(0));
-  op->inputs.push_back(node.input(1));
-  op->outputs.push_back(node.name());
-  model->operators.emplace_back(op);
-}
-
-void ConvertSliceOperator(const NodeDef& node,
-                          const TensorFlowImportFlags& tf_import_flags,
-                          Model* model) {
-  CHECK_EQ(node.op(), "Slice");
-  CheckInputsCount(node, tf_import_flags, 3);
-  auto* op = new SliceOperator;
-  for (int i = 0; i < 3; ++i) {
-    op->inputs.push_back(node.input(i));
-  }
-  op->outputs.push_back(node.name());
-  model->operators.emplace_back(op);
-}
-
-void ConvertPadOperator(const NodeDef& node,
-                        const TensorFlowImportFlags& tf_import_flags,
-                        Model* model) {
-  CHECK_EQ(node.op(), "Pad");
-  CheckInputsCount(node, tf_import_flags, 2);
-  auto* op = new PadOperator;
-  op->inputs.push_back(node.input(0));
-  op->inputs.push_back(node.input(1));
-  op->outputs.push_back(node.name());
-  model->operators.emplace_back(op);
-}
-
-void ConvertPadV2Operator(const NodeDef& node,
-                          const TensorFlowImportFlags& tf_import_flags,
-                          Model* model) {
-  CHECK_EQ(node.op(), "PadV2");
-  CheckInputsCount(node, tf_import_flags, 3);
-  auto* op = new PadV2Operator;
-  op->inputs.push_back(node.input(0));
-  op->inputs.push_back(node.input(1));
-  op->inputs.push_back(node.input(2));
-  op->outputs.push_back(node.name());
-  model->operators.emplace_back(op);
-}
-
-void ConvertShapeOperator(const NodeDef& node,
-                          const TensorFlowImportFlags& tf_import_flags,
-                          Model* model) {
-  CHECK_EQ(node.op(), "Shape");
-  CheckInputsCount(node, tf_import_flags, 1);
-  auto* op = new TensorFlowShapeOperator;
-  op->inputs.push_back(node.input(0));
-  op->outputs.push_back(node.name());
-  model->operators.emplace_back(op);
-}
-
 void ConvertSplitOperator(const NodeDef& node,
                           const TensorFlowImportFlags& tf_import_flags,
                           Model* model) {
@@ -993,18 +765,6 @@ void ConvertSplitOperator(const NodeDef& node,
   model->operators.emplace_back(op);
 }
 
-void ConvertMergeOperator(const NodeDef& node,
-                          const TensorFlowImportFlags& tf_import_flags,
-                          Model* model) {
-  CHECK_EQ(node.op(), "Merge");
-  CheckInputsCount(node, tf_import_flags, 2);
-  auto* op = new TensorFlowMergeOperator;
-  op->inputs.push_back(node.input(0));
-  op->inputs.push_back(node.input(1));
-  op->outputs.push_back(node.name());
-  model->operators.emplace_back(op);
-}
-
 void ConvertSwitchOperator(const NodeDef& node,
                            const TensorFlowImportFlags& tf_import_flags,
                            Model* model) {
@@ -1034,18 +794,6 @@ void ConvertSoftmaxOperator(const NodeDef& node,
   model->operators.emplace_back(softmax);
 }
 
-void ConvertLogSoftmaxOperator(const NodeDef& node,
-                               const TensorFlowImportFlags& tf_import_flags,
-                               Model* model) {
-  CHECK_EQ(node.op(), "LogSoftmax");
-  CheckInputsCount(node, tf_import_flags, 1);
-  const auto& input_name = node.input(0);
-  auto* log_softmax = new LogSoftmaxOperator;
-  log_softmax->inputs.push_back(input_name);
-  log_softmax->outputs.push_back(node.name());
-  model->operators.emplace_back(log_softmax);
-}
-
 void ConvertLRNOperator(const NodeDef& node,
                         const TensorFlowImportFlags& tf_import_flags,
                         Model* model) {
@@ -1142,17 +890,6 @@ void ConvertAvgPoolOperator(const NodeDef& node,
   model->operators.emplace_back(avgpool);
 }
 
-void ConvertReshapeOperator(const NodeDef& node,
-                            const TensorFlowImportFlags& tf_import_flags,
-                            Model* model) {
-  CHECK_EQ(node.op(), "Reshape");
-  CheckInputsCount(node, tf_import_flags, 2);
-  auto* op = new TensorFlowReshapeOperator;
-  op->inputs.push_back(node.input(0));
-  op->inputs.push_back(node.input(1));
-  op->outputs.push_back(node.name());
-  model->operators.emplace_back(op);
-}
 
 void ConvertBatchMatMulOperator(const NodeDef& node,
                                 const TensorFlowImportFlags& tf_import_flags,
@@ -1215,24 +952,12 @@ void ConvertConcatOperator(const NodeDef& node,
   model->operators.emplace_back(op);
 }
 
-void ConvertAllOperator(const NodeDef& node,
-                        const TensorFlowImportFlags& tf_import_flags,
-                        Model* model) {
-  CHECK_EQ(node.op(), "All");
-  auto* op = new TensorFlowAllOperator;
-  const int num_inputs = GetInputsCount(node, tf_import_flags);
-  for (int i = 0; i < num_inputs; ++i) {
-    op->inputs.push_back(node.input(i));
-  }
-  op->outputs.push_back(node.name());
-  model->operators.emplace_back(op);
-}
-
-void ConvertAssertOperator(const NodeDef& node,
+// This method supports simple operators without additional attributes.
+template <typename Op>
+void ConvertSimpleOperator(const NodeDef& node,
                            const TensorFlowImportFlags& tf_import_flags,
                            Model* model) {
-  CHECK_EQ(node.op(), "Assert");
-  auto* op = new TensorFlowAssertOperator;
+  auto* op = new Op;
   const int num_inputs = GetInputsCount(node, tf_import_flags);
   for (int i = 0; i < num_inputs; ++i) {
     op->inputs.push_back(node.input(i));
@@ -1241,69 +966,13 @@ void ConvertAssertOperator(const NodeDef& node,
   model->operators.emplace_back(op);
 }
 
-void ConvertLessOperator(const NodeDef& node,
-                         const TensorFlowImportFlags& tf_import_flags,
-                         Model* model) {
-  CHECK_EQ(node.op(), "Less");
-  auto* op = new TensorFlowLessOperator;
-  const int num_inputs = GetInputsCount(node, tf_import_flags);
-  for (int i = 0; i < num_inputs; ++i) {
-    op->inputs.push_back(node.input(i));
-  }
-  op->outputs.push_back(node.name());
-  model->operators.emplace_back(op);
-}
-
-void ConvertLessEqualOperator(const NodeDef& node,
-                              const TensorFlowImportFlags& tf_import_flags,
-                              Model* model) {
-  CHECK_EQ(node.op(), "LessEqual");
-  auto* op = new TensorFlowLessEqualOperator;
-  const int num_inputs = GetInputsCount(node, tf_import_flags);
-  for (int i = 0; i < num_inputs; ++i) {
-    op->inputs.push_back(node.input(i));
-  }
-  op->outputs.push_back(node.name());
-  model->operators.emplace_back(op);
-}
-
-void ConvertSinOperator(const NodeDef& node,
-                        const TensorFlowImportFlags& tf_import_flags,
-                        Model* model) {
-  CHECK_EQ(node.op(), "Sin");
-  auto* op = new SinOperator;
-  const int num_inputs = GetInputsCount(node, tf_import_flags);
-  for (int i = 0; i < num_inputs; ++i) {
-    op->inputs.push_back(node.input(i));
-  }
-  op->outputs.push_back(node.name());
-  model->operators.emplace_back(op);
-}
-
-void ConvertGreaterOperator(const NodeDef& node,
-                            const TensorFlowImportFlags& tf_import_flags,
-                            Model* model) {
-  CHECK_EQ(node.op(), "Greater");
-  auto* op = new TensorFlowGreaterOperator;
-  const int num_inputs = GetInputsCount(node, tf_import_flags);
-  for (int i = 0; i < num_inputs; ++i) {
-    op->inputs.push_back(node.input(i));
-  }
-  op->outputs.push_back(node.name());
-  model->operators.emplace_back(op);
-}
-
-void ConvertGreaterEqualOperator(const NodeDef& node,
-                                 const TensorFlowImportFlags& tf_import_flags,
-                                 Model* model) {
-  CHECK_EQ(node.op(), "GreaterEqual");
-  auto* op = new TensorFlowGreaterEqualOperator;
-  const int num_inputs = GetInputsCount(node, tf_import_flags);
-  for (int i = 0; i < num_inputs; ++i) {
-    op->inputs.push_back(node.input(i));
-  }
-  op->outputs.push_back(node.name());
-  model->operators.emplace_back(op);
+// Like the overload above, but first runs CheckInputsCount for NumInputs.
+template <typename Op, unsigned int NumInputs>
+void ConvertSimpleOperator(const NodeDef& node,
+                           const TensorFlowImportFlags& tf_import_flags,
+                           Model* model) {
+  CheckInputsCount(node, tf_import_flags, NumInputs);
+  ConvertSimpleOperator<Op>(node, tf_import_flags, model);
 }
 
 void ConvertMaxOperator(const NodeDef& node,
@@ -1336,29 +1005,6 @@ void ConvertMinOperator(const NodeDef& node,
   }
 }
 
-void ConvertMaximumOperator(const NodeDef& node,
-                            const TensorFlowImportFlags& tf_import_flags,
-                            Model* model) {
-  CHECK_EQ(node.op(), "Maximum");
-  CheckInputsCount(node, tf_import_flags, 2);
-  auto* op = new TensorFlowMaximumOperator;
-  op->inputs.push_back(node.input(0));
-  op->inputs.push_back(node.input(1));
-  op->outputs.push_back(node.name());
-  model->operators.emplace_back(op);
-}
-
-void ConvertMinimumOperator(const NodeDef& node,
-                            const TensorFlowImportFlags& tf_import_flags,
-                            Model* model) {
-  CHECK_EQ(node.op(), "Minimum");
-  CheckInputsCount(node, tf_import_flags, 2);
-  auto* op = new TensorFlowMinimumOperator;
-  op->inputs.push_back(node.input(0));
-  op->inputs.push_back(node.input(1));
-  op->outputs.push_back(node.name());
-  model->operators.emplace_back(op);
-}
 
 void ConvertUnsupportedOperator(const NodeDef& node,
                                 const TensorFlowImportFlags& tf_import_flags,
@@ -1387,19 +1033,6 @@ void ConvertUnsupportedOperator(const NodeDef& node,
   }
 }
 
-void ConvertSelectOperator(const NodeDef& node,
-                           const TensorFlowImportFlags& tf_import_flags,
-                           Model* model) {
-  CheckInputsCount(node, tf_import_flags, 3);
-
-  auto* op = new SelectOperator;
-  for (const auto& input : node.input()) {
-    op->inputs.push_back(input);
-  }
-  op->outputs.push_back(node.name());
-  model->operators.emplace_back(op);
-}
-
 void ConvertStridedSliceOperator(const NodeDef& node,
                                  const TensorFlowImportFlags& tf_import_flags,
                                  Model* model) {
@@ -1678,17 +1311,6 @@ void ConvertBatchToSpaceNDOperator(const NodeDef& node,
   model->operators.emplace_back(op);
 }
 
-void ConvertExpOperator(const NodeDef& node,
-                        const TensorFlowImportFlags& tf_import_flags,
-                        Model* model) {
-  CHECK_EQ(node.op(), "Exp");
-  CheckInputsCount(node, tf_import_flags, 1);
-  auto* op = new ExpOperator;
-  op->inputs.push_back(node.input(0));
-  op->outputs.push_back(node.name());
-  model->operators.emplace_back(op);
-}
-
 void ConvertMeanOperator(const NodeDef& node,
                          const TensorFlowImportFlags& tf_import_flags,
                          Model* model) {
@@ -1802,53 +1424,6 @@ void ConvertTransposeConvOperator(const NodeDef& node,
   model->operators.emplace_back(op);
 }
 
-void ConvertExpandDimsOperator(const NodeDef& node,
-                               const TensorFlowImportFlags& tf_import_flags,
-                               Model* model) {
-  CHECK_EQ(node.op(), "ExpandDims");
-  CheckInputsCount(node, tf_import_flags, 2);
-  auto* op = new ExpandDimsOperator;
-  op->inputs.push_back(node.input(0));
-  op->inputs.push_back(node.input(1));
-  op->outputs.push_back(node.name());
-  model->operators.emplace_back(op);
-}
-
-void ConvertFillOperator(const NodeDef& node,
-                         const TensorFlowImportFlags& tf_import_flags,
-                         Model* model) {
-  CHECK_EQ(node.op(), "Fill");
-  CheckInputsCount(node, tf_import_flags, 2);
-  auto* op = new FillOperator;
-  op->inputs.push_back(node.input(0));
-  op->inputs.push_back(node.input(1));
-  op->outputs.push_back(node.name());
-  model->operators.emplace_back(op);
-}
-
-void ConvertFloorDivOperator(const NodeDef& node,
-                             const TensorFlowImportFlags& tf_import_flags,
-                             Model* model) {
-  CHECK_EQ(node.op(), "FloorDiv");
-  CheckInputsCount(node, tf_import_flags, 2);
-  auto* op = new FloorDivOperator;
-  op->inputs.push_back(node.input(0));
-  op->inputs.push_back(node.input(1));
-  op->outputs.push_back(node.name());
-  model->operators.emplace_back(op);
-}
-
-void ConvertFloorModOperator(const NodeDef& node,
-                             const TensorFlowImportFlags& tf_import_flags,
-                             Model* model) {
-  CHECK_EQ(node.op(), "FloorMod");
-  CheckInputsCount(node, tf_import_flags, 2);
-  auto* op = new FloorModOperator;
-  op->inputs.push_back(node.input(0));
-  op->inputs.push_back(node.input(1));
-  op->outputs.push_back(node.name());
-  model->operators.emplace_back(op);
-}
 
 void ConvertRangeOperator(const NodeDef& node,
                           const TensorFlowImportFlags& tf_import_flags,
@@ -1869,17 +1444,6 @@ void ConvertRangeOperator(const NodeDef& node,
   model->operators.emplace_back(op);
 }
 
-void ConvertRankOperator(const NodeDef& node,
-                         const TensorFlowImportFlags& tf_import_flags,
-                         Model* model) {
-  CHECK_EQ(node.op(), "Rank");
-  CheckInputsCount(node, tf_import_flags, 1);
-  auto* op = new RankOperator;
-  op->inputs.push_back(node.input(0));
-  op->outputs.push_back(node.name());
-  model->operators.emplace_back(op);
-}
-
 void ConvertStackOperator(const NodeDef& node,
                           const TensorFlowImportFlags& tf_import_flags,
                           Model* model) {
@@ -1900,17 +1464,6 @@ void ConvertStackOperator(const NodeDef& node,
   model->operators.emplace_back(op);
 }
 
-void ConvertTransposeOperator(const NodeDef& node,
-                              const TensorFlowImportFlags& tf_import_flags,
-                              Model* model) {
-  CHECK_EQ(node.op(), "Transpose");
-  CheckInputsCount(node, tf_import_flags, 2);
-  auto* op = new TransposeOperator;
-  op->inputs.push_back(node.input(0));
-  op->inputs.push_back(node.input(1));
-  op->outputs.push_back(node.name());
-  model->operators.emplace_back(op);
-}
 
 // Some TensorFlow ops only occur in graph cycles, representing
 // control flow. We do not currently support control flow, so we wouldn't
@@ -2174,25 +1727,26 @@ Status ImportTensorFlowNode(const tensorflow::NodeDef& node,
   } else if (node.op() == "BiasAdd") {
     ConvertBiasAddOperator(node, tf_import_flags, model);
   } else if (node.op() == "Relu") {
-    ConvertReluOperator(node, tf_import_flags, model);
+    ConvertSimpleOperator<ReluOperator, 1>(node, tf_import_flags, model);
   } else if (node.op() == "Relu6") {
-    ConvertRelu6Operator(node, tf_import_flags, model);
+    ConvertSimpleOperator<Relu6Operator, 1>(node, tf_import_flags, model);
   } else if (node.op() == "Sigmoid") {
-    ConvertLogisticOperator(node, tf_import_flags, model);
+    ConvertSimpleOperator<LogisticOperator, 1>(node, tf_import_flags, model);
   } else if (node.op() == "Tanh") {
-    ConvertTanhOperator(node, tf_import_flags, model);
+    ConvertSimpleOperator<TanhOperator, 1>(node, tf_import_flags, model);
   } else if (node.op() == "MaxPool") {
     ConvertMaxPoolOperator(node, tf_import_flags, model);
   } else if (node.op() == "AvgPool") {
     ConvertAvgPoolOperator(node, tf_import_flags, model);
   } else if (node.op() == "Reshape") {
-    ConvertReshapeOperator(node, tf_import_flags, model);
+    ConvertSimpleOperator<TensorFlowReshapeOperator, 2>(node, tf_import_flags,
+                                                        model);
   } else if (node.op() == "BatchMatMul") {
     ConvertBatchMatMulOperator(node, tf_import_flags, model);
   } else if (node.op() == "MatMul") {
     ConvertMatMulOperator(node, tf_import_flags, model);
   } else if (node.op() == "Div" || node.op() == "RealDiv") {
-    ConvertDivOperator(node, tf_import_flags, model);
+    ConvertSimpleOperator<DivOperator, 2>(node, tf_import_flags, model);
   } else if (node.op() == "Identity" || node.op() == "CheckNumerics" ||
              node.op() == "StopGradient") {
     ConvertIdentityOperator(node, tf_import_flags, model);
@@ -2201,27 +1755,31 @@ Status ImportTensorFlowNode(const tensorflow::NodeDef& node,
   } else if (node.op() == "FakeQuantWithMinMaxArgs") {
     ConvertFakeQuantWithMinMaxArgs(node, tf_import_flags, model);
   } else if (node.op() == "Neg") {
-    ConvertNegOperator(node, tf_import_flags, model);
+    ConvertSimpleOperator<NegOperator, 1>(node, tf_import_flags, model);
   } else if (node.op() == "Rsqrt") {
-    ConvertRsqrtOperator(node, tf_import_flags, model);
+    ConvertSimpleOperator<TensorFlowRsqrtOperator, 1>(node, tf_import_flags,
+                                                      model);
   } else if (node.op() == "Squeeze") {
     ConvertSqueezeOperator(node, tf_import_flags, model);
   } else if (node.op() == "Sqrt") {
-    ConvertSqrtOperator(node, tf_import_flags, model);
+    ConvertSimpleOperator<TensorFlowSqrtOperator, 1>(node, tf_import_flags,
+                                                     model);
   } else if (node.op() == "Square") {
-    ConvertSquareOperator(node, tf_import_flags, model);
+    ConvertSimpleOperator<TensorFlowSquareOperator, 1>(node, tf_import_flags,
+                                                       model);
   } else if (node.op() == "Add") {
-    ConvertAddOperator(node, tf_import_flags, model);
+    ConvertSimpleOperator<AddOperator, 2>(node, tf_import_flags, model);
   } else if (node.op() == "AddN") {
-    ConvertAddNOperator(node, tf_import_flags, model);
+    ConvertSimpleOperator<AddNOperator>(node, tf_import_flags, model);
   } else if (node.op() == "Mul") {
-    ConvertMulOperator(node, tf_import_flags, model);
+    ConvertSimpleOperator<MulOperator, 2>(node, tf_import_flags, model);
   } else if (node.op() == "Sub") {
-    ConvertSubOperator(node, tf_import_flags, model);
+    ConvertSimpleOperator<SubOperator, 2>(node, tf_import_flags, model);
   } else if (node.op() == "Sum") {
     ConvertSumOperator(node, tf_import_flags, model);
   } else if (node.op() == "Tile") {
-    ConvertTileOperator(node, tf_import_flags, model);
+    ConvertSimpleOperator<TensorFlowTileOperator, 2>(node, tf_import_flags,
+                                                     model);
   } else if (node.op() == "Concat" || node.op() == "ConcatV2") {
     ConvertConcatOperator(node, tf_import_flags, model);
   } else if (node.op() == "LRN") {
@@ -2229,41 +1787,50 @@ Status ImportTensorFlowNode(const tensorflow::NodeDef& node,
   } else if (node.op() == "Softmax") {
     ConvertSoftmaxOperator(node, tf_import_flags, model);
   } else if (node.op() == "Log") {
-    ConvertLogOperator(node, tf_import_flags, model);
+    ConvertSimpleOperator<LogOperator, 1>(node, tf_import_flags, model);
   } else if (node.op() == "LogSoftmax") {
-    ConvertLogSoftmaxOperator(node, tf_import_flags, model);
+    ConvertSimpleOperator<LogSoftmaxOperator, 1>(node, tf_import_flags, model);
   } else if (node.op() == "All") {
-    ConvertAllOperator(node, tf_import_flags, model);
+    ConvertSimpleOperator<TensorFlowAllOperator>(node, tf_import_flags, model);
   } else if (node.op() == "Assert") {
-    ConvertAssertOperator(node, tf_import_flags, model);
+    ConvertSimpleOperator<TensorFlowAssertOperator>(node, tf_import_flags,
+                                                    model);
   } else if (node.op() == "Less") {
-    ConvertLessOperator(node, tf_import_flags, model);
+    ConvertSimpleOperator<TensorFlowLessOperator, 2>(node, tf_import_flags,
+                                                     model);
   } else if (node.op() == "LessEqual") {
-    ConvertLessEqualOperator(node, tf_import_flags, model);
+    ConvertSimpleOperator<TensorFlowLessEqualOperator, 2>(node, tf_import_flags,
+                                                          model);
   } else if (node.op() == "Greater") {
-    ConvertGreaterOperator(node, tf_import_flags, model);
+    ConvertSimpleOperator<TensorFlowGreaterOperator, 2>(node, tf_import_flags,
+                                                        model);
   } else if (node.op() == "GreaterEqual") {
-    ConvertGreaterEqualOperator(node, tf_import_flags, model);
+    ConvertSimpleOperator<TensorFlowGreaterEqualOperator, 2>(
+        node, tf_import_flags, model);
   } else if (node.op() == "Max") {
     ConvertMaxOperator(node, tf_import_flags, model);
   } else if (node.op() == "Min") {
     ConvertMinOperator(node, tf_import_flags, model);
   } else if (node.op() == "Maximum") {
-    ConvertMaximumOperator(node, tf_import_flags, model);
+    ConvertSimpleOperator<TensorFlowMaximumOperator, 2>(node, tf_import_flags,
+                                                        model);
   } else if (node.op() == "Minimum") {
-    ConvertMinimumOperator(node, tf_import_flags, model);
+    ConvertSimpleOperator<TensorFlowMinimumOperator, 2>(node, tf_import_flags,
+                                                        model);
   } else if (node.op() == "Merge") {
-    ConvertMergeOperator(node, tf_import_flags, model);
+    ConvertSimpleOperator<TensorFlowMergeOperator, 2>(node, tf_import_flags,
+                                                      model);
   } else if (node.op() == "Pad") {
-    ConvertPadOperator(node, tf_import_flags, model);
+    ConvertSimpleOperator<PadOperator, 2>(node, tf_import_flags, model);
   } else if (node.op() == "PadV2") {
-    ConvertPadV2Operator(node, tf_import_flags, model);
+    ConvertSimpleOperator<PadV2Operator, 3>(node, tf_import_flags, model);
   } else if (node.op() == "StridedSlice") {
     ConvertStridedSliceOperator(node, tf_import_flags, model);
   } else if (node.op() == "Shape") {
-    ConvertShapeOperator(node, tf_import_flags, model);
+    ConvertSimpleOperator<TensorFlowShapeOperator, 1>(node, tf_import_flags,
+                                                      model);
   } else if (node.op() == "Slice") {
-    ConvertSliceOperator(node, tf_import_flags, model);
+    ConvertSimpleOperator<SliceOperator, 3>(node, tf_import_flags, model);
   } else if (node.op() == "Split") {
     ConvertSplitOperator(node, tf_import_flags, model);
   } else if (node.op() == "Switch") {
@@ -2300,25 +1867,25 @@ Status ImportTensorFlowNode(const tensorflow::NodeDef& node,
   } else if (node.op() == "NextIteration") {
     ConvertOperatorSpecialCasedAsRNNBackEdge(node, tf_import_flags, model);
   } else if (node.op() == "ExpandDims") {
-    ConvertExpandDimsOperator(node, tf_import_flags, model);
+    ConvertSimpleOperator<ExpandDimsOperator, 2>(node, tf_import_flags, model);
   } else if (node.op() == "Fill") {
-    ConvertFillOperator(node, tf_import_flags, model);
+    ConvertSimpleOperator<FillOperator, 2>(node, tf_import_flags, model);
   } else if (node.op() == "FloorDiv") {
-    ConvertFloorDivOperator(node, tf_import_flags, model);
+    ConvertSimpleOperator<FloorDivOperator, 2>(node, tf_import_flags, model);
   } else if (node.op() == "FloorMod") {
-    ConvertFloorModOperator(node, tf_import_flags, model);
+    ConvertSimpleOperator<FloorModOperator, 2>(node, tf_import_flags, model);
   } else if (node.op() == "Range") {
     ConvertRangeOperator(node, tf_import_flags, model);
   } else if (node.op() == "Rank") {
-    ConvertRankOperator(node, tf_import_flags, model);
+    ConvertSimpleOperator<RankOperator, 1>(node, tf_import_flags, model);
   } else if (node.op() == "Stack" || node.op() == "Pack") {
     ConvertStackOperator(node, tf_import_flags, model);
   } else if (node.op() == "Transpose") {
-    ConvertTransposeOperator(node, tf_import_flags, model);
+    ConvertSimpleOperator<TransposeOperator, 2>(node, tf_import_flags, model);
   } else if (node.op() == "ArgMax") {
     ConvertArgMaxOperator(node, tf_import_flags, model);
   } else if (node.op() == "Exp") {
-    ConvertExpOperator(node, tf_import_flags, model);
+    ConvertSimpleOperator<ExpOperator, 1>(node, tf_import_flags, model);
   } else if (node.op() == "TopK" || node.op() == "TopKV2") {
     ConvertTopKV2Operator(node, tf_import_flags, model);
   } else if (node.op() == "DynamicPartition") {
@@ -2329,9 +1896,9 @@ Status ImportTensorFlowNode(const tensorflow::NodeDef& node,
   } else if (node.op() == "RandomUniform") {
     ConvertRandomUniform(node, tf_import_flags, model);
   } else if (node.op() == "Sin") {
-    ConvertSinOperator(node, tf_import_flags, model);
+    ConvertSimpleOperator<SinOperator, 1>(node, tf_import_flags, model);
   } else if (node.op() == "Select") {
-    ConvertSelectOperator(node, tf_import_flags, model);
+    ConvertSimpleOperator<SelectOperator, 3>(node, tf_import_flags, model);
   } else if (node.op() == "SparseToDense") {
     ConvertSparseToDenseOperator(node, tf_import_flags, model);
   } else {
-- 
GitLab


From 14daf02aed8d54d14c0b235fe331e3757a0640df Mon Sep 17 00:00:00 2001
From: Loo Rong Jie <loorongjie@gmail.com>
Date: Sat, 2 Jun 2018 12:29:12 +0800
Subject: [PATCH 204/610] [XLA] Explicitly use ::xla::Layout

MSVC uses delayed template parsing, so it mistakes `Layout` for the
`::xla::match::Layout` declared below rather than `::xla::Layout`.
---
 tensorflow/compiler/xla/service/pattern_matcher.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/compiler/xla/service/pattern_matcher.h b/tensorflow/compiler/xla/service/pattern_matcher.h
index d3bc47e61e..2515222cf2 100644
--- a/tensorflow/compiler/xla/service/pattern_matcher.h
+++ b/tensorflow/compiler/xla/service/pattern_matcher.h
@@ -204,7 +204,7 @@ class LayoutPattern {
   // Modifies the pattern to match only if the layout equals the given proto.
   // The layout must outlive the returned pattern.
   constexpr LayoutPattern<LayoutType, LayoutPatternEqualImpl<Impl>> EqualTo(
-      const Layout* layout) const {
+      const ::xla::Layout* layout) const {
     return LayoutPattern<LayoutType, LayoutPatternEqualImpl<Impl>>(
         LayoutPatternEqualImpl<Impl>(impl_, layout), matched_layout_);
   }
-- 
GitLab


From 0303c029d99c4080a3929a8320d9972cc4b973d5 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Sat, 2 Jun 2018 15:28:04 +0000
Subject: [PATCH 205/610] Remove duplicate imports

Inside ffmpeg/__init__.py the last import line:
```
from tensorflow.contrib.ffmpeg.ffmpeg_ops import decode_video
```
is a duplicate of the previous import. This fix removes the duplicate.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/contrib/ffmpeg/__init__.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tensorflow/contrib/ffmpeg/__init__.py b/tensorflow/contrib/ffmpeg/__init__.py
index daba965a98..484ffee3e7 100644
--- a/tensorflow/contrib/ffmpeg/__init__.py
+++ b/tensorflow/contrib/ffmpeg/__init__.py
@@ -28,7 +28,6 @@ from __future__ import print_function
 from tensorflow.contrib.ffmpeg.ffmpeg_ops import decode_audio
 from tensorflow.contrib.ffmpeg.ffmpeg_ops import decode_video
 from tensorflow.contrib.ffmpeg.ffmpeg_ops import encode_audio
-from tensorflow.contrib.ffmpeg.ffmpeg_ops import decode_video
 
 from tensorflow.python.util.all_util import remove_undocumented
 
-- 
GitLab


From 72307dfb415e44d95bf72850bff7b7106385cda0 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Sat, 2 Jun 2018 15:29:59 +0000
Subject: [PATCH 206/610] Remove duplicate import of gen_decode_video_op_py

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/contrib/ffmpeg/ffmpeg_ops.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tensorflow/contrib/ffmpeg/ffmpeg_ops.py b/tensorflow/contrib/ffmpeg/ffmpeg_ops.py
index 020b5c99c6..b1b5126d9e 100644
--- a/tensorflow/contrib/ffmpeg/ffmpeg_ops.py
+++ b/tensorflow/contrib/ffmpeg/ffmpeg_ops.py
@@ -21,7 +21,6 @@ from __future__ import print_function
 from tensorflow.contrib.ffmpeg.ops import gen_decode_audio_op_py
 from tensorflow.contrib.ffmpeg.ops import gen_decode_video_op_py
 from tensorflow.contrib.ffmpeg.ops import gen_encode_audio_op_py
-from tensorflow.contrib.ffmpeg.ops import gen_decode_video_op_py
 from tensorflow.contrib.util import loader
 from tensorflow.python.framework import ops
 from tensorflow.python.platform import resource_loader
-- 
GitLab


From a06e521204d7b5a2dd27de44efbab352ff918aa7 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sat, 2 Jun 2018 12:35:32 -0700
Subject: [PATCH 207/610] Adding support for the int() and float() built-ins.

PiperOrigin-RevId: 199001807
---
 .../autograph/converters/builtin_functions.py |  2 +-
 tensorflow/contrib/autograph/utils/BUILD      |  2 ++
 .../contrib/autograph/utils/builtins.py       | 23 ++++++++++++++++++-
 .../contrib/autograph/utils/builtins_test.py  | 17 +++++++++++++-
 4 files changed, 41 insertions(+), 3 deletions(-)

diff --git a/tensorflow/contrib/autograph/converters/builtin_functions.py b/tensorflow/contrib/autograph/converters/builtin_functions.py
index 46e39da16a..231e4ee35a 100644
--- a/tensorflow/contrib/autograph/converters/builtin_functions.py
+++ b/tensorflow/contrib/autograph/converters/builtin_functions.py
@@ -48,7 +48,7 @@ class BuiltinFunctionTransformer(transformer.Base):
     # TODO(mdan): This won't work if the function was hidden.
     # TODO(mdan): Rely on the live_val and use inspect_utils.is_builtin instead.
     if (isinstance(node.func, gast.Name) and
-        node.func.id in ('len', 'range', 'xrange')):
+        node.func.id in ('len', 'range', 'xrange', 'float', 'int')):
       return self._convert_builtin(node)
     # Print needs to be handled separately because it can be read as statement.
     if isinstance(node.func, gast.Name) and node.func.id == 'print':
diff --git a/tensorflow/contrib/autograph/utils/BUILD b/tensorflow/contrib/autograph/utils/BUILD
index d3a1b94688..d82c17bf2a 100644
--- a/tensorflow/contrib/autograph/utils/BUILD
+++ b/tensorflow/contrib/autograph/utils/BUILD
@@ -33,6 +33,8 @@ py_library(
     srcs_version = "PY2AND3",
     visibility = ["//tensorflow:__subpackages__"],
     deps = [
+        "//tensorflow/contrib/autograph/pyct",
+        "//tensorflow/python:dtypes",
         "//tensorflow/python:list_ops",
         "//tensorflow/python:script_ops",
         "//tensorflow/python/data/ops:dataset_ops",
diff --git a/tensorflow/contrib/autograph/utils/builtins.py b/tensorflow/contrib/autograph/utils/builtins.py
index 211e8eaee9..998087e056 100644
--- a/tensorflow/contrib/autograph/utils/builtins.py
+++ b/tensorflow/contrib/autograph/utils/builtins.py
@@ -24,6 +24,7 @@ import six
 
 from tensorflow.contrib.autograph.utils import py_func
 from tensorflow.contrib.autograph.utils import type_check
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import logging_ops
@@ -38,7 +39,13 @@ def dynamic_builtin(f, *args, **kwargs):
     return dynamic_range(*args, **kwargs)
   if f is range:
     return dynamic_range(*args, **kwargs)
-  raise ValueError('%s is not supported' % f)
+  if f is int:
+    return dynamic_int(*args, **kwargs)
+  if f is float:
+    return dynamic_float(*args, **kwargs)
+
+  raise NotImplementedError(
+      'The "%s" builtin is not yet supported.' % f.__name__)
 
 
 def dynamic_len(list_or_tensor):
@@ -52,6 +59,20 @@ def dynamic_len(list_or_tensor):
   return len(list_or_tensor)
 
 
+def dynamic_int(num_or_tensor, **kwargs):
+  """Implementation of int() using dynamic dispatch."""
+  if tensor_util.is_tensor(num_or_tensor):
+    return math_ops.cast(num_or_tensor, dtype=dtypes.int32, **kwargs)
+  return int(num_or_tensor)
+
+
+def dynamic_float(num_or_tensor, **kwargs):
+  """Implementation of float() using dynamic dispatch."""
+  if tensor_util.is_tensor(num_or_tensor):
+    return math_ops.cast(num_or_tensor, dtype=dtypes.float32, **kwargs)
+  return float(num_or_tensor)
+
+
 def dynamic_range(start_or_stop, stop=None, step=None):
   """Implementation of range using dynamic dispatch."""
   if type_check.is_tensor(start_or_stop, stop, step):
diff --git a/tensorflow/contrib/autograph/utils/builtins_test.py b/tensorflow/contrib/autograph/utils/builtins_test.py
index 163e698407..0c2312178a 100644
--- a/tensorflow/contrib/autograph/utils/builtins_test.py
+++ b/tensorflow/contrib/autograph/utils/builtins_test.py
@@ -24,6 +24,7 @@ import six
 
 from tensorflow.contrib.autograph.utils import builtins
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
 from tensorflow.python.platform import test
 
 
@@ -77,7 +78,7 @@ class BuiltinsTest(test.TestCase):
       return x
 
     # Functions that just have the names of builtins are rejected.
-    with self.assertRaises(ValueError):
+    with self.assertRaises(NotImplementedError):
       self.assertEqual(builtins.dynamic_builtin(range, 1), 1)
     if six.PY2:
       self.assertListEqual(
@@ -87,6 +88,20 @@ class BuiltinsTest(test.TestCase):
     self.assertListEqual(
         list(builtins.dynamic_builtin(six.moves.xrange, 3)), [0, 1, 2])
 
+  def test_casts(self):
+    i = constant_op.constant(2, dtype=dtypes.int32)
+    f = constant_op.constant(1.0, dtype=dtypes.float32)
+
+    self.assertEqual(builtins.dynamic_builtin(int, i).dtype, dtypes.int32)
+    self.assertEqual(builtins.dynamic_builtin(int, f).dtype, dtypes.int32)
+    self.assertEqual(builtins.dynamic_builtin(float, i).dtype, dtypes.float32)
+    self.assertEqual(builtins.dynamic_builtin(float, f).dtype, dtypes.float32)
+
+    self.assertEqual(builtins.dynamic_builtin(int, True), 1)
+    self.assertEqual(builtins.dynamic_builtin(int, False), 0)
+    self.assertEqual(builtins.dynamic_builtin(float, True), 1.0)
+    self.assertEqual(builtins.dynamic_builtin(float, False), 0.0)
+
   def test_dynamic_print_tf(self):
     try:
       out_capturer = six.StringIO()
-- 
GitLab


From d23f115d89ad6111674f53135d669cb2d2c086f0 Mon Sep 17 00:00:00 2001
From: Sanjoy Das <sanjoy@google.com>
Date: Sat, 2 Jun 2018 14:06:14 -0700
Subject: [PATCH 208/610] Don't cluster Identity nodes that forward tensor refs

XLA cannot implement the forward-tensor-ref semantic -- there is no guaranteed
aliasing between the input and output of the XLA cluster.

PiperOrigin-RevId: 199005227
---
 .../compiler/jit/mark_for_compilation_pass.cc | 26 ++++++++++
 .../jit/mark_for_compilation_pass_test.cc     | 47 +++++++++++++++++++
 2 files changed, 73 insertions(+)

diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.cc b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
index 8e2ee0f1d7..07ee93d79e 100644
--- a/tensorflow/compiler/jit/mark_for_compilation_pass.cc
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
@@ -46,6 +46,12 @@ const char* const kXlaOutsideCompilationAttr = "_XlaOutsideCompilation";
 
 namespace {
 
+// Returns true if, when executed in TensorFlow, `node` is guaranteed to forward
+// a ref tensor input to its output.
+static bool AlwaysForwardsRefInput(const Node& node) {
+  return node.IsIdentity();
+}
+
 bool HasXLAKernel(const Node& node, const DeviceType& jit_device_type) {
   // There is a SymbolicGradient kernel on the XLA_JIT device, but the gradient
   // is really a kind of function call and will be handled by
@@ -60,6 +66,26 @@ bool HasXLAKernel(const Node& node, const DeviceType& jit_device_type) {
       return false;
     }
   }
+
+  // XLA does not offer guaranteed aliasing between the input and output of the
+  // XLA cluster so it can't implement the forward-tensor-ref semantic.  Leave
+  // such nodes out of XLA clusters.
+  if (AlwaysForwardsRefInput(node)) {
+    for (const Edge* incoming_edge : node.in_edges()) {
+      if (incoming_edge->IsControlEdge()) {
+        continue;
+      }
+
+      Node* incoming_node = incoming_edge->src();
+      if (IsRefType(incoming_node->output_type(incoming_edge->src_output()))) {
+        VLOG(2) << "Not clustering " << node.def().ShortDebugString()
+                << " because of ref input " << incoming_node->name() << " "
+                << incoming_node->type_string();
+        return false;
+      }
+    }
+  }
+
   return FindKernelDef(jit_device_type, node.def(), nullptr, nullptr).ok();
 }
 
diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
index 703d8825d7..772c92d369 100644
--- a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
@@ -633,5 +633,52 @@ TEST(XlaCompilationTest, ConstOp) {
   }
 }
 
+TEST(XlaCompilationTest, DontClusterIdentityWithRefInput) {
+  Scope root = Scope::NewRootScope().ExitOnError();
+  Output variable = ops::Variable(root.WithOpName("variable"),
+                                  PartialTensorShape{}, DT_FLOAT);
+  Output read = ops::Identity(root.WithOpName("read"), variable);
+  Output neg = ops::Negate(root.WithOpName("negate"), read);
+  Output add = ops::Add(root.WithOpName("add"), neg, neg);
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+
+  TF_ASSERT_OK(root.ToGraph(graph.get()));
+  TF_ASSERT_OK(MarkForCompilation(&graph));
+
+  std::unordered_map<string, string> clusters = GetClusters(*graph);
+
+  ASSERT_FALSE(clusters.empty());
+  string cluster_name = clusters.begin()->second;
+
+  std::unordered_map<string, string> expected_clusters(
+      {{"negate", cluster_name}, {"add", cluster_name}});
+  EXPECT_EQ(clusters, expected_clusters);
+}
+
+TEST(XlaCompilationTest, ClusterIdentityWithNonRefInput) {
+  Scope root = Scope::NewRootScope().ExitOnError();
+  Output variable = ops::Variable(root.WithOpName("variable"),
+                                  PartialTensorShape{}, DT_FLOAT);
+  Output read = ops::Identity(root.WithOpName("read"), variable);
+  Output neg = ops::Negate(root.WithOpName("negate"), read);
+  // A real Identity fed by a non-ref tensor; it should still be clustered.
+  Output identity = ops::Identity(root.WithOpName("identity"), neg);
+  Output add = ops::Add(root.WithOpName("add"), identity, neg);
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+
+  TF_ASSERT_OK(root.ToGraph(graph.get()));
+  TF_ASSERT_OK(MarkForCompilation(&graph));
+
+  std::unordered_map<string, string> clusters = GetClusters(*graph);
+
+  ASSERT_FALSE(clusters.empty());
+  string cluster_name = clusters.begin()->second;
+
+  std::unordered_map<string, string> expected_clusters(
+      {{"negate", cluster_name}, {"identity", cluster_name},
+       {"add", cluster_name}});
+  EXPECT_EQ(clusters, expected_clusters);
+}
+
 }  // namespace
 }  // namespace tensorflow
-- 
GitLab


From 5cc568290d9039e360e5705aeee64ed24984b9e7 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Thu, 24 May 2018 21:20:41 +0000
Subject: [PATCH 209/610] Add complex numbers to the supported data types for
 UnsortedSegmentProd

In the kernel implementation both UnsortedSegmentProd and UnsortedSegmentSum
support complex numbers. However, unlike UnsortedSegmentSum, the op
registration of UnsortedSegmentProd in math_ops.cc omits complex number types.

This fix adds the supported complex number types to math_ops.cc,
and enables test cases for it.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/core/ops/math_ops.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc
index 8c0b073ce4..929213656c 100644
--- a/tensorflow/core/ops/math_ops.cc
+++ b/tensorflow/core/ops/math_ops.cc
@@ -1080,7 +1080,7 @@ REGISTER_OP("UnsortedSegmentProd")
     .Input("segment_ids: Tindices")
     .Input("num_segments: Tnumsegments")
     .Output("output: T")
-    .Attr("T: realnumbertype")
+    .Attr("T: numbertype")
     .Attr("Tindices: {int32,int64}")
     .Attr("Tnumsegments: {int32,int64} = DT_INT32")
     .SetShapeFn(UnsortedSegmentReductionShapeFn);
-- 
GitLab


From 32b6cb87a349bb6b2866a6ae2f2c24dcd3ad738f Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Thu, 24 May 2018 21:23:33 +0000
Subject: [PATCH 210/610] Enable test case for complex number types with
 unsorted_segment_prod

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/python/kernel_tests/segment_reduction_ops_test.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tensorflow/python/kernel_tests/segment_reduction_ops_test.py b/tensorflow/python/kernel_tests/segment_reduction_ops_test.py
index 794be096b7..b3e1e8bec5 100644
--- a/tensorflow/python/kernel_tests/segment_reduction_ops_test.py
+++ b/tensorflow/python/kernel_tests/segment_reduction_ops_test.py
@@ -263,8 +263,7 @@ class UnsortedSegmentTest(SegmentReductionHelper):
                       math_ops.unsorted_segment_max, lambda t: t.min)]
 
     # A subset of ops has been enabled for complex numbers
-    self.complex_ops_list = [(np.add, None,
-                              math_ops.unsorted_segment_sum, lambda t: 0)]
+    self.complex_ops_list = [(np.add, None, math_ops.unsorted_segment_sum, lambda t: 0), (np.ndarray.__mul__, None, math_ops.unsorted_segment_prod, lambda t: 1)]
     self.differentiable_dtypes = [dtypes_lib.float16, dtypes_lib.float32,
                                   dtypes_lib.float64]
     self.all_dtypes = (self.differentiable_dtypes +
-- 
GitLab


From 51d8cc8bff7c4455ee8054240facf44da846e492 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Sat, 2 Jun 2018 21:57:32 +0000
Subject: [PATCH 211/610] Pylint fix

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/python/kernel_tests/segment_reduction_ops_test.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/kernel_tests/segment_reduction_ops_test.py b/tensorflow/python/kernel_tests/segment_reduction_ops_test.py
index b3e1e8bec5..a82855dfeb 100644
--- a/tensorflow/python/kernel_tests/segment_reduction_ops_test.py
+++ b/tensorflow/python/kernel_tests/segment_reduction_ops_test.py
@@ -263,7 +263,10 @@ class UnsortedSegmentTest(SegmentReductionHelper):
                       math_ops.unsorted_segment_max, lambda t: t.min)]
 
     # A subset of ops has been enabled for complex numbers
-    self.complex_ops_list = [(np.add, None, math_ops.unsorted_segment_sum, lambda t: 0), (np.ndarray.__mul__, None, math_ops.unsorted_segment_prod, lambda t: 1)]
+    self.complex_ops_list = [(np.add, None,
+                              math_ops.unsorted_segment_sum, lambda t: 0),
+                             (np.ndarray.__mul__, None,
+                              math_ops.unsorted_segment_prod, lambda t: 1)]
     self.differentiable_dtypes = [dtypes_lib.float16, dtypes_lib.float32,
                                   dtypes_lib.float64]
     self.all_dtypes = (self.differentiable_dtypes +
-- 
GitLab


From 18526a0d2f85c32269d40e621a492759bee3aaf2 Mon Sep 17 00:00:00 2001
From: Karan Kaw <karankaw@hotmail.com>
Date: Sun, 3 Jun 2018 13:37:45 +0530
Subject: [PATCH 212/610] Mentioned Visual C++ 2015 dependency for Windows JNI
 library

---
 tensorflow/docs_src/install/install_java.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/docs_src/install/install_java.md b/tensorflow/docs_src/install/install_java.md
index 1256fb99c4..bbbabb6086 100644
--- a/tensorflow/docs_src/install/install_java.md
+++ b/tensorflow/docs_src/install/install_java.md
@@ -181,7 +181,7 @@ Take the following steps to install TensorFlow for Java on Windows:
      [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.8.0.zip).
   3. Extract this .zip file.
 
-
+__Note__: Please ensure that the _MS Visual C++ 2015 Redistributable_ package is installed on your Windows system, as the TensorFlow JNI library (*tensorflow_jni.dll*) requires it at runtime.
 
 ### Validate the installation
 
-- 
GitLab


From c045937787d6dd221e0fac0f040d7bf68b2101be Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Sun, 3 Jun 2018 15:11:45 +0000
Subject: [PATCH 213/610] Add int16 support for `tf.as_string`

In `tf.as_string`, most integer types are supported (`int8`, `int32`, `int64`)
but not `int16`. This fix adds `int16` support to `tf.as_string`.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/core/kernels/as_string_op.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tensorflow/core/kernels/as_string_op.cc b/tensorflow/core/kernels/as_string_op.cc
index 66c4aff3e3..a7757d1361 100644
--- a/tensorflow/core/kernels/as_string_op.cc
+++ b/tensorflow/core/kernels/as_string_op.cc
@@ -73,6 +73,7 @@ class AsStringOp : public OpKernel {
     }
     switch (dtype) {
       case DT_INT8:
+      case DT_INT16:
       case DT_INT32:
         strings::Appendf(&format_, "d");
         break;
@@ -129,6 +130,7 @@ class AsStringOp : public OpKernel {
       ENCODE_TYPE(DT_FLOAT, float, format_);
       ENCODE_TYPE(DT_DOUBLE, double, format_);
       ENCODE_TYPE(DT_INT8, int8, format_);
+      ENCODE_TYPE(DT_INT16, int16, format_);
       case (DT_BOOL): {
         const auto& input_flat = input_tensor->flat<bool>();
         for (int i = 0; i < input_flat.size(); ++i) {
-- 
GitLab


From 56666ab5b3d807e4b070c4035e74d645f11ae817 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Sun, 3 Jun 2018 15:14:21 +0000
Subject: [PATCH 214/610] Register int16 as supported ops for AsString in
 string_ops.cc

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/core/ops/string_ops.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/ops/string_ops.cc b/tensorflow/core/ops/string_ops.cc
index 1d5c743a56..03bd4994bd 100644
--- a/tensorflow/core/ops/string_ops.cc
+++ b/tensorflow/core/ops/string_ops.cc
@@ -78,7 +78,7 @@ REGISTER_OP("ReduceJoin")
 REGISTER_OP("AsString")
     .Input("input: T")
     .Output("output: string")
-    .Attr("T: {int32, int64, complex64, float, double, bool, int8}")
+    .Attr("T: {int8, int16, int32, int64, complex64, float, double, bool}")
     .Attr("precision: int = -1")
     .Attr("scientific: bool = false")
     .Attr("shortest: bool = false")
-- 
GitLab


From 82bedc89eb3a865ff56577822828a1c30105aff3 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Sun, 3 Jun 2018 15:14:48 +0000
Subject: [PATCH 215/610] Add test cases for int16 support of `tf.as_string`

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/python/kernel_tests/as_string_op_test.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/tensorflow/python/kernel_tests/as_string_op_test.py b/tensorflow/python/kernel_tests/as_string_op_test.py
index 9d54add264..94ed8ebd31 100644
--- a/tensorflow/python/kernel_tests/as_string_op_test.py
+++ b/tensorflow/python/kernel_tests/as_string_op_test.py
@@ -130,6 +130,16 @@ class AsStringOpTest(test.TestCase):
       result = output.eval(feed_dict={input_: int_inputs_})
       self.assertAllEqual(s(result), ["%d" % x for x in int_inputs_])
 
+  def testHalfInt(self):
+    s = lambda strs: [x.decode("ascii") for x in strs]
+
+    with self.test_session():
+      input_ = array_ops.placeholder(dtypes.int16)
+      int_inputs_ = [np.iinfo(np.int16).min, np.iinfo(np.int16).max]
+      output = string_ops.as_string(input_)
+      result = output.eval(feed_dict={input_: int_inputs_})
+      self.assertAllEqual(s(result), ["%d" % x for x in int_inputs_])
+
   def testBool(self):
     bool_inputs_ = [False, True]
     s = lambda strs: [x.decode("ascii") for x in strs]
-- 
GitLab


From d836210e7d7c8bf54676fd4154f40920310cdb27 Mon Sep 17 00:00:00 2001
From: Gunhan Gulsoy <gunan@google.com>
Date: Sun, 3 Jun 2018 12:08:00 -0700
Subject: [PATCH 216/610] Re-Merge accidentally reverted change (#19727)

* Add IBM ppc64le build to README.

* ppc64le -> ppc64le CPU
-- 
GitLab


From 45198062b58245711d7446aa389f3b9aa2c1535f Mon Sep 17 00:00:00 2001
From: Andrew Selle <aselle@google.com>
Date: Sun, 3 Jun 2018 12:43:16 -0700
Subject: [PATCH 217/610] New NN API interface that uses the TensorFlow Lite
 delegate API.

- Make nn_api a delegate in its own directory.
- Use the delegate API to rewrite the graph.
- Use only on static APIs right now.
- This is an initial preview of the delegate that only supports add and conv.

PiperOrigin-RevId: 199055747
---
 tensorflow/contrib/lite/BUILD                 |  10 +
 tensorflow/contrib/lite/context_util.h        |  48 ++
 tensorflow/contrib/lite/delegates/nnapi/BUILD |  31 ++
 .../lite/delegates/nnapi/nnapi_delegate.cc    | 464 ++++++++++++++++++
 .../lite/delegates/nnapi/nnapi_delegate.h     |  31 ++
 .../delegates/nnapi/nnapi_delegate_test.cc    |  82 ++++
 tensorflow/contrib/lite/kernels/test_util.cc  |   6 +
 tensorflow/contrib/lite/kernels/test_util.h   |  10 +
 8 files changed, 682 insertions(+)
 create mode 100644 tensorflow/contrib/lite/context_util.h
 create mode 100644 tensorflow/contrib/lite/delegates/nnapi/BUILD
 create mode 100644 tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate.cc
 create mode 100644 tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate.h
 create mode 100644 tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate_test.cc

diff --git a/tensorflow/contrib/lite/BUILD b/tensorflow/contrib/lite/BUILD
index 55b984f260..9c804d2785 100644
--- a/tensorflow/contrib/lite/BUILD
+++ b/tensorflow/contrib/lite/BUILD
@@ -90,6 +90,16 @@ cc_library(
     deps = [":context"],
 )
 
+cc_library(
+    name = "kernel_api",
+    hdrs = [
+        "builtin_op_data.h",
+        "builtin_ops.h",
+        "context.h",
+        "context_util.h",
+    ],
+)
+
 exports_files(["builtin_ops.h"])
 
 cc_library(
diff --git a/tensorflow/contrib/lite/context_util.h b/tensorflow/contrib/lite/context_util.h
new file mode 100644
index 0000000000..abe802e342
--- /dev/null
+++ b/tensorflow/contrib/lite/context_util.h
@@ -0,0 +1,48 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// This provides a few C++ helpers that are useful for manipulating C structures
+// in C++.
+#ifndef TENSORFLOW_CONTRIB_LITE_CONTEXT_UTIL_H_
+#define TENSORFLOW_CONTRIB_LITE_CONTEXT_UTIL_H_
+
+#include "tensorflow/contrib/lite/context.h"
+
+namespace tflite {
+
+// Provide a range iterable wrapper for TfLiteIntArray* (C lists that TfLite
+// C api uses). Can't use the google array_view, since we can't depend on even
+// absl for embedded device reasons.
+class TfLiteIntArrayView {
+ public:
+  // Construct a view of a TfLiteIntArray*. Note, `int_array` should be non-null
+  // and this view does not take ownership of it.
+  explicit TfLiteIntArrayView(const TfLiteIntArray* int_array)
+      : int_array_(int_array) {}
+
+  TfLiteIntArrayView(const TfLiteIntArrayView&) = default;
+  TfLiteIntArrayView& operator=(const TfLiteIntArrayView& rhs) = default;
+
+  typedef const int* const_iterator;
+  const_iterator begin() const { return int_array_->data; }
+  const_iterator end() const { return &int_array_->data[int_array_->size]; }
+  size_t size() const { return end() - begin(); }
+
+ private:
+  const TfLiteIntArray* int_array_;
+};
+
+}  // namespace tflite
+
+#endif  // TENSORFLOW_CONTRIB_LITE_CONTEXT_UTIL_H_
diff --git a/tensorflow/contrib/lite/delegates/nnapi/BUILD b/tensorflow/contrib/lite/delegates/nnapi/BUILD
new file mode 100644
index 0000000000..35a8f6ca41
--- /dev/null
+++ b/tensorflow/contrib/lite/delegates/nnapi/BUILD
@@ -0,0 +1,31 @@
+package(default_visibility = [
+    "//visibility:public",
+])
+
+load("//tensorflow:tensorflow.bzl", "tf_cc_test")
+
+licenses(["notice"])  # Apache 2.0
+
+cc_library(
+    name = "nnapi_delegate",
+    srcs = ["nnapi_delegate.cc"],
+    hdrs = ["nnapi_delegate.h"],
+    deps = [
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite:kernel_api",
+        "//tensorflow/contrib/lite/kernels:kernel_util",
+        "//tensorflow/contrib/lite/nnapi:nnapi_lib",
+    ],
+)
+
+tf_cc_test(
+    name = "nnapi_delegate_test",
+    size = "small",
+    srcs = ["nnapi_delegate_test.cc"],
+    deps = [
+        ":nnapi_delegate",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
diff --git a/tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate.cc b/tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate.cc
new file mode 100644
index 0000000000..0731d14419
--- /dev/null
+++ b/tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate.cc
@@ -0,0 +1,464 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <cstdarg>
+#include <iostream>
+#include <memory>
+#include <vector>
+
+#include "tensorflow/contrib/lite/allocation.h"
+#include "tensorflow/contrib/lite/builtin_op_data.h"
+#include "tensorflow/contrib/lite/builtin_ops.h"
+#include "tensorflow/contrib/lite/context.h"
+#include "tensorflow/contrib/lite/context_util.h"
+#include "tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate.h"
+#include "tensorflow/contrib/lite/kernels/kernel_util.h"
+#include "tensorflow/contrib/lite/nnapi/NeuralNetworksShim.h"
+
+namespace tflite {
+namespace {
+
+// Evaluates `code` once (cached in the `for` init) and, on an NN API error,
+// reports it and returns kTfLiteError. TODO(b/80621585): print error string.
+#define CHECK_NN(context, code)                                              \
+  for (const int nn_rc_ = (code); nn_rc_ != ANEURALNETWORKS_NO_ERROR;) {     \
+    context->ReportError(context, "NN API returned error (%d).\n", nn_rc_);  \
+    return kTfLiteError;                                                     \
+  }
+
+// RAII NN API Model Destructor for use with std::unique_ptr
+struct NNFreeModel {
+  void operator()(ANeuralNetworksModel* model) {
+    ANeuralNetworksModel_free(model);
+  }
+};
+// RAII NN API Compilation Destructor for use with std::unique_ptr
+struct NNFreeCompilation {
+  void operator()(ANeuralNetworksCompilation* compilation) {
+    ANeuralNetworksCompilation_free(compilation);
+  }
+};
+
+// Track tensor indices to NN API tensor indices mapping.
+class OperandMapping {
+ public:
+  // Given a TFLite index return the ANN index. If it doesn't exist
+  // return -1.
+  int lite_index_to_ann(int index) const {
+    if (index < lite_tensor_to_ann_tensor_.size())
+      return lite_tensor_to_ann_tensor_[index];
+    else
+      return -1;
+  }
+
+  // NN API uses non tensor operands instead of structs. This reserves the
+  // next NN API operand index for such an operand and returns it.
+  int add_new_non_tensor_operand() { return next_ann_tensor_index_++; }
+
+  // Add a new mapping from `tflite_index` and return the NN API tensor index.
+  // The table grows as needed; slots of tensors that have not been mapped
+  // (likely intermediate tensors) are filled with -1, not value-initialized 0.
+  int add_new_ann_tensor_index(int tflite_index) {
+    if (tflite_index >= lite_tensor_to_ann_tensor_.size()) {
+      lite_tensor_to_ann_tensor_.resize(tflite_index + 1, -1);
+    }
+    int new_tensor_index = next_ann_tensor_index_++;
+    lite_tensor_to_ann_tensor_[tflite_index] = new_tensor_index;
+    return new_tensor_index;
+  }
+
+ private:
+  // Next index of ann tensor
+  int next_ann_tensor_index_ = 0;
+
+  // Mapping from lite index. Use a std::vector for speed and code size
+  // rather than a map.
+  std::vector<int> lite_tensor_to_ann_tensor_;
+};
+
+// Abstract builder for building an op in the NN API graph. This handles
+// the disparity between TFLite and NN API operand types. NN API has singular
+// operands for both tensors and parameters, and TFLite separates the two.
+class NNAPIOpBuilder {
+ public:
+  NNAPIOpBuilder(TfLiteContext* context, OperandMapping* tensor_mapping,
+                 ANeuralNetworksModel* nn_model)
+      : context_(context),
+        operand_mapping_(tensor_mapping),
+        nn_model_(nn_model) {}
+
+  TfLiteStatus AddScalarInt32Operand(int value) {
+    ANeuralNetworksOperandType operand_type{.type = ANEURALNETWORKS_INT32};
+    CHECK_NN(context_,
+             ANeuralNetworksModel_addOperand(nn_model_, &operand_type));
+    int ann_operand = operand_mapping_->add_new_non_tensor_operand();
+    CHECK_NN(context_, ANeuralNetworksModel_setOperandValue(
+                           nn_model_, ann_operand, &value, sizeof(int32_t)));
+    augmented_inputs_.push_back(ann_operand);
+    return kTfLiteOk;
+  }
+
+  TfLiteStatus AddTensorInput(int tensor_index) {
+    int ann_index;
+    TF_LITE_ENSURE_STATUS(AddTensor(tensor_index, &ann_index));
+    augmented_inputs_.push_back(ann_index);
+    return kTfLiteOk;
+  }
+
+  TfLiteStatus AddTensorOutput(int tensor_index) {
+    int ann_index;
+    TF_LITE_ENSURE_STATUS(AddTensor(tensor_index, &ann_index));
+    augmented_outputs_.push_back(ann_index);
+    return kTfLiteOk;
+  }
+
+  // Adds a new NN API tensor that shadows the TF Lite tensor `tensor_index`.
+  // This returns the NN API tensor index corresponding to the created tensor.
+  // If another caller previously created a NN API tensor for `tensor_index`
+  // then the existing one is returned.
+  TfLiteStatus AddTensor(int tensor_index, int* ann_tensor_index_out) {
+    int ann_tensor_index = operand_mapping_->lite_index_to_ann(tensor_index);
+    if (ann_tensor_index != -1) {
+      *ann_tensor_index_out = ann_tensor_index;
+      return kTfLiteOk;
+    }
+    // Allocate a new tensor index
+    ann_tensor_index = operand_mapping_->add_new_ann_tensor_index(tensor_index);
+
+    // Parameters needed for new type.
+    int32_t nn_type = 0;
+    float scale = 0.0f;
+    int32_t zeroPoint = 0;
+    TfLiteTensor* tensor = &context_->tensors[tensor_index];
+    switch (tensor->type) {
+      case kTfLiteNoType:
+        // Tensors added during initialization of Ops don't have a type yet and
+        // should not be registered with the NNAPI.
+        *ann_tensor_index_out = -1;
+        return kTfLiteOk;
+      case kTfLiteFloat32:
+        nn_type = ANEURALNETWORKS_TENSOR_FLOAT32;
+        scale = 0.f;
+        break;
+      case kTfLiteUInt8:
+        nn_type = ANEURALNETWORKS_TENSOR_QUANT8_ASYMM;
+        scale = tensor->params.scale;
+        zeroPoint = tensor->params.zero_point;
+        break;
+      case kTfLiteInt32:
+        nn_type = ANEURALNETWORKS_TENSOR_INT32;
+        scale = 0.f;
+        zeroPoint = 0;
+        break;
+      default:
+        context_->ReportError(context_, "Logic error in NN API Delegate.\n");
+        return kTfLiteError;
+    }
+
+    ANeuralNetworksOperandType operand_type{
+        nn_type, static_cast<uint32_t>(tensor->dims->size),
+        reinterpret_cast<uint32_t*>(tensor->dims->data), scale, zeroPoint};
+    CHECK_NN(context_,
+             ANeuralNetworksModel_addOperand(nn_model_, &operand_type));
+
+    if (tensor->allocation_type == kTfLiteMmapRo) {
+      // TODO(b/80630405): Use NNAPIAllocation.
+      CHECK_NN(context_, ANeuralNetworksModel_setOperandValue(
+                             nn_model_, ann_tensor_index, tensor->data.raw,
+                             tensor->bytes));
+    }
+
+    *ann_tensor_index_out = ann_tensor_index;
+    return kTfLiteOk;
+  }
+
+  // Emit the op of type `type` and reset the operand lists for the next op.
+  TfLiteStatus FinalizeAddOperation(ANeuralNetworksOperationType type) {
+    // Actually add a NN API operation
+    CHECK_NN(context_, ANeuralNetworksModel_addOperation(
+                           nn_model_, type,
+                           static_cast<uint32_t>(augmented_inputs_.size()),
+                           augmented_inputs_.data(),
+                           static_cast<uint32_t>(augmented_outputs_.size()),
+                           augmented_outputs_.data()));
+    augmented_inputs_.clear();
+    augmented_outputs_.clear();
+    return kTfLiteOk;
+  }
+
+ private:
+  // TfLiteContext used for error reporting and for looking up tensors by
+  // index.
+  TfLiteContext* context_;
+
+  // Tracks relationship between indices
+  OperandMapping* operand_mapping_;
+
+  // The model
+  ANeuralNetworksModel* nn_model_;
+
+  // Inputs and outputs for the current op. These are augmented in the sense
+  // that NN API uses operands for all arguments, not just tensors, unlike
+  // TensorFlow lite.
+  std::vector<uint32_t> augmented_inputs_;
+  std::vector<uint32_t> augmented_outputs_;
+};
+
+// The kernel that represents the subgraph of TF Lite being run on NN API.
+class NNAPIDelegateKernel {
+ public:
+  NNAPIDelegateKernel() = default;
+
+  typedef ANeuralNetworksOperationType (*MappingFn)(TfLiteContext*,
+                                                    NNAPIOpBuilder* builder,
+                                                    TfLiteNode* node);
+
+  // Return a function that knows how to translate a node into its operands
+  // when called. You can use this function to see if a node is supported
+  // (i.e. that MappingFn is not nullptr).
+  MappingFn Map(TfLiteContext* context, int builtin_code, TfLiteNode* node) {
+    switch (builtin_code) {
+      case kTfLiteBuiltinAdd:
+        return [](TfLiteContext* context, NNAPIOpBuilder* builder,
+                  TfLiteNode* node) -> ANeuralNetworksOperationType {
+          auto builtin = reinterpret_cast<TfLiteAddParams*>(node->builtin_data);
+          builder->AddScalarInt32Operand(builtin->activation);
+          return ANEURALNETWORKS_ADD;
+        };
+      case kTfLiteBuiltinAveragePool2d:
+        return [](TfLiteContext* context, NNAPIOpBuilder* builder,
+                  TfLiteNode* node) -> ANeuralNetworksOperationType {
+          auto builtin =
+              reinterpret_cast<TfLitePoolParams*>(node->builtin_data);
+          builder->AddScalarInt32Operand(builtin->padding);
+          builder->AddScalarInt32Operand(builtin->stride_width);
+          builder->AddScalarInt32Operand(builtin->stride_height);
+          builder->AddScalarInt32Operand(builtin->filter_width);
+          builder->AddScalarInt32Operand(builtin->filter_height);
+          builder->AddScalarInt32Operand(builtin->activation);
+          return ANEURALNETWORKS_AVERAGE_POOL_2D;
+        };
+      default:
+        return nullptr;
+    }
+  }
+
+  // Initialize the kernel (a NN model).
+  TfLiteStatus Init(TfLiteContext* context,
+                    const TfLiteDelegateParams* params) {
+    for (auto node_index : TfLiteIntArrayView(params->nodes_to_replace)) {
+      nodes_.push_back(node_index);
+    }
+
+    if (!nn_model_) {
+      ANeuralNetworksModel* model;
+      CHECK_NN(context, ANeuralNetworksModel_create(&model));
+      nn_model_.reset(model);
+
+      TF_LITE_ENSURE_STATUS(
+          BuildGraph(context, params->input_tensors, params->output_tensors));
+    }
+
+    if (!nn_compilation_) {
+      ANeuralNetworksCompilation* compilation;
+      CHECK_NN(context, ANeuralNetworksCompilation_create(nn_model_.get(),
+                                                          &compilation));
+      nn_compilation_.reset(compilation);  // Owned now: finish() can't leak it.
+      const int finish_status = ANeuralNetworksCompilation_finish(compilation);
+      if (finish_status != ANEURALNETWORKS_NO_ERROR) nn_compilation_.reset();
+      CHECK_NN(context, finish_status);
+    }
+    return kTfLiteOk;
+  }
+
+  // Run the compiled model once on the buffers of `node`'s inputs and outputs.
+  TfLiteStatus Invoke(TfLiteContext* context, TfLiteNode* node) {
+    ANeuralNetworksExecution* execution = nullptr;
+    CHECK_NN(context, ANeuralNetworksExecution_create(nn_compilation_.get(),
+                                                      &execution));
+    // CHECK_NN returns early on error, so let a guard free the execution.
+    std::unique_ptr<ANeuralNetworksExecution,
+                    void (*)(ANeuralNetworksExecution*)>
+        execution_guard(execution, ANeuralNetworksExecution_free);
+
+    // Set the input tensor buffers. Note: we access tflite tensors using
+    // absolute indices but NN api indices inputs by relative indices.
+    int relative_input_index = 0;
+    for (auto absolute_input_index : TfLiteIntArrayView(node->inputs)) {
+      TfLiteTensor* tensor = &context->tensors[absolute_input_index];
+      CHECK_NN(context, ANeuralNetworksExecution_setInput(
+                            execution, relative_input_index, nullptr,
+                            tensor->data.raw, tensor->bytes));
+      relative_input_index++;
+    }
+    // Set the output tensor buffers.
+    int relative_output_index = 0;
+    for (auto output_index : TfLiteIntArrayView(node->outputs)) {
+      TfLiteTensor* tensor = &context->tensors[output_index];
+      CHECK_NN(context, ANeuralNetworksExecution_setOutput(
+                            execution, relative_output_index, nullptr,
+                            tensor->data.raw, tensor->bytes));
+      relative_output_index++;
+    }
+    // Invoke ANN in blocking fashion.
+    ANeuralNetworksEvent* event = nullptr;
+    CHECK_NN(context, ANeuralNetworksExecution_startCompute(execution, &event));
+    const int wait_status = ANeuralNetworksEvent_wait(event);
+    ANeuralNetworksEvent_free(event);  // Free before a possible early return.
+    CHECK_NN(context, wait_status);
+    return kTfLiteOk;
+  }
+
+ private:
+  // ANN API state.
+  std::unique_ptr<ANeuralNetworksModel, NNFreeModel> nn_model_;
+  std::unique_ptr<ANeuralNetworksCompilation, NNFreeCompilation>
+      nn_compilation_;
+  // Node indices that this delegate is responsible for. Indices here
+  // indexes into the nodes array in the TfLiteContext.
+  std::vector<int> nodes_;
+  // Track indices we use
+  OperandMapping operand_mapping_;
+
+  TfLiteStatus AddOpsAndTensors(TfLiteContext* context) {
+    // The operand builder emits one op at a time. It is created once, outside
+    // the loop, so its operand vectors are not reallocated for every node.
+    NNAPIOpBuilder builder(context, &operand_mapping_, nn_model_.get());
+    for (auto node_index : nodes_) {
+      // Obtain the op and registration.
+      TfLiteNode* node;
+      TfLiteRegistration* reg;
+      TF_LITE_ENSURE_STATUS(
+          context->GetNodeAndRegistration(context, node_index, &node, &reg));
+      // Map inputs to NN API tensor indices.
+      for (auto input_index : TfLiteIntArrayView(node->inputs)) {
+        TF_LITE_ENSURE_STATUS(builder.AddTensorInput(input_index));
+      }
+      // Get op type and operands
+      int nn_op_type =
+          Map(context, reg->builtin_code, node)(context, &builder, node);
+      // Map outputs to NN API tensor indices.
+      for (auto output_index : TfLiteIntArrayView(node->outputs)) {
+        TF_LITE_ENSURE_STATUS(builder.AddTensorOutput(output_index));
+      }
+      TF_LITE_ENSURE_STATUS(builder.FinalizeAddOperation(nn_op_type));
+    }
+    return kTfLiteOk;
+  }
+
+  TfLiteStatus BuildGraph(TfLiteContext* context,
+                          const TfLiteIntArray* input_tensors,
+                          const TfLiteIntArray* output_tensors) {
+    // Build the ops and tensors.
+    TF_LITE_ENSURE_STATUS(AddOpsAndTensors(context));
+    // Map input and output tensor indices to ANN
+    std::vector<uint32_t> inputs;
+    inputs.reserve(input_tensors->size);
+    std::vector<uint32_t> outputs;
+    outputs.reserve(output_tensors->size);
+    // Make the TensorFlow lite inputs and outputs to ann_indices.
+    for (int i : TfLiteIntArrayView(input_tensors))
+      inputs.push_back(operand_mapping_.lite_index_to_ann(i));
+    for (int i : TfLiteIntArrayView(output_tensors))
+      outputs.push_back(operand_mapping_.lite_index_to_ann(i));
+    // Tell ANN to declare inputs/outputs
+    CHECK_NN(context, ANeuralNetworksModel_identifyInputsAndOutputs(
+                          nn_model_.get(), inputs.size(), inputs.data(),
+                          outputs.size(), outputs.data()));
+    // Finalize the model
+    CHECK_NN(context, ANeuralNetworksModel_finish(nn_model_.get()));
+
+    return kTfLiteOk;
+  }
+};
+
+}  // namespace
+
+// Return a NN API Delegate struct that can check for support of ops.
+TfLiteDelegate* NnApiDelegate() {
+  static TfLiteDelegate delegate = {
+      .data_ = nullptr,
+      .Prepare = [](TfLiteContext* context,
+                    TfLiteDelegate* delegate) -> TfLiteStatus {
+        // Do not check nodes_ if NN API is unavailable.
+        if (!NNAPIExists()) return kTfLiteOk;
+
+        // Laid out like a TfLiteIntArray: slot 0 is reserved for the element
+        // count and the supported node indices follow it.
+        std::vector<int> supported_nodes(1);
+        // We don't care about all nodes_, we only care about ones in the
+        // current plan.
+        TfLiteIntArray* plan;
+        TF_LITE_ENSURE_STATUS(context->GetExecutionPlan(context, &plan));
+        // Check for every node if it is supported
+        // TODO(b/80625235): Fix this to do more careful checking of versioning.
+        for (int node_index : TfLiteIntArrayView(plan)) {
+          TfLiteNode* node;
+          TfLiteRegistration* registration;
+          TF_LITE_ENSURE_STATUS(context->GetNodeAndRegistration(
+              context, node_index, &node, &registration));
+          NNAPIDelegateKernel dummy_kernel;
+          if (dummy_kernel.Map(context, registration->builtin_code, node)) {
+            supported_nodes.push_back(node_index);
+          }
+        }
+        // Put the size at the beginning of the array.
+        supported_nodes[0] = supported_nodes.size() - 1;
+
+        // NN API Delegate Registration (the pseudo kernel that will invoke NN
+        // API subgraphs)
+        static const TfLiteRegistration nnapi_delegate_kernel = {
+            .init = [](TfLiteContext* context, const char* buffer,
+                       size_t length) -> void* {
+              const TfLiteDelegateParams* params =
+                  reinterpret_cast<const TfLiteDelegateParams*>(buffer);
+              NNAPIDelegateKernel* kernel_state = new NNAPIDelegateKernel;
+              kernel_state->Init(context, params);
+              return kernel_state;
+            },
+
+            .free = [](TfLiteContext* context, void* buffer) -> void {
+              delete reinterpret_cast<NNAPIDelegateKernel*>(buffer);
+            },
+
+            .prepare = [](TfLiteContext* context,
+                          TfLiteNode* node) -> TfLiteStatus {
+              // Tensor resizing already happened before delegation was
+              // applied, so there is nothing to do here.
+              return kTfLiteOk;
+            },
+
+            .invoke = [](TfLiteContext* context,
+                         TfLiteNode* node) -> TfLiteStatus {
+              NNAPIDelegateKernel* state =
+                  reinterpret_cast<NNAPIDelegateKernel*>(node->user_data);
+              return state->Invoke(context, node);
+            },
+
+            .builtin_code = kTfLiteBuiltinDelegate,
+        };
+
+        // Request TFLite to partition the graph and make kernels
+        // for each independent subgraph a new nnapi_delegate_kernel.
+        // Propagate the status so a failed replacement is not reported as ok.
+        return context->ReplaceSubgraphsWithDelegateKernels(
+            context, nnapi_delegate_kernel,
+            reinterpret_cast<TfLiteIntArray*>(supported_nodes.data()),
+            delegate);
+      }};
+
+  return &delegate;
+}
+
+}  // namespace tflite
diff --git a/tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate.h b/tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate.h
new file mode 100644
index 0000000000..44cca2fd28
--- /dev/null
+++ b/tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate.h
@@ -0,0 +1,31 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CONTRIB_LITE_DELEGATES_NNAPI_NNAPI_DELEGATE_H_
+#define TENSORFLOW_CONTRIB_LITE_DELEGATES_NNAPI_NNAPI_DELEGATE_H_
+
+#include "tensorflow/contrib/lite/context.h"
+
+namespace tflite {
+
+// Return a delegate that can be used to use the NN API.
+// e.g.
+//   TfLiteDelegate* delegate = NnApiDelegate();
+//   interpreter->ModifyGraphWithDelegate(delegate);
+// NnApiDelegate() returns a singleton, so you should not free this
+// pointer or worry about its lifetime.
+TfLiteDelegate* NnApiDelegate();
+}  // namespace tflite
+
+#endif  // TENSORFLOW_CONTRIB_LITE_DELEGATES_NNAPI_NNAPI_DELEGATE_H_
diff --git a/tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate_test.cc b/tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate_test.cc
new file mode 100644
index 0000000000..ff2e721423
--- /dev/null
+++ b/tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate_test.cc
@@ -0,0 +1,82 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/contrib/lite/delegates/nnapi/nnapi_delegate.h"
+#include <gtest/gtest.h>
+#include "tensorflow/contrib/lite/interpreter.h"
+#include "tensorflow/contrib/lite/kernels/test_util.h"
+#include "tensorflow/contrib/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class FloatAddOpModel : public SingleOpModel {
+ public:
+  FloatAddOpModel(const TensorData& input1, const TensorData& input2,
+                  const TensorData& output,
+                  ActivationFunctionType activation_type) {
+    this->SetApplyDelegate([](Interpreter* interpreter) {
+      interpreter->ModifyGraphWithDelegate(NnApiDelegate());
+    });
+    input1_ = AddInput(input1);
+    input2_ = AddInput(input2);
+    output_ = AddOutput(output);
+    SetBuiltinOp(BuiltinOperator_ADD, BuiltinOptions_AddOptions,
+                 CreateAddOptions(builder_, activation_type).Union());
+    BuildInterpreter({GetShape(input1_), GetShape(input2_)});
+  }
+
+  int input1() { return input1_; }
+  int input2() { return input2_; }
+
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+
+ protected:
+  int input1_;
+  int input2_;
+  int output_;
+};
+
+// Do a test with the NN API using no activation.
+TEST(NNAPIDelegate, AddWithNoActivation) {
+  FloatAddOpModel m({TensorType_FLOAT32, {1, 2, 2, 1}},
+                    {TensorType_FLOAT32, {1, 2, 2, 1}},
+                    {TensorType_FLOAT32, {}}, ActivationFunctionType_NONE);
+  m.PopulateTensor<float>(m.input1(), {-2.0, 0.2, 0.7, 0.8});
+  m.PopulateTensor<float>(m.input2(), {0.1, 0.2, 0.3, 0.5});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({-1.9, 0.4, 1.0, 1.3}));
+}
+
+// Do a test with the NN api with relu.
+TEST(NNAPIDelegate, AddWithRelu) {
+  FloatAddOpModel m({TensorType_FLOAT32, {1, 2, 2, 1}},
+                    {TensorType_FLOAT32, {1, 2, 2, 1}},
+                    {TensorType_FLOAT32, {}}, ActivationFunctionType_RELU);
+  m.PopulateTensor<float>(m.input1(), {-2.0, 0.2, 0.7, 0.8});
+  m.PopulateTensor<float>(m.input2(), {0.1, 0.2, 0.3, 0.5});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({0.0, 0.4, 1.0, 1.3}));
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/contrib/lite/kernels/test_util.cc b/tensorflow/contrib/lite/kernels/test_util.cc
index 1a01ee0936..d23ec201b4 100644
--- a/tensorflow/contrib/lite/kernels/test_util.cc
+++ b/tensorflow/contrib/lite/kernels/test_util.cc
@@ -112,6 +112,12 @@ void SingleOpModel::BuildInterpreter(
     if (shape.empty()) continue;
     CHECK(interpreter_->ResizeInputTensor(input_idx, shape) == kTfLiteOk);
   }
+
+  // Apply the delegate callback, if one was registered.
+  if (apply_delegate_fn_) {
+    apply_delegate_fn_(interpreter_.get());
+  }
+
   CHECK(interpreter_->AllocateTensors() == kTfLiteOk)
       << "Cannot allocate tensors";
 }
diff --git a/tensorflow/contrib/lite/kernels/test_util.h b/tensorflow/contrib/lite/kernels/test_util.h
index 55edc97d19..db80c0082c 100644
--- a/tensorflow/contrib/lite/kernels/test_util.h
+++ b/tensorflow/contrib/lite/kernels/test_util.h
@@ -114,6 +114,13 @@ class SingleOpModel {
   SingleOpModel() {}
   ~SingleOpModel() {}
 
+  // Set a function callback that is run right after graph is prepared
+  // that allows applying external delegates. This is useful for testing
+  // other runtimes like NN API or GPU.
+  void SetApplyDelegate(std::function<void(Interpreter*)> apply_delegate_fn) {
+    apply_delegate_fn_ = apply_delegate_fn;
+  }
+
   // Copying or assignment is disallowed to simplify ownership semantics.
   SingleOpModel(const SingleOpModel&) = delete;
   SingleOpModel& operator=(const SingleOpModel&) = delete;
@@ -317,6 +324,9 @@ class SingleOpModel {
   std::vector<flatbuffers::Offset<Operator>> operators_;
   std::vector<flatbuffers::Offset<Buffer>> buffers_;
   std::map<string, std::function<TfLiteRegistration*()>> custom_registrations_;
+  // A function pointer that gets called after the interpreter is created but
+  // before evaluation happens. This is useful for applying a delegate.
+  std::function<void(Interpreter*)> apply_delegate_fn_;
 };
 
 // Base class for single op unit tests.
-- 
GitLab


From bab05a2191383b3c66e9ea9ee192aef0aa36c218 Mon Sep 17 00:00:00 2001
From: Jiri Simsa <jsimsa@google.com>
Date: Sun, 3 Jun 2018 18:18:12 -0700
Subject: [PATCH 218/610] [tf.data] Input pipeline rewrites prototype.

This CL:
- adds `tf.contrib.data.optimize()` transformation that can be used to trigger rewrite-based optimization for the input pipeline.
- adds `tf.data.Dataset._as_serialized_graph()` method that returns the serialized graph representation of the dataset

PiperOrigin-RevId: 199068055
---
 .../contrib/data/python/kernel_tests/BUILD    |  13 ++
 .../kernel_tests/optimize_dataset_op_test.py  |  89 ++++++++
 tensorflow/contrib/data/python/ops/BUILD      |  15 ++
 .../contrib/data/python/ops/optimization.py   |  80 +++++++
 .../base_api/api_def_DatasetToGraph.pbtxt     |  20 ++
 .../base_api/api_def_IdentityDataset.pbtxt    |  14 ++
 .../base_api/api_def_OptimizeDataset.pbtxt    |  20 ++
 tensorflow/core/framework/dataset.h           |  19 ++
 tensorflow/core/kernels/BUILD                 |   2 +-
 tensorflow/core/kernels/data/BUILD            |  47 ++++
 tensorflow/core/kernels/data/dataset_ops.cc   |  47 ++++
 .../core/kernels/data/identity_dataset_op.cc  | 102 +++++++++
 .../core/kernels/data/optimize_dataset_op.cc  | 210 ++++++++++++++++++
 tensorflow/core/ops/dataset_ops.cc            |  20 ++
 tensorflow/python/data/kernel_tests/BUILD     |  11 +
 .../data/kernel_tests/dataset_ops_test.py     |  37 +++
 tensorflow/python/data/ops/dataset_ops.py     |   9 +
 17 files changed, 754 insertions(+), 1 deletion(-)
 create mode 100644 tensorflow/contrib/data/python/kernel_tests/optimize_dataset_op_test.py
 create mode 100644 tensorflow/contrib/data/python/ops/optimization.py
 create mode 100644 tensorflow/core/api_def/base_api/api_def_DatasetToGraph.pbtxt
 create mode 100644 tensorflow/core/api_def/base_api/api_def_IdentityDataset.pbtxt
 create mode 100644 tensorflow/core/api_def/base_api/api_def_OptimizeDataset.pbtxt
 create mode 100644 tensorflow/core/kernels/data/dataset_ops.cc
 create mode 100644 tensorflow/core/kernels/data/identity_dataset_op.cc
 create mode 100644 tensorflow/core/kernels/data/optimize_dataset_op.cc
 create mode 100644 tensorflow/python/data/kernel_tests/dataset_ops_test.py

diff --git a/tensorflow/contrib/data/python/kernel_tests/BUILD b/tensorflow/contrib/data/python/kernel_tests/BUILD
index 523d1f2f71..ba707d8d6e 100644
--- a/tensorflow/contrib/data/python/kernel_tests/BUILD
+++ b/tensorflow/contrib/data/python/kernel_tests/BUILD
@@ -280,6 +280,19 @@ py_test(
     ],
 )
 
+py_test(
+    name = "optimize_dataset_op_test",
+    size = "small",
+    srcs = ["optimize_dataset_op_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":dataset_serialization_test",
+        "//tensorflow/contrib/data/python/ops:optimization",
+        "//tensorflow/python:platform",
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+)
+
 py_test(
     name = "prefetch_dataset_op_test",
     size = "small",
diff --git a/tensorflow/contrib/data/python/kernel_tests/optimize_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/optimize_dataset_op_test.py
new file mode 100644
index 0000000000..30f1847dcd
--- /dev/null
+++ b/tensorflow/contrib/data/python/kernel_tests/optimize_dataset_op_test.py
@@ -0,0 +1,89 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the experimental input pipeline ops."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.data.python.kernel_tests import dataset_serialization_test_base
+from tensorflow.contrib.data.python.ops import optimization
+from tensorflow.core.framework import graph_pb2
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import errors
+from tensorflow.python.platform import test
+
+
+class OptimizeDatasetTest(test.TestCase):
+  """Tests for `optimization.optimize()` on a map + batch pipeline.
+
+  Every case inspects the serialized dataset graph for the fused
+  "MapAndBatchDatasetV2" op and verifies the elements that are produced;
+  the shared steps live in `_assertMapAndBatch`.
+  """
+
+  def _assertMapAndBatch(self, expect_fused, *optimize_args):
+    """Builds, optimizes and runs `range(10).map(square).batch(10)`.
+
+    Args:
+      expect_fused: If True, the serialized dataset graph must contain a
+        "MapAndBatchDatasetV2" node; if False, it must not contain one.
+      *optimize_args: Positional arguments forwarded unchanged to
+        `optimization.optimize()`.
+    """
+    # Build the pipeline under test.
+    dataset = dataset_ops.Dataset.range(10).map(lambda x: x * x).batch(
+        10).apply(optimization.optimize(*optimize_args))
+    iterator = dataset.make_one_shot_iterator()
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      # Deserialize the dataset graph and look for the fused op.
+      graph = graph_pb2.GraphDef().FromString(
+          sess.run(dataset._as_serialized_graph()))
+      fused = any(
+          node.op == "MapAndBatchDatasetV2" for node in graph.node)
+      self.assertEqual(expect_fused, fused)
+      # A single batch holds all ten squares; after that the data is exhausted.
+      self.assertAllEqual([x * x for x in range(10)], sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testDefaultOptimizations(self):
+    """`optimize()` called without arguments must not fuse map and batch."""
+    self._assertMapAndBatch(False)
+
+  def testEmptyOptimizations(self):
+    """`optimize([])` must not fuse map and batch."""
+    self._assertMapAndBatch(False, [])
+
+  def testOptimization(self):
+    """`optimize(["map_and_batch_fusion"])` must emit the fused op."""
+    self._assertMapAndBatch(True, ["map_and_batch_fusion"])
+
+
+class OptimizeDatasetSerializationTest(
+    dataset_serialization_test_base.DatasetSerializationTestBase):
+
+  def testCore(self):
+
+    def build_dataset(num_elements, batch_size):
+      return dataset_ops.Dataset.range(num_elements).map(lambda x: x * x).batch(
+          batch_size).apply(optimization.optimize(["map_and_batch_fusion"]))
+
+    self.run_core_tests(lambda: build_dataset(200, 10), None, 20)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/data/python/ops/BUILD b/tensorflow/contrib/data/python/ops/BUILD
index eceecfd174..086661adb7 100644
--- a/tensorflow/contrib/data/python/ops/BUILD
+++ b/tensorflow/contrib/data/python/ops/BUILD
@@ -208,6 +208,20 @@ py_library(
     ],
 )
 
+py_library(
+    name = "optimization",
+    srcs = ["optimization.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":contrib_op_loader",
+        ":gen_dataset_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python/data/util:nest",
+        "//tensorflow/python/data/util:sparse",
+    ],
+)
+
 py_library(
     name = "resampling",
     srcs = ["resampling.py"],
@@ -368,6 +382,7 @@ py_library(
         ":get_single_element",
         ":grouping",
         ":interleave_ops",
+        ":optimization",
         ":prefetching_ops",
         ":readers",
         ":resampling",
diff --git a/tensorflow/contrib/data/python/ops/optimization.py b/tensorflow/contrib/data/python/ops/optimization.py
new file mode 100644
index 0000000000..cad41bce29
--- /dev/null
+++ b/tensorflow/contrib/data/python/ops/optimization.py
@@ -0,0 +1,80 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Experimental API for optimizing `tf.data` pipelines."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.data.python.ops import contrib_op_loader  # pylint: disable=unused-import
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.util import nest
+from tensorflow.python.data.util import sparse
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import gen_dataset_ops
+
+
+def optimize(optimizations=None):
+  """A transformation that applies optimizations.
+
+  Args:
+    optimizations: (Optional.) A `tf.string` vector `tf.Tensor` identifying
+      optimizations to use. If not specified (or empty), no optimizations are
+      applied.
+
+  Returns:
+    A `Dataset` transformation function, which can be passed to
+    @{tf.data.Dataset.apply}.
+  """
+
+  def _apply_fn(dataset):
+    """Function from `Dataset` to `Dataset` that applies the transformation."""
+    return OptimizeDataset(dataset, optimizations)
+
+  return _apply_fn
+
+
+class OptimizeDataset(dataset_ops.Dataset):
+  """A `Dataset` that acts as an identity, and applies optimizations."""
+
+  def __init__(self, input_dataset, optimizations):
+    """See `optimize()` for details."""
+    super(OptimizeDataset, self).__init__()
+    self._input_dataset = input_dataset
+    if optimizations is None:
+      optimizations = []
+    self._optimizations = ops.convert_to_tensor(
+        optimizations, dtype=dtypes.string, name="optimizations")
+
+  def _as_variant_tensor(self):
+    return gen_dataset_ops.optimize_dataset(
+        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
+        self._optimizations,
+        output_shapes=nest.flatten(
+            sparse.as_dense_shapes(self.output_shapes, self.output_classes)),
+        output_types=nest.flatten(
+            sparse.as_dense_types(self.output_types, self.output_classes)))
+
+  @property
+  def output_classes(self):
+    return self._input_dataset.output_classes
+
+  @property
+  def output_shapes(self):
+    return self._input_dataset.output_shapes
+
+  @property
+  def output_types(self):
+    return self._input_dataset.output_types
diff --git a/tensorflow/core/api_def/base_api/api_def_DatasetToGraph.pbtxt b/tensorflow/core/api_def/base_api/api_def_DatasetToGraph.pbtxt
new file mode 100644
index 0000000000..55dd6179dd
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_DatasetToGraph.pbtxt
@@ -0,0 +1,20 @@
+op {
+  graph_op_name: "DatasetToGraph"
+  visibility: HIDDEN
+  in_arg {
+    name: "input_dataset"
+    description: <<END
+A variant tensor representing the dataset to return the graph representation for.
+END
+  }
+  out_arg {
+    name: "graph"
+    description: <<END
+The graph representation of the dataset (as serialized GraphDef).
+END
+  }
+  summary: "Returns a serialized GraphDef representing `input_dataset`."
+  description: <<END
+Returns a graph representation for `input_dataset`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_IdentityDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_IdentityDataset.pbtxt
new file mode 100644
index 0000000000..ff2854fd2c
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_IdentityDataset.pbtxt
@@ -0,0 +1,14 @@
+op {
+  graph_op_name: "IdentityDataset"
+  visibility: HIDDEN
+  in_arg {
+    name: "input_dataset"
+    description: <<END
+A variant tensor representing the input dataset.
+END
+  }
+  summary: "A placeholder for input pipeline graph optimizations."
+  description: <<END
+A placeholder for input pipeline graph optimizations.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_OptimizeDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_OptimizeDataset.pbtxt
new file mode 100644
index 0000000000..f26eb6e3c3
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_OptimizeDataset.pbtxt
@@ -0,0 +1,20 @@
+op {
+  graph_op_name: "OptimizeDataset"
+  visibility: HIDDEN
+  in_arg {
+    name: "input_dataset"
+    description: <<END
+A variant tensor representing the input dataset.
+END
+  }
+  in_arg {
+    name: "optimizations"
+    description: <<END
+A `tf.string` vector `tf.Tensor` identifying optimizations to use.
+END
+  }
+  summary: "Creates a dataset by applying optimizations to `input_dataset`."
+  description: <<END
+Creates a dataset by applying optimizations to `input_dataset`.
+END
+}
diff --git a/tensorflow/core/framework/dataset.h b/tensorflow/core/framework/dataset.h
index 23dc903caf..d8618f391e 100644
--- a/tensorflow/core/framework/dataset.h
+++ b/tensorflow/core/framework/dataset.h
@@ -459,6 +459,8 @@ class DatasetBase : public core::RefCounted {
 
   virtual std::unique_ptr<IteratorBase> MakeIteratorInternal(
       const string& prefix) const = 0;
+
+  friend class DatasetToGraphOp;  // For access to graph related members.
 };
 
 // Base-class for datasets that are built by ops.
@@ -584,6 +586,23 @@ class DatasetOpKernel : public OpKernel {
     *output = argument_t->scalar<T>()();
     return Status::OK();
   }
+
+  template <typename T>
+  Status ParseVectorArgument(OpKernelContext* ctx,
+                             const StringPiece& argument_name,
+                             std::vector<T>* output) {
+    const Tensor* argument_t;
+    TF_RETURN_IF_ERROR(ctx->input(argument_name, &argument_t));
+    if (!TensorShapeUtils::IsVector(argument_t->shape())) {
+      return errors::InvalidArgument(argument_name, " must be a vector");
+    }
+    int size = argument_t->vec<T>().size();
+    output->reserve(size);
+    for (int i = 0; i < size; ++i) {
+      output->push_back(argument_t->vec<T>()(i));
+    }
+    return Status::OK();
+  }
 };
 
 // Encapsulates the work required to plug unary Datasets into the core
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index f9e1d37b08..c7c7879714 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -6170,7 +6170,7 @@ cc_library(
 tf_kernel_library(
     name = "dataset_ops",
     deps = [
-        "//tensorflow/core/kernels/data:dataset_ops",
+        "//tensorflow/core/kernels/data",
     ],
 )
 
diff --git a/tensorflow/core/kernels/data/BUILD b/tensorflow/core/kernels/data/BUILD
index d35aad980d..da330e742e 100644
--- a/tensorflow/core/kernels/data/BUILD
+++ b/tensorflow/core/kernels/data/BUILD
@@ -548,22 +548,69 @@ tf_kernel_library(
     ],
 )
 
+tf_kernel_library(
+    name = "identity_dataset_op",
+    srcs = ["identity_dataset_op.cc"],
+    deps = [
+        ":dataset",
+        "//tensorflow/core:framework",
+    ],
+)
+
+tf_kernel_library(
+    name = "optimize_dataset_op",
+    srcs = ["optimize_dataset_op.cc"],
+    deps = [
+        ":dataset",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/grappler:graph_view",
+        "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler:grappler_item_builder",
+        "//tensorflow/core/grappler/clusters:virtual_cluster",
+        "//tensorflow/core/grappler/optimizers:meta_optimizer",
+        "//tensorflow/core/grappler/optimizers/data",
+    ],
+)
+
 tf_kernel_library(
     name = "dataset_ops",
+    srcs = ["dataset_ops.cc"],
+    deps = [
+        ":dataset",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:protos_all_cc",
+    ],
+)
+
+tf_kernel_library(
+    name = "data",
     deps = [
         ":batch_dataset_op",
         ":cache_dataset_ops",
         ":concatenate_dataset_op",
+        ":dataset",
+        ":dataset_ops",
         ":dense_to_sparse_batch_dataset_op",
         ":filter_dataset_op",
         ":flat_map_dataset_op",
         ":generator_dataset_op",
         ":group_by_reducer_dataset_op",
         ":group_by_window_dataset_op",
+        ":identity_dataset_op",
         ":interleave_dataset_op",
         ":iterator_ops",
         ":map_and_batch_dataset_op",
         ":map_dataset_op",
+        ":optimize_dataset_op",
         ":padded_batch_dataset_op",
         ":parallel_interleave_dataset_op",
         ":parallel_map_dataset_op",
diff --git a/tensorflow/core/kernels/data/dataset_ops.cc b/tensorflow/core/kernels/data/dataset_ops.cc
new file mode 100644
index 0000000000..01989a3bd9
--- /dev/null
+++ b/tensorflow/core/kernels/data/dataset_ops.cc
@@ -0,0 +1,47 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/graph/graph_def_builder.h"
+#include "tensorflow/core/kernels/data/dataset.h"
+
+namespace tensorflow {
+
+// See documentation in ../ops/dataset_ops.cc for a high-level
+// description of the following op.
+class DatasetToGraphOp : public OpKernel {
+ public:
+  explicit DatasetToGraphOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    DatasetBase* dataset;
+    OP_REQUIRES_OK(ctx, GetDatasetFromVariantTensor(ctx->input(0), &dataset));
+    GraphDefBuilder b;
+    DatasetBase::DatasetGraphDefBuilder db(&b);
+    Node* input_node = nullptr;
+    OP_REQUIRES_OK(ctx, db.AddParentDataset(ctx, dataset, &input_node));
+    GraphDef graph_def;
+    OP_REQUIRES_OK(ctx, b.ToGraphDef(&graph_def));
+    Tensor* result;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &result));
+    result->scalar<string>()() = graph_def.SerializeAsString();
+  }
+};
+
+REGISTER_KERNEL_BUILDER(Name("DatasetToGraph").Device(DEVICE_CPU),
+                        DatasetToGraphOp);
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/identity_dataset_op.cc b/tensorflow/core/kernels/data/identity_dataset_op.cc
new file mode 100644
index 0000000000..e28f188336
--- /dev/null
+++ b/tensorflow/core/kernels/data/identity_dataset_op.cc
@@ -0,0 +1,102 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <map>
+
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/data/dataset.h"
+
+namespace tensorflow {
+namespace {
+
+// The purpose of identity dataset is to serve as a placeholder when performing
+// optimizations. It is not expected to be surfaced in the Python API.
+class IdentityDatasetOp : public UnaryDatasetOpKernel {
+ public:
+  explicit IdentityDatasetOp(OpKernelConstruction* ctx)
+      : UnaryDatasetOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
+  }
+
+ protected:
+  void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
+                   DatasetBase** output) override {
+    *output = new Dataset(ctx, input);
+  }
+
+ private:
+  class Dataset : public GraphDatasetBase {
+   public:
+    Dataset(OpKernelContext* ctx, const DatasetBase* input)
+        : GraphDatasetBase(ctx), input_(input) {
+      input_->Ref();
+    }
+
+    ~Dataset() override { input_->Unref(); }
+
+    std::unique_ptr<IteratorBase> MakeIteratorInternal(
+        const string& prefix) const override {
+      return std::unique_ptr<IteratorBase>(
+          new Iterator({this, strings::StrCat(prefix, "::Identity")}));
+    }
+
+    const DataTypeVector& output_dtypes() const override {
+      return input_->output_dtypes();
+    }
+
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      return input_->output_shapes();
+    }
+
+    string DebugString() const override { return "IdentityDatasetOp::Dataset"; }
+
+   protected:
+    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      Node* input_graph_node = nullptr;
+      TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_graph_node));
+      TF_RETURN_IF_ERROR(b->AddDataset(this, {input_graph_node}, output));
+      return Status::OK();
+    }
+
+   private:
+    class Iterator : public DatasetIterator<Dataset> {
+     public:
+      explicit Iterator(const Params& params)
+          : DatasetIterator<Dataset>(params) {}
+
+      Status Initialize(IteratorContext* ctx) override {
+        return errors::Unimplemented(strings::StrCat(prefix(), "::Initialize"));
+      }
+
+      Status GetNextInternal(IteratorContext* ctx,
+                             std::vector<Tensor>* out_tensors,
+                             bool* end_of_sequence) override {
+        return errors::Unimplemented(
+            strings::StrCat(prefix(), "::GetNextInternal"));
+      }
+    };
+
+    const DatasetBase* const input_;
+  };
+
+  DataTypeVector output_types_;
+  std::vector<PartialTensorShape> output_shapes_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("IdentityDataset").Device(DEVICE_CPU),
+                        IdentityDatasetOp);
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/optimize_dataset_op.cc b/tensorflow/core/kernels/data/optimize_dataset_op.cc
new file mode 100644
index 0000000000..8965858e8d
--- /dev/null
+++ b/tensorflow/core/kernels/data/optimize_dataset_op.cc
@@ -0,0 +1,210 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <map>
+
+#include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/common_runtime/graph_runner.h"
+#include "tensorflow/core/common_runtime/process_function_library_runtime.h"
+#include "tensorflow/core/framework/device_base.h"
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/graph/graph_constructor.h"
+#include "tensorflow/core/graph/graph_def_builder.h"
+#include "tensorflow/core/grappler/clusters/virtual_cluster.h"
+#include "tensorflow/core/grappler/graph_view.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/grappler_item_builder.h"
+#include "tensorflow/core/grappler/optimizers/meta_optimizer.h"
+#include "tensorflow/core/kernels/data/dataset.h"
+#include "tensorflow/core/lib/random/random.h"
+#include "tensorflow/core/protobuf/meta_graph.pb.h"
+#include "tensorflow/core/protobuf/rewriter_config.pb.h"
+
+namespace tensorflow {
+namespace {
+
+// See documentation in ../ops/dataset_ops.cc for a high-level
+// description of the following op.
+class OptimizeDatasetOp : public UnaryDatasetOpKernel {
+ public:
+  explicit OptimizeDatasetOp(OpKernelConstruction* ctx)
+      : UnaryDatasetOpKernel(ctx),
+        graph_def_version_(ctx->graph_def_version()) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
+  }
+
+ protected:
+  void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
+                   DatasetBase** output) override {
+    std::vector<string> optimizations;
+    OP_REQUIRES_OK(
+        ctx, ParseVectorArgument<string>(ctx, "optimizations", &optimizations));
+    Dataset* dataset =
+        new Dataset(ctx, input, optimizations, output_types_, output_shapes_);
+    core::ScopedUnref unref(dataset);
+    OP_REQUIRES_OK(ctx, dataset->Optimize(ctx, output));
+  }
+
+ private:
+  class Dataset : public GraphDatasetBase {
+   public:
+    Dataset(OpKernelContext* ctx, const DatasetBase* input,
+            const std::vector<string>& optimizations,
+            const DataTypeVector& output_types,
+            const std::vector<PartialTensorShape>& output_shapes)
+        : GraphDatasetBase(ctx),
+          input_(input),
+          optimizations_(optimizations),
+          output_types_(output_types),
+          output_shapes_(output_shapes) {
+      input_->Ref();
+    }
+
+    ~Dataset() override { input_->Unref(); }
+
+    std::unique_ptr<IteratorBase> MakeIteratorInternal(
+        const string& prefix) const override {
+      return std::unique_ptr<IteratorBase>(
+          new Iterator({this, strings::StrCat(prefix, "::Optimize")}));
+    }
+
+    Status Optimize(OpKernelContext* ctx, DatasetBase** output) {
+      GraphDefBuilder b;
+      DatasetGraphDefBuilder db(&b);
+      Node* input_node = nullptr;
+      TF_RETURN_IF_ERROR(db.AddParentDataset(ctx, input_, &input_node));
+      string output_node = input_node->name();
+      GraphDef graph_def;
+      TF_RETURN_IF_ERROR(b.ToGraphDef(&graph_def));
+      TF_RETURN_IF_ERROR(ApplyOptimizations(ctx, &graph_def, &output_node));
+
+      Graph graph(OpRegistry::Global());
+      TF_RETURN_IF_ERROR(ImportGraphDef({}, graph_def, &graph, nullptr));
+      std::vector<Tensor> outputs;
+      GraphRunner graph_runner(ctx->env());
+      // Once rewrites that add/modify functions are introduced, we will need to
+      // persist the results in a function library runtime.
+      TF_RETURN_IF_ERROR(graph_runner.Run(&graph, ctx->function_library(), {},
+                                          {output_node}, &outputs));
+      TF_RETURN_IF_ERROR(GetDatasetFromVariantTensor(outputs[0], output));
+      (*output)->Ref();
+      return Status::OK();
+    }
+
+    const DataTypeVector& output_dtypes() const override {
+      return output_types_;
+    }
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      return output_shapes_;
+    }
+
+    string DebugString() const override { return "OptimizeDatasetOp::Dataset"; }
+
+   private:
+    class Iterator : public DatasetIterator<Dataset> {
+     public:
+      explicit Iterator(const Params& params)
+          : DatasetIterator<Dataset>(params) {}
+
+      Status Initialize(IteratorContext* ctx) override {
+        return errors::Unimplemented(strings::StrCat(prefix(), "::Initialize"));
+      }
+
+      Status GetNextInternal(IteratorContext* ctx,
+                             std::vector<Tensor>* out_tensors,
+                             bool* end_of_sequence) override {
+        return errors::Unimplemented(
+            strings::StrCat(prefix(), "::GetNextInternal"));
+      }
+    };
+
+    Status ApplyOptimizations(OpKernelContext* ctx, GraphDef* graph_def,
+                              string* output_node) {
+      // Add a fake sink node to allow rewriting the actual sink node.
+      NodeDef* node = graph_def->mutable_node()->Add();
+      node->set_name("FakeSink");
+      node->set_op("IdentityDataset");
+      node->add_input(*output_node);
+      {
+        grappler::GraphView graph(graph_def);
+        NodeDef* sink = graph.GetNode(*output_node);
+        (*node->mutable_attr())["output_shapes"] =
+            sink->attr().at("output_shapes");
+        (*node->mutable_attr())["output_types"] =
+            sink->attr().at("output_types");
+      }
+
+      // Create metagraph.
+      MetaGraphDef meta_graph_def;
+      (*meta_graph_def.mutable_graph_def()) = *graph_def;
+
+      // Grappler determines fetch ops from collection 'train_op'.
+      CollectionDef collection_def;
+      auto node_list = collection_def.mutable_node_list();
+      node_list->add_value("FakeSink");
+      (*meta_graph_def.mutable_collection_def())["train_op"] = collection_def;
+
+      // Create Grappler item.
+      tensorflow::RewriterConfig rewriter_config;
+      for (const string& optimization : optimizations_) {
+        rewriter_config.add_optimizers(optimization);
+      }
+      // If no optimizations were specified, supply a non-existent optimization
+      // to prevent Grappler from applying the default set of optimizations as
+      // some of them do not work out of the box at the moment (e.g. because we
+      // have no cost model for dataset ops).
+      if (optimizations_.empty()) {
+        rewriter_config.add_optimizers("non-existent");
+      }
+      tensorflow::grappler::ItemConfig item_config;
+      item_config.apply_optimizations = true;
+      std::unique_ptr<tensorflow::grappler::GrapplerItem> grappler_item =
+          tensorflow::grappler::GrapplerItemFromMetaGraphDef(
+              "graph", meta_graph_def, item_config);
+      std::unordered_map<string, tensorflow::DeviceProperties> device_map;
+      tensorflow::grappler::VirtualCluster cluster(device_map);
+
+      // Run optimizer.
+      TF_RETURN_IF_ERROR(tensorflow::grappler::RunMetaOptimizer(
+          *grappler_item, rewriter_config, ctx->device(), &cluster, graph_def));
+
+      // Set `output_node` to the input of the fake sink node.
+      {
+        grappler::GraphView graph(graph_def);
+        grappler::GraphView::InputPort input_port =
+            graph.GetInputPort("FakeSink", 0);
+        *output_node = graph.GetRegularFanin(input_port).node->name();
+      }
+
+      return Status::OK();
+    }
+
+    const DatasetBase* input_;
+    const std::vector<string> optimizations_;
+    const DataTypeVector output_types_;
+    const std::vector<PartialTensorShape> output_shapes_;
+  };
+
+  const int graph_def_version_;
+  DataTypeVector output_types_;
+  std::vector<PartialTensorShape> output_shapes_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("OptimizeDataset").Device(DEVICE_CPU),
+                        OptimizeDatasetOp);
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/ops/dataset_ops.cc b/tensorflow/core/ops/dataset_ops.cc
index 6d7d8630a7..9bc6c9a30d 100644
--- a/tensorflow/core/ops/dataset_ops.cc
+++ b/tensorflow/core/ops/dataset_ops.cc
@@ -698,4 +698,24 @@ REGISTER_OP("DatasetToTFRecord")
     .Input("compression_type: string")
     .SetShapeFn(shape_inference::NoOutputs);
 
+REGISTER_OP("DatasetToGraph")
+    .Input("input_dataset: variant")
+    .Output("graph: string")
+    .SetShapeFn(shape_inference::ScalarShape);
+
+REGISTER_OP("IdentityDataset")
+    .Input("input_dataset: variant")
+    .Output("handle: variant")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn(shape_inference::ScalarShape);
+
+REGISTER_OP("OptimizeDataset")
+    .Input("input_dataset: variant")
+    .Input("optimizations: string")
+    .Output("handle: variant")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn(shape_inference::ScalarShape);
+
 }  // namespace tensorflow
diff --git a/tensorflow/python/data/kernel_tests/BUILD b/tensorflow/python/data/kernel_tests/BUILD
index ed0c11e6c1..c8fabc4363 100644
--- a/tensorflow/python/data/kernel_tests/BUILD
+++ b/tensorflow/python/data/kernel_tests/BUILD
@@ -72,6 +72,17 @@ tf_py_test(
     ],
 )
 
+tf_py_test(
+    name = "dataset_ops_test",
+    size = "small",
+    srcs = ["dataset_ops_test.py"],
+    additional_deps = [
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+)
+
 tf_py_test(
     name = "filter_dataset_op_test",
     size = "small",
diff --git a/tensorflow/python/data/kernel_tests/dataset_ops_test.py b/tensorflow/python/data/kernel_tests/dataset_ops_test.py
new file mode 100644
index 0000000000..2c4c11e132
--- /dev/null
+++ b/tensorflow/python/data/kernel_tests/dataset_ops_test.py
@@ -0,0 +1,37 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for the input pipeline ops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.core.framework import graph_pb2
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.platform import test
+
+
+class DatasetOpsTest(test.TestCase):
+
+  def testAsSerializedGraph(self):
+    dataset = dataset_ops.Dataset.range(10)
+    with self.test_session() as sess:
+      graph = graph_pb2.GraphDef().FromString(
+          sess.run(dataset._as_serialized_graph()))
+      self.assertTrue(any([node.op != "RangeDataset" for node in graph.node]))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/ops/dataset_ops.py b/tensorflow/python/data/ops/dataset_ops.py
index 6f9b12b123..ea5fc2099c 100644
--- a/tensorflow/python/data/ops/dataset_ops.py
+++ b/tensorflow/python/data/ops/dataset_ops.py
@@ -57,6 +57,15 @@ class Dataset(object):
   def __init__(self):
     pass
 
+  def _as_serialized_graph(self):
+    """Produces serialized graph representation of the dataset.
+
+    Returns:
+      A scalar `tf.Tensor` of `tf.string` type, representing this dataset as a
+      serialized graph.
+    """
+    return gen_dataset_ops.dataset_to_graph(self._as_variant_tensor())
+
   @abc.abstractmethod
   def _as_variant_tensor(self):
     """Creates a scalar `tf.Tensor` of `tf.variant` representing this dataset.
-- 
GitLab


From ee8b826051b789882ff885a0fe0c552bcc35f033 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sun, 3 Jun 2018 19:17:44 -0700
Subject: [PATCH 219/610] Update ops-related pbtxt files.

PiperOrigin-RevId: 199071075
---
 .../core/ops/compat/ops_history.v1.pbtxt      | 61 +++++++++++++++++++
 tensorflow/core/ops/ops.pbtxt                 | 61 +++++++++++++++++++
 2 files changed, 122 insertions(+)

diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index 43dafec6f5..61cc3f7c2e 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -16666,6 +16666,17 @@ op {
     }
   }
 }
+op {
+  name: "DatasetToGraph"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "graph"
+    type: DT_STRING
+  }
+}
 op {
   name: "DatasetToSingleElement"
   input_arg {
@@ -25234,6 +25245,29 @@ op {
     type: "type"
   }
 }
+op {
+  name: "IdentityDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "IdentityN"
   input_arg {
@@ -35198,6 +35232,33 @@ op {
     }
   }
 }
+op {
+  name: "OptimizeDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "optimizations"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "OrderedMapClear"
   attr {
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index 8c7333e7a4..e73e034340 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -7554,6 +7554,17 @@ op {
     }
   }
 }
+op {
+  name: "DatasetToGraph"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "graph"
+    type: DT_STRING
+  }
+}
 op {
   name: "DatasetToSingleElement"
   input_arg {
@@ -12163,6 +12174,29 @@ op {
     type: "type"
   }
 }
+op {
+  name: "IdentityDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "IdentityN"
   input_arg {
@@ -16913,6 +16947,33 @@ op {
     }
   }
 }
+op {
+  name: "OptimizeDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "optimizations"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "OrderedMapClear"
   attr {
-- 
GitLab


From 327d4dc18f977c3236a1c8049648c33bc1b3a4ae Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Sun, 3 Jun 2018 19:45:38 -0700
Subject: [PATCH 220/610] Go: Update generated wrapper functions for TensorFlow
 ops. PiperOrigin-RevId: 199072157

---
 tensorflow/go/op/wrappers.go | 706 +++++++++++++++++------------------
 1 file changed, 353 insertions(+), 353 deletions(-)

diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go
index c9817e4d61..e4f22692d8 100644
--- a/tensorflow/go/op/wrappers.go
+++ b/tensorflow/go/op/wrappers.go
@@ -2771,6 +2771,127 @@ func MatrixDiag(scope *Scope, diagonal tf.Output) (output tf.Output) {
 	return op.Output(0)
 }
 
+// QuantizedInstanceNormAttr is an optional argument to QuantizedInstanceNorm.
+type QuantizedInstanceNormAttr func(optionalAttr)
+
+// QuantizedInstanceNormOutputRangeGiven sets the optional output_range_given attribute to value.
+//
+// value: If True, `given_y_min` and `given_y_min`
+// and `given_y_max` are used as the output range. Otherwise,
+// the implementation computes the output range.
+// If not specified, defaults to false
+func QuantizedInstanceNormOutputRangeGiven(value bool) QuantizedInstanceNormAttr {
+	return func(m optionalAttr) {
+		m["output_range_given"] = value
+	}
+}
+
+// QuantizedInstanceNormGivenYMin sets the optional given_y_min attribute to value.
+//
+// value: Output in `y_min` if `output_range_given` is True.
+// If not specified, defaults to 0
+func QuantizedInstanceNormGivenYMin(value float32) QuantizedInstanceNormAttr {
+	return func(m optionalAttr) {
+		m["given_y_min"] = value
+	}
+}
+
+// QuantizedInstanceNormGivenYMax sets the optional given_y_max attribute to value.
+//
+// value: Output in `y_max` if `output_range_given` is True.
+// If not specified, defaults to 0
+func QuantizedInstanceNormGivenYMax(value float32) QuantizedInstanceNormAttr {
+	return func(m optionalAttr) {
+		m["given_y_max"] = value
+	}
+}
+
+// QuantizedInstanceNormVarianceEpsilon sets the optional variance_epsilon attribute to value.
+//
+// value: A small float number to avoid dividing by 0.
+// If not specified, defaults to 1e-05
+func QuantizedInstanceNormVarianceEpsilon(value float32) QuantizedInstanceNormAttr {
+	return func(m optionalAttr) {
+		m["variance_epsilon"] = value
+	}
+}
+
+// QuantizedInstanceNormMinSeparation sets the optional min_separation attribute to value.
+//
+// value: Minimum value of `y_max - y_min`
+// If not specified, defaults to 0.001
+func QuantizedInstanceNormMinSeparation(value float32) QuantizedInstanceNormAttr {
+	return func(m optionalAttr) {
+		m["min_separation"] = value
+	}
+}
+
+// Quantized Instance normalization.
+//
+// Arguments:
+//	x: A 4D input Tensor.
+//	x_min: The value represented by the lowest quantized input.
+//	x_max: The value represented by the highest quantized input.
+//
+// Returns A 4D Tensor.The value represented by the lowest quantized output.The value represented by the highest quantized output.
+func QuantizedInstanceNorm(scope *Scope, x tf.Output, x_min tf.Output, x_max tf.Output, optional ...QuantizedInstanceNormAttr) (y tf.Output, y_min tf.Output, y_max tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "QuantizedInstanceNorm",
+		Input: []tf.Input{
+			x, x_min, x_max,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// Returns the diagonal part of the tensor.
+//
+// This operation returns a tensor with the `diagonal` part
+// of the `input`. The `diagonal` part is computed as follows:
+//
+// Assume `input` has dimensions `[D1,..., Dk, D1,..., Dk]`, then the output is a
+// tensor of rank `k` with dimensions `[D1,..., Dk]` where:
+//
+// `diagonal[i1,..., ik] = input[i1, ..., ik, i1,..., ik]`.
+//
+// For example:
+//
+// ```
+// # 'input' is [[1, 0, 0, 0]
+//               [0, 2, 0, 0]
+//               [0, 0, 3, 0]
+//               [0, 0, 0, 4]]
+//
+// tf.diag_part(input) ==> [1, 2, 3, 4]
+// ```
+//
+// Arguments:
+//	input: Rank k tensor where k is even and not zero.
+//
+// Returns The extracted diagonal.
+func DiagPart(scope *Scope, input tf.Output) (diagonal tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "DiagPart",
+		Input: []tf.Input{
+			input,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Creates a sequence of numbers.
 //
 // This operation creates a sequence of numbers that begins at `start` and
@@ -2881,57 +3002,6 @@ func StackPopV2(scope *Scope, handle tf.Output, elem_type tf.DataType) (elem tf.
 	return op.Output(0)
 }
 
-// Computes the sum along sparse segments of a tensor.
-//
-// Like `SparseSegmentSum`, but allows missing ids in `segment_ids`. If an id is
-// misisng, the `output` tensor at that position will be zeroed.
-//
-// Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
-// segments.
-//
-// For example:
-//
-// ```python
-// c = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]])
-//
-// tf.sparse_segment_sum_with_num_segments(
-//     c, tf.constant([0, 1]), tf.constant([0, 0]), num_segments=3)
-// # => [[0 0 0 0]
-// #     [0 0 0 0]
-// #     [0 0 0 0]]
-//
-// tf.sparse_segment_sum_with_num_segments(c,
-//                                         tf.constant([0, 1]),
-//                                         tf.constant([0, 2],
-//                                         num_segments=4))
-// # => [[ 1  2  3  4]
-// #     [ 0  0  0  0]
-// #     [-1 -2 -3 -4]
-// #     [ 0  0  0  0]]
-// ```
-//
-// Arguments:
-//
-//	indices: A 1-D tensor. Has same rank as `segment_ids`.
-//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
-//	num_segments: Should equal the number of distinct segment IDs.
-//
-// Returns Has same shape as data, except for dimension 0 which
-// has size `num_segments`.
-func SparseSegmentSumWithNumSegments(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseSegmentSumWithNumSegments",
-		Input: []tf.Input{
-			data, indices, segment_ids, num_segments,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // PreventGradientAttr is an optional argument to PreventGradient.
 type PreventGradientAttr func(optionalAttr)
 
@@ -6852,53 +6922,6 @@ func SerializeTensor(scope *Scope, tensor tf.Output) (serialized tf.Output) {
 	return op.Output(0)
 }
 
-// MatrixSolveAttr is an optional argument to MatrixSolve.
-type MatrixSolveAttr func(optionalAttr)
-
-// MatrixSolveAdjoint sets the optional adjoint attribute to value.
-//
-// value: Boolean indicating whether to solve with `matrix` or its (block-wise)
-// adjoint.
-// If not specified, defaults to false
-func MatrixSolveAdjoint(value bool) MatrixSolveAttr {
-	return func(m optionalAttr) {
-		m["adjoint"] = value
-	}
-}
-
-// Solves systems of linear equations.
-//
-// `Matrix` is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-// form square matrices. `Rhs` is a tensor of shape `[..., M, K]`. The `output` is
-// a tensor shape `[..., M, K]`.  If `adjoint` is `False` then each output matrix
-// satisfies `matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]`.
-// If `adjoint` is `True` then each output matrix satisfies
-// `adjoint(matrix[..., :, :]) * output[..., :, :] = rhs[..., :, :]`.
-//
-// Arguments:
-//	matrix: Shape is `[..., M, M]`.
-//	rhs: Shape is `[..., M, K]`.
-//
-// Returns Shape is `[..., M, K]`.
-func MatrixSolve(scope *Scope, matrix tf.Output, rhs tf.Output, optional ...MatrixSolveAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "MatrixSolve",
-		Input: []tf.Input{
-			matrix, rhs,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Computes acos of x element-wise.
 func Acos(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
@@ -21773,35 +21796,151 @@ func TensorListGetItem(scope *Scope, input_handle tf.Output, index tf.Output, el
 	return op.Output(0)
 }
 
-// Computes the matrix exponential of one or more square matrices:
+// Returns a diagonal tensor with a given diagonal values.
 //
-// exp(A) = \sum_{n=0}^\infty A^n/n!
+// Given a `diagonal`, this operation returns a tensor with the `diagonal` and
+// everything else padded with zeros. The diagonal is computed as follows:
 //
-// The exponential is computed using a combination of the scaling and squaring
-// method and the Pade approximation. Details can be founds in:
-// Nicholas J. Higham, "The scaling and squaring method for the matrix exponential
-// revisited," SIAM J. Matrix Anal. Applic., 26:1179-1193, 2005.
+// Assume `diagonal` has dimensions [D1,..., Dk], then the output is a tensor of
+// rank 2k with dimensions [D1,..., Dk, D1,..., Dk] where:
 //
-// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-// form square matrices. The output is a tensor of the same shape as the input
-// containing the exponential for all input submatrices `[..., :, :]`.
+// `output[i1,..., ik, i1,..., ik] = diagonal[i1, ..., ik]` and 0 everywhere else.
 //
-// Arguments:
-//	input: Shape is `[..., M, M]`.
+// For example:
 //
-// Returns Shape is `[..., M, M]`.
+// ```
+// # 'diagonal' is [1, 2, 3, 4]
+// tf.diag(diagonal) ==> [[1, 0, 0, 0]
+//                        [0, 2, 0, 0]
+//                        [0, 0, 3, 0]
+//                        [0, 0, 0, 4]]
+// ```
 //
-// @compatibility(scipy)
-// Equivalent to scipy.linalg.expm
-// @end_compatibility
-func MatrixExponential(scope *Scope, input tf.Output) (output tf.Output) {
+// Arguments:
+//	diagonal: Rank k tensor where k is at most 1.
+func Diag(scope *Scope, diagonal tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "MatrixExponential",
+		Type: "Diag",
 		Input: []tf.Input{
-			input,
+			diagonal,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ParameterizedTruncatedNormalAttr is an optional argument to ParameterizedTruncatedNormal.
+type ParameterizedTruncatedNormalAttr func(optionalAttr)
+
+// ParameterizedTruncatedNormalSeed sets the optional seed attribute to value.
+//
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func ParameterizedTruncatedNormalSeed(value int64) ParameterizedTruncatedNormalAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// ParameterizedTruncatedNormalSeed2 sets the optional seed2 attribute to value.
+//
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func ParameterizedTruncatedNormalSeed2(value int64) ParameterizedTruncatedNormalAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Outputs random values from a normal distribution. The parameters may each be a
+//
+// scalar which applies to the entire output, or a vector of length shape[0] which
+// stores the parameters for each batch.
+//
+// Arguments:
+//	shape: The shape of the output tensor. Batches are indexed by the 0th dimension.
+//	means: The mean parameter of each batch.
+//	stdevs: The standard deviation parameter of each batch. Must be greater than 0.
+//	minvals: The minimum cutoff. May be -infinity.
+//	maxvals: The maximum cutoff. May be +infinity, and must be more than the minval
+// for each batch.
+//
+// Returns A matrix of shape num_batches x samples_per_batch, filled with random
+// truncated normal values using the parameters for each row.
+func ParameterizedTruncatedNormal(scope *Scope, shape tf.Output, means tf.Output, stdevs tf.Output, minvals tf.Output, maxvals tf.Output, optional ...ParameterizedTruncatedNormalAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ParameterizedTruncatedNormal",
+		Input: []tf.Input{
+			shape, means, stdevs, minvals, maxvals,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Sets the index-th position of the list to contain the given tensor.
+//
+// input_handle: the list
+// index: the position in the list to which the tensor will be assigned
+// item: the element to be assigned to that position
+// output_handle: the new list, with the element in the proper position
+//
+func TensorListSetItem(scope *Scope, input_handle tf.Output, index tf.Output, item tf.Output) (output_handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorListSetItem",
+		Input: []tf.Input{
+			input_handle, index, item,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes the matrix exponential of one or more square matrices:
+//
+// exp(A) = \sum_{n=0}^\infty A^n/n!
+//
+// The exponential is computed using a combination of the scaling and squaring
+// method and the Pade approximation. Details can be founds in:
+// Nicholas J. Higham, "The scaling and squaring method for the matrix exponential
+// revisited," SIAM J. Matrix Anal. Applic., 26:1179-1193, 2005.
+//
+// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices. The output is a tensor of the same shape as the input
+// containing the exponential for all input submatrices `[..., :, :]`.
+//
+// Arguments:
+//	input: Shape is `[..., M, M]`.
+//
+// Returns Shape is `[..., M, M]`.
+//
+// @compatibility(scipy)
+// Equivalent to scipy.linalg.expm
+// @end_compatibility
+func MatrixExponential(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "MatrixExponential",
+		Input: []tf.Input{
+			input,
 		},
 	}
 	op := scope.AddOperation(opspec)
@@ -22148,6 +22287,53 @@ func AdjustSaturation(scope *Scope, images tf.Output, scale tf.Output) (output t
 	return op.Output(0)
 }
 
+// MatrixSolveAttr is an optional argument to MatrixSolve.
+type MatrixSolveAttr func(optionalAttr)
+
+// MatrixSolveAdjoint sets the optional adjoint attribute to value.
+//
+// value: Boolean indicating whether to solve with `matrix` or its (block-wise)
+// adjoint.
+// If not specified, defaults to false
+func MatrixSolveAdjoint(value bool) MatrixSolveAttr {
+	return func(m optionalAttr) {
+		m["adjoint"] = value
+	}
+}
+
+// Solves systems of linear equations.
+//
+// `Matrix` is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices. `Rhs` is a tensor of shape `[..., M, K]`. The `output` is
+// a tensor shape `[..., M, K]`.  If `adjoint` is `False` then each output matrix
+// satisfies `matrix[..., :, :] * output[..., :, :] = rhs[..., :, :]`.
+// If `adjoint` is `True` then each output matrix satisfies
+// `adjoint(matrix[..., :, :]) * output[..., :, :] = rhs[..., :, :]`.
+//
+// Arguments:
+//	matrix: Shape is `[..., M, M]`.
+//	rhs: Shape is `[..., M, K]`.
+//
+// Returns Shape is `[..., M, K]`.
+func MatrixSolve(scope *Scope, matrix tf.Output, rhs tf.Output, optional ...MatrixSolveAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "MatrixSolve",
+		Input: []tf.Input{
+			matrix, rhs,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // SvdAttr is an optional argument to Svd.
 type SvdAttr func(optionalAttr)
 
@@ -25358,6 +25544,57 @@ func CacheDataset(scope *Scope, input_dataset tf.Output, filename tf.Output, out
 	return op.Output(0)
 }
 
+// Computes the sum along sparse segments of a tensor.
+//
+// Like `SparseSegmentSum`, but allows missing ids in `segment_ids`. If an id is
+// misisng, the `output` tensor at that position will be zeroed.
+//
+// Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
+// segments.
+//
+// For example:
+//
+// ```python
+// c = tf.constant([[1,2,3,4], [-1,-2,-3,-4], [5,6,7,8]])
+//
+// tf.sparse_segment_sum_with_num_segments(
+//     c, tf.constant([0, 1]), tf.constant([0, 0]), num_segments=3)
+// # => [[0 0 0 0]
+// #     [0 0 0 0]
+// #     [0 0 0 0]]
+//
+// tf.sparse_segment_sum_with_num_segments(c,
+//                                         tf.constant([0, 1]),
+//                                         tf.constant([0, 2],
+//                                         num_segments=4))
+// # => [[ 1  2  3  4]
+// #     [ 0  0  0  0]
+// #     [-1 -2 -3 -4]
+// #     [ 0  0  0  0]]
+// ```
+//
+// Arguments:
+//
+//	indices: A 1-D tensor. Has same rank as `segment_ids`.
+//	segment_ids: A 1-D tensor. Values should be sorted and can be repeated.
+//	num_segments: Should equal the number of distinct segment IDs.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `num_segments`.
+func SparseSegmentSumWithNumSegments(scope *Scope, data tf.Output, indices tf.Output, segment_ids tf.Output, num_segments tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseSegmentSumWithNumSegments",
+		Input: []tf.Input{
+			data, indices, segment_ids, num_segments,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Creates a dataset that executes a SQL query and emits rows of the result set.
 //
 // Arguments:
@@ -27245,122 +27482,6 @@ func TensorArrayConcatV3(scope *Scope, handle tf.Output, flow_in tf.Output, dtyp
 	return op.Output(0), op.Output(1)
 }
 
-// ParameterizedTruncatedNormalAttr is an optional argument to ParameterizedTruncatedNormal.
-type ParameterizedTruncatedNormalAttr func(optionalAttr)
-
-// ParameterizedTruncatedNormalSeed sets the optional seed attribute to value.
-//
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func ParameterizedTruncatedNormalSeed(value int64) ParameterizedTruncatedNormalAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// ParameterizedTruncatedNormalSeed2 sets the optional seed2 attribute to value.
-//
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func ParameterizedTruncatedNormalSeed2(value int64) ParameterizedTruncatedNormalAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// Outputs random values from a normal distribution. The parameters may each be a
-//
-// scalar which applies to the entire output, or a vector of length shape[0] which
-// stores the parameters for each batch.
-//
-// Arguments:
-//	shape: The shape of the output tensor. Batches are indexed by the 0th dimension.
-//	means: The mean parameter of each batch.
-//	stdevs: The standard deviation parameter of each batch. Must be greater than 0.
-//	minvals: The minimum cutoff. May be -infinity.
-//	maxvals: The maximum cutoff. May be +infinity, and must be more than the minval
-// for each batch.
-//
-// Returns A matrix of shape num_batches x samples_per_batch, filled with random
-// truncated normal values using the parameters for each row.
-func ParameterizedTruncatedNormal(scope *Scope, shape tf.Output, means tf.Output, stdevs tf.Output, minvals tf.Output, maxvals tf.Output, optional ...ParameterizedTruncatedNormalAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ParameterizedTruncatedNormal",
-		Input: []tf.Input{
-			shape, means, stdevs, minvals, maxvals,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Sets the index-th position of the list to contain the given tensor.
-//
-// input_handle: the list
-// index: the position in the list to which the tensor will be assigned
-// item: the element to be assigned to that position
-// output_handle: the new list, with the element in the proper position
-//
-func TensorListSetItem(scope *Scope, input_handle tf.Output, index tf.Output, item tf.Output) (output_handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "TensorListSetItem",
-		Input: []tf.Input{
-			input_handle, index, item,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns a diagonal tensor with a given diagonal values.
-//
-// Given a `diagonal`, this operation returns a tensor with the `diagonal` and
-// everything else padded with zeros. The diagonal is computed as follows:
-//
-// Assume `diagonal` has dimensions [D1,..., Dk], then the output is a tensor of
-// rank 2k with dimensions [D1,..., Dk, D1,..., Dk] where:
-//
-// `output[i1,..., ik, i1,..., ik] = diagonal[i1, ..., ik]` and 0 everywhere else.
-//
-// For example:
-//
-// ```
-// # 'diagonal' is [1, 2, 3, 4]
-// tf.diag(diagonal) ==> [[1, 0, 0, 0]
-//                        [0, 2, 0, 0]
-//                        [0, 0, 3, 0]
-//                        [0, 0, 0, 4]]
-// ```
-//
-// Arguments:
-//	diagonal: Rank k tensor where k is at most 1.
-func Diag(scope *Scope, diagonal tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Diag",
-		Input: []tf.Input{
-			diagonal,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Split the data from the input value into TensorArray elements.
 //
 // Assuming that `lengths` takes on values
@@ -30589,124 +30710,3 @@ func ZerosLike(scope *Scope, x tf.Output) (y tf.Output) {
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
-
-// QuantizedInstanceNormAttr is an optional argument to QuantizedInstanceNorm.
-type QuantizedInstanceNormAttr func(optionalAttr)
-
-// QuantizedInstanceNormOutputRangeGiven sets the optional output_range_given attribute to value.
-//
-// value: If True, `given_y_min` and `given_y_min`
-// and `given_y_max` are used as the output range. Otherwise,
-// the implementation computes the output range.
-// If not specified, defaults to false
-func QuantizedInstanceNormOutputRangeGiven(value bool) QuantizedInstanceNormAttr {
-	return func(m optionalAttr) {
-		m["output_range_given"] = value
-	}
-}
-
-// QuantizedInstanceNormGivenYMin sets the optional given_y_min attribute to value.
-//
-// value: Output in `y_min` if `output_range_given` is True.
-// If not specified, defaults to 0
-func QuantizedInstanceNormGivenYMin(value float32) QuantizedInstanceNormAttr {
-	return func(m optionalAttr) {
-		m["given_y_min"] = value
-	}
-}
-
-// QuantizedInstanceNormGivenYMax sets the optional given_y_max attribute to value.
-//
-// value: Output in `y_max` if `output_range_given` is True.
-// If not specified, defaults to 0
-func QuantizedInstanceNormGivenYMax(value float32) QuantizedInstanceNormAttr {
-	return func(m optionalAttr) {
-		m["given_y_max"] = value
-	}
-}
-
-// QuantizedInstanceNormVarianceEpsilon sets the optional variance_epsilon attribute to value.
-//
-// value: A small float number to avoid dividing by 0.
-// If not specified, defaults to 1e-05
-func QuantizedInstanceNormVarianceEpsilon(value float32) QuantizedInstanceNormAttr {
-	return func(m optionalAttr) {
-		m["variance_epsilon"] = value
-	}
-}
-
-// QuantizedInstanceNormMinSeparation sets the optional min_separation attribute to value.
-//
-// value: Minimum value of `y_max - y_min`
-// If not specified, defaults to 0.001
-func QuantizedInstanceNormMinSeparation(value float32) QuantizedInstanceNormAttr {
-	return func(m optionalAttr) {
-		m["min_separation"] = value
-	}
-}
-
-// Quantized Instance normalization.
-//
-// Arguments:
-//	x: A 4D input Tensor.
-//	x_min: The value represented by the lowest quantized input.
-//	x_max: The value represented by the highest quantized input.
-//
-// Returns A 4D Tensor.The value represented by the lowest quantized output.The value represented by the highest quantized output.
-func QuantizedInstanceNorm(scope *Scope, x tf.Output, x_min tf.Output, x_max tf.Output, optional ...QuantizedInstanceNormAttr) (y tf.Output, y_min tf.Output, y_max tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "QuantizedInstanceNorm",
-		Input: []tf.Input{
-			x, x_min, x_max,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// Returns the diagonal part of the tensor.
-//
-// This operation returns a tensor with the `diagonal` part
-// of the `input`. The `diagonal` part is computed as follows:
-//
-// Assume `input` has dimensions `[D1,..., Dk, D1,..., Dk]`, then the output is a
-// tensor of rank `k` with dimensions `[D1,..., Dk]` where:
-//
-// `diagonal[i1,..., ik] = input[i1, ..., ik, i1,..., ik]`.
-//
-// For example:
-//
-// ```
-// # 'input' is [[1, 0, 0, 0]
-//               [0, 2, 0, 0]
-//               [0, 0, 3, 0]
-//               [0, 0, 0, 4]]
-//
-// tf.diag_part(input) ==> [1, 2, 3, 4]
-// ```
-//
-// Arguments:
-//	input: Rank k tensor where k is even and not zero.
-//
-// Returns The extracted diagonal.
-func DiagPart(scope *Scope, input tf.Output) (diagonal tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "DiagPart",
-		Input: []tf.Input{
-			input,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-- 
GitLab


From 320d8056af7799ab20e339757cf379963148425a Mon Sep 17 00:00:00 2001
From: "freedom\" Koan-Sin Tan" <koansin.tan@gmail.com>
Date: Mon, 4 Jun 2018 12:49:17 +0800
Subject: [PATCH 221/610] make toco build for android (#17885)

* make toco build for android

for ARMv8
`
bazel build --config android_arm64 --cxxopt=-std=c++11 --linkopt="-llog" //tensorflow/contrib/lite/toco:toco   --config monolithic
`
for ARMv7a
`
bazel build --config android_arm --cxxopt=-std=c++11 --linkopt="-llog" //tensorflow/contrib/lite/toco:toco   --config monolithic
`

* revert out-of-tflite patch

will do it in another PR

* revert out-of-tflite patch

will do it in another PR
---
 tensorflow/contrib/lite/toco/toco_port.cc |  8 ++++++++
 tensorflow/contrib/lite/toco/toco_port.h  | 18 ++++++++++++++++++
 2 files changed, 26 insertions(+)

diff --git a/tensorflow/contrib/lite/toco/toco_port.cc b/tensorflow/contrib/lite/toco/toco_port.cc
index a1c8696cd0..49a3302caf 100644
--- a/tensorflow/contrib/lite/toco/toco_port.cc
+++ b/tensorflow/contrib/lite/toco/toco_port.cc
@@ -18,6 +18,14 @@ limitations under the License.
 #include "tensorflow/contrib/lite/toco/toco_types.h"
 #include "tensorflow/core/platform/logging.h"
 
+#ifdef __ARM_ARCH_7A__
+namespace std {
+double round(double x) {
+  return ::round(x);
+}
+}
+#endif
+
 namespace toco {
 namespace port {
 void CopyToBuffer(const string& src, char* dest) {
diff --git a/tensorflow/contrib/lite/toco/toco_port.h b/tensorflow/contrib/lite/toco/toco_port.h
index 906792ef56..b00b1e89e8 100644
--- a/tensorflow/contrib/lite/toco/toco_port.h
+++ b/tensorflow/contrib/lite/toco/toco_port.h
@@ -33,6 +33,24 @@ limitations under the License.
 #define TFLITE_PROTO_NS google::protobuf
 #endif
 
+#ifdef __ANDROID__
+#include <sstream>
+namespace std {
+
+template <typename T>
+std::string to_string(T value)
+{
+    std::ostringstream os ;
+    os << value ;
+    return os.str() ;
+}
+
+#ifdef __ARM_ARCH_7A__
+double round(double x);
+#endif
+}
+#endif
+
 namespace toco {
 namespace port {
 
-- 
GitLab


From 63dafb7f5dbef4da63e095595a49f5d5d7258af9 Mon Sep 17 00:00:00 2001
From: ImSheridan <xiaoyudong0512@gmail.com>
Date: Mon, 4 Jun 2018 12:50:12 +0800
Subject: [PATCH 222/610] Fix print function with tf_logging.info to keep
 consistency (#18423)

* Fix print function with tf_logging.info to keep consistency

* fix minor typo

* fix pylint errors

* Fix minor pylint errors

* Fix lint error
---
 .../fused_conv2d_bias_activation_op_test.py   | 11 ++++---
 .../python/kernel_tests/betainc_op_test.py    |  4 +--
 .../python/kernel_tests/conv_ops_test.py      | 32 +++++++++----------
 .../python/kernel_tests/pooling_ops_test.py   |  4 +--
 .../tools/quantization/quantize_graph_test.py | 12 +++----
 5 files changed, 32 insertions(+), 31 deletions(-)

diff --git a/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test.py b/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test.py
index 3d0ed89932..4d62ac65ff 100644
--- a/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test.py
+++ b/tensorflow/contrib/fused_conv/python/ops/fused_conv2d_bias_activation_op_test.py
@@ -289,8 +289,8 @@ class FusedConv2DBiasActivationTest(test.TestCase):
           conv = tensors[i]
           value = values[i]
           ref_value = ref_values[i]
-          print("expected = ", ref_value)
-          print("actual = ", value)
+          tf_logging.info("expected = %s", ref_value)
+          tf_logging.info("actual = %s", value)
           tol = 1e-5
           if value.dtype == np.float16:
             tol = 1e-3
@@ -831,7 +831,8 @@ class FusedConvInt8Tests(test.TestCase):
                                                 vertical_stride, padding_type)
     output_width = CalculateConvolvedOutputDim(input_width, filter_width,
                                                horizontal_stride, padding_type)
-    print("output_height=", output_height, ", output_width=", output_width)
+    tf_logging.info("output_height=%s, output_width=%s", output_height,
+                    output_width)
 
     side_input, _, _ = gen_array_ops.quantize_v2(
         random_ops.random_uniform(
@@ -866,8 +867,8 @@ class FusedConvInt8Tests(test.TestCase):
 
     with self.test_session(use_gpu=True) as sess:
       actual_y, expected_y = sess.run([actual, expected])
-      print("actual_y = ", actual_y)
-      print("expected_y = ", expected_y)
+      tf_logging.info("actual_y = %s", actual_y)
+      tf_logging.info("expected_y = %s", expected_y)
       self.assertTrue(np.array_equal(actual_y, expected_y))
 
   def testFusedConvInt8(self):
diff --git a/tensorflow/python/kernel_tests/betainc_op_test.py b/tensorflow/python/kernel_tests/betainc_op_test.py
index 08b03f8518..16fdedac41 100644
--- a/tensorflow/python/kernel_tests/betainc_op_test.py
+++ b/tensorflow/python/kernel_tests/betainc_op_test.py
@@ -172,7 +172,7 @@ class BetaincTest(test.TestCase):
       tf_gout_t = math_ops.betainc(tf_ga_s, tf_gb_s, tf_gx_s)
       err = gradient_checker.compute_gradient_error(
           [tf_gx_s], [gx_s.shape], tf_gout_t, gx_s.shape)
-      print("betainc gradient err = %g " % err)
+      tf_logging.info("betainc gradient err = %g ", err)
       self.assertLess(err, err_tolerance)
 
       # Test broadcast gradient
@@ -181,7 +181,7 @@ class BetaincTest(test.TestCase):
       tf_gout_t = math_ops.betainc(tf_ga_s, tf_gb_s, tf_gx_s)
       err = gradient_checker.compute_gradient_error(
           [tf_gx_s], [()], tf_gout_t, ga_s.shape)
-      print("betainc gradient err = %g " % err)
+      tf_logging.info("betainc gradient err = %g " % err)
       self.assertLess(err, err_tolerance)
 
 
diff --git a/tensorflow/python/kernel_tests/conv_ops_test.py b/tensorflow/python/kernel_tests/conv_ops_test.py
index a291bef0ad..450428707d 100644
--- a/tensorflow/python/kernel_tests/conv_ops_test.py
+++ b/tensorflow/python/kernel_tests/conv_ops_test.py
@@ -312,8 +312,8 @@ class Conv2DTest(test.TestCase):
       expected_values = self.evaluate(expected_results)
       computed_values = self.evaluate(computed_results)
       for e_value, c_value in zip(expected_values, computed_values):
-        print("expected = ", e_value)
-        print("actual = ", c_value)
+        tf_logging.info("expected = ", e_value)
+        tf_logging.info("actual = ", c_value)
         self.assertAllClose(
             e_value.flatten(), c_value.flatten(), atol=tolerance, rtol=1e-4)
 
@@ -337,8 +337,8 @@ class Conv2DTest(test.TestCase):
       for i in range(len(tensors)):
         conv = tensors[i]
         value = values[i]
-        print("expected = ", expected)
-        print("actual = ", value)
+        tf_logging.info("expected = ", expected)
+        tf_logging.info("actual = ", value)
         tol = 1e-5
         if value.dtype == np.float16:
           tol = 1e-3
@@ -547,8 +547,8 @@ class Conv2DTest(test.TestCase):
       # "values" consists of two tensors for two backprops
       value = self.evaluate(conv)
       self.assertShapeEqual(value, conv)
-    print("expected = ", expected)
-    print("actual = ", value)
+    tf_logging.info("expected = ", expected)
+    tf_logging.info("actual = ", value)
     self.assertArrayNear(expected, value.flatten(), err)
 
   def _CompareBackpropInput(self, input_sizes, filter_sizes, output_sizes,
@@ -723,8 +723,8 @@ class Conv2DTest(test.TestCase):
             data_format=data_format)
         value = self.evaluate(conv)
         self.assertShapeEqual(value, conv)
-      print("expected = ", expected)
-      print("actual = ", value)
+      tf_logging.info("expected = ", expected)
+      tf_logging.info("actual = ", value)
       self.assertArrayNear(expected, value.flatten(), 1e-5)
 
   def _CompareBackFilter(self, input_sizes, filter_sizes, output_sizes,
@@ -912,8 +912,8 @@ class Conv2DTest(test.TestCase):
         value_2 = sess.run(conv_2)
         self.assertShapeEqual(value, conv)
         self.assertShapeEqual(value_2, conv_2)
-      print("expected = ", value_2)
-      print("actual = ", value)
+      tf_logging.info("expected = ", value_2)
+      tf_logging.info("actual = ", value)
       self.assertArrayNear(value_2.flatten(), value.flatten(), err)
 
   # Testing for backprops
@@ -965,8 +965,8 @@ class Conv2DTest(test.TestCase):
         value_2 = sess.run(conv_2)
         self.assertShapeEqual(value, conv)
         self.assertShapeEqual(value_2, conv_2)
-      print("expected = ", value_2)
-      print("actual = ", value)
+      tf_logging.info("expected = ", value_2)
+      tf_logging.info("actual = ", value)
       self.assertArrayNear(value_2.flatten(), value.flatten(), err)
 
   def testConv2D2x2Depth3ValidBackpropFilterStride1x1Dilation2x1(self):
@@ -1178,7 +1178,7 @@ class Conv2DTest(test.TestCase):
           # since fp16 numerical gradients are too imprecise.
           err = np.fabs(jacob_t - reference_jacob_t).max()
 
-        print("conv_2d gradient error = ", err)
+        tf_logging.info("conv_2d gradient error = ", err)
         self.assertLess(err, 0.002)
 
   def testInputGradientValidPaddingStrideOne(self):
@@ -1546,7 +1546,7 @@ class DepthwiseConv2DTest(test.TestCase):
       conv = nn_impl.depthwise_conv2d(
           t1, t2, strides=[1, stride, stride, 1], padding=padding)
       value = sess.run(conv)
-    print("value = ", value)
+    tf_logging.info("value = ", value)
     self.assertArrayNear(expected, np.ravel(value), 1e-5)
     self.assertShapeEqual(value, conv)
 
@@ -1668,7 +1668,7 @@ class SeparableConv2DTest(test.TestCase):
         conv = array_ops.transpose(conv, [0, 2, 3, 1])
 
       value = sess.run(conv)
-    print("value = ", value)
+    tf_logging.info("value = ", value)
     self.assertArrayNear(expected, np.ravel(value), 1e-5)
     self.assertShapeEqual(value, conv)
 
@@ -1826,7 +1826,7 @@ class Conv2DBenchmark(test.Benchmark):
         wall_time = time.time() - start
         self.report_benchmark(
             name="conv_stack_iter_%d" % iter_index, wall_time=wall_time)
-        print("conv_stack_iter_%d: %.4f" % (iter_index, wall_time))
+        tf_logging.info("conv_stack_iter_%d: %.4f" % (iter_index, wall_time))
 
 
 def GetInceptionFwdTest(input_size, filter_size, stride, padding,
diff --git a/tensorflow/python/kernel_tests/pooling_ops_test.py b/tensorflow/python/kernel_tests/pooling_ops_test.py
index a0c372db7d..e95c729715 100644
--- a/tensorflow/python/kernel_tests/pooling_ops_test.py
+++ b/tensorflow/python/kernel_tests/pooling_ops_test.py
@@ -947,7 +947,7 @@ class PoolingTest(test.TestCase):
           output_sizes,
           x_init_value=x_init_value,
           delta=1e-2)
-    print("%s gradient error = " % func_name, err)
+    tf_logging.info("%s gradient error = " % func_name, err)
     self.assertLess(err, err_tolerance)
 
   def _ConstructAndTestSecondGradient(self,
@@ -1024,7 +1024,7 @@ class PoolingTest(test.TestCase):
           input_sizes,
           x_init_value=x_init_value,
           delta=1e-2)
-    print("%s second-order gradient error = " % func_name, err)
+    tf_logging.info("%s second-order gradient error = " % func_name, err)
     self.assertLess(err, err_tolerance)
 
   def _testMaxPoolGradValidPadding1_1(self, data_format, use_gpu):
diff --git a/tensorflow/tools/quantization/quantize_graph_test.py b/tensorflow/tools/quantization/quantize_graph_test.py
index df71840b64..92bb5127da 100644
--- a/tensorflow/tools/quantization/quantize_graph_test.py
+++ b/tensorflow/tools/quantization/quantize_graph_test.py
@@ -119,8 +119,8 @@ def are_tensors_near(a, b, tolerance):
   flat_a = a.flatten()
   flat_b = b.flatten()
   if len(flat_a) != len(flat_b):
-    print("Tensors are different sizes: " + str(len(flat_a)) + " vs " + str(
-        len(flat_b)))
+    tf_logging.info("Tensors are different sizes: " + str(len(flat_a)) + " vs "
+                    + str(len(flat_b)))
     return False
   value_count = len(flat_a)
   how_many_different = 0
@@ -140,10 +140,10 @@ def are_tensors_near(a, b, tolerance):
   if how_many_different == 0:
     return True
   else:
-    print("Tensors have {0} different values ({1}%), with mean difference"
-          " {2} and mean absolute difference {3}".format(
-              how_many_different, proportion_different * 100, mean_difference,
-              mean_abs_difference))
+    tf_logging.info("Tensors have {0} different values ({1}%), with mean"
+                    " difference {2} and mean absolute difference {3}".format(
+                        how_many_different, proportion_different * 100,
+                        mean_difference, mean_abs_difference))
     return False
 
 
-- 
GitLab


From fd9246a308e77c6c27d5bddcc6646525f3ce5e7b Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Sun, 3 Jun 2018 21:51:32 -0700
Subject: [PATCH 223/610] Switch from tf.contrib.metrics to tf.metrics (#18783)

* Switch from tf.contrib.metrics to tf.metrics

Many of the functions in `tf.contrib.metrics` have been
deprecated in favor of `tf.metrics`. This fix
switches from `tf.contrib.metrics` to `tf.metrics`.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Change to `tf.metrics` in evaluation_test.py

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Fix pylint issue

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 .../slim/python/slim/evaluation_test.py       | 25 ++++++-----
 .../tensor_forest/client/eval_metrics.py      | 45 ++++++++++---------
 2 files changed, 37 insertions(+), 33 deletions(-)

diff --git a/tensorflow/contrib/slim/python/slim/evaluation_test.py b/tensorflow/contrib/slim/python/slim/evaluation_test.py
index 94fc12ca81..3d0308aaf3 100644
--- a/tensorflow/contrib/slim/python/slim/evaluation_test.py
+++ b/tensorflow/contrib/slim/python/slim/evaluation_test.py
@@ -26,7 +26,6 @@ import time
 import numpy as np
 
 from tensorflow.contrib.framework.python.ops import variables as variables_lib
-from tensorflow.contrib.metrics.python.ops import metric_ops
 from tensorflow.contrib.slim.python.slim import evaluation
 from tensorflow.contrib.training.python.training import evaluation as evaluation_lib
 from tensorflow.core.protobuf import saver_pb2
@@ -37,6 +36,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import metrics
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import flags
 from tensorflow.python.platform import gfile
@@ -89,8 +89,8 @@ class EvaluationTest(test.TestCase):
     self._predictions, self._scale = TestModel(self._inputs)
 
   def testFinalOpsOnEvaluationLoop(self):
-    value_op, update_op = metric_ops.streaming_accuracy(self._predictions,
-                                                        self._labels)
+    value_op, update_op = metrics.accuracy(
+        labels=self._labels, predictions=self._predictions)
     init_op = control_flow_ops.group(variables.global_variables_initializer(),
                                      variables.local_variables_initializer())
     # Create checkpoint and log directories:
@@ -136,9 +136,10 @@ class EvaluationTest(test.TestCase):
     self.assertTrue(obj.hook_was_run)
 
   def _create_names_to_metrics(self, predictions, labels):
-    accuracy0, update_op0 = metric_ops.streaming_accuracy(predictions, labels)
-    accuracy1, update_op1 = metric_ops.streaming_accuracy(predictions + 1,
-                                                          labels)
+    accuracy0, update_op0 = metrics.accuracy(
+        labels=labels, predictions=predictions)
+    accuracy1, update_op1 = metrics.accuracy(
+        labels=labels, predictions=predictions + 1)
 
     names_to_values = {'Accuracy': accuracy0, 'Another_accuracy': accuracy1}
     names_to_updates = {'Accuracy': update_op0, 'Another_accuracy': update_op1}
@@ -198,8 +199,8 @@ class EvaluationTest(test.TestCase):
     predictions_limited = input.limit_epochs(self._predictions, num_epochs=1)
     labels_limited = input.limit_epochs(self._labels, num_epochs=1)
 
-    value_op, update_op = metric_ops.streaming_accuracy(
-        predictions_limited, labels_limited)
+    value_op, update_op = metrics.accuracy(
+        labels=labels_limited, predictions=predictions_limited)
 
     init_op = control_flow_ops.group(variables.global_variables_initializer(),
                                      variables.local_variables_initializer())
@@ -260,8 +261,8 @@ class SingleEvaluationTest(test.TestCase):
     self._prepareCheckpoint(checkpoint_path)
 
     # Next, determine the metric to evaluate:
-    value_op, update_op = metric_ops.streaming_accuracy(self._predictions,
-                                                        self._labels)
+    value_op, update_op = metrics.accuracy(
+        labels=self._labels, predictions=self._predictions)
 
     # Run the evaluation and verify the results:
     accuracy_value = evaluation.evaluate_once(
@@ -276,8 +277,8 @@ class SingleEvaluationTest(test.TestCase):
     self._prepareCheckpoint(checkpoint_path)
 
     # Next, determine the metric to evaluate:
-    value_op, update_op = metric_ops.streaming_accuracy(self._predictions,
-                                                        self._labels)
+    value_op, update_op = metrics.accuracy(
+        labels=self._labels, predictions=self._predictions)
 
     dumping_root = os.path.join(self.get_temp_dir(), 'tfdbg_dump_dir')
     dumping_hook = hooks.DumpingDebugHook(dumping_root, log_usage=False)
diff --git a/tensorflow/contrib/tensor_forest/client/eval_metrics.py b/tensorflow/contrib/tensor_forest/client/eval_metrics.py
index e893e1d1c8..d8236a0a6f 100644
--- a/tensorflow/contrib/tensor_forest/client/eval_metrics.py
+++ b/tensorflow/contrib/tensor_forest/client/eval_metrics.py
@@ -21,10 +21,10 @@ import numpy as np
 
 from tensorflow.contrib import losses
 from tensorflow.contrib.learn.python.learn.estimators import prediction_key
-from tensorflow.contrib.metrics.python.ops import metric_ops
 
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import metrics
 from tensorflow.python.ops import nn
 
 INFERENCE_PROB_NAME = prediction_key.PredictionKey.PROBABILITIES
@@ -38,12 +38,13 @@ def _top_k_generator(k):
     targets = math_ops.to_int32(targets)
     if targets.get_shape().ndims > 1:
       targets = array_ops.squeeze(targets, axis=[1])
-    return metric_ops.streaming_mean(nn.in_top_k(probabilities, targets, k))
+    return metrics.mean(nn.in_top_k(probabilities, targets, k))
   return _top_k
 
 
 def _accuracy(predictions, targets, weights=None):
-  return metric_ops.streaming_accuracy(predictions, targets, weights=weights)
+  return metrics.accuracy(
+      labels=targets, predictions=predictions, weights=weights)
 
 
 def _r2(probabilities, targets, weights=None):
@@ -53,7 +54,7 @@ def _r2(probabilities, targets, weights=None):
   squares_residuals = math_ops.reduce_sum(
       math_ops.square(targets - probabilities), 0)
   score = 1 - math_ops.reduce_sum(squares_residuals / squares_total)
-  return metric_ops.streaming_mean(score, weights=weights)
+  return metrics.mean(score, weights=weights)
 
 
 def _squeeze_and_onehot(targets, depth):
@@ -62,7 +63,7 @@ def _squeeze_and_onehot(targets, depth):
 
 
 def _sigmoid_entropy(probabilities, targets, weights=None):
-  return metric_ops.streaming_mean(
+  return metrics.mean(
       losses.sigmoid_cross_entropy(probabilities,
                                    _squeeze_and_onehot(
                                        targets,
@@ -71,7 +72,7 @@ def _sigmoid_entropy(probabilities, targets, weights=None):
 
 
 def _softmax_entropy(probabilities, targets, weights=None):
-  return metric_ops.streaming_mean(
+  return metrics.mean(
       losses.sparse_softmax_cross_entropy(probabilities,
                                           math_ops.to_int32(targets)),
       weights=weights)
@@ -82,7 +83,7 @@ def _predictions(predictions, unused_targets, **unused_kwargs):
 
 
 def _class_log_loss(probabilities, targets, weights=None):
-  return metric_ops.streaming_mean(
+  return metrics.mean(
       losses.log_loss(probabilities,
                       _squeeze_and_onehot(targets,
                                           array_ops.shape(probabilities)[1])),
@@ -90,34 +91,36 @@ def _class_log_loss(probabilities, targets, weights=None):
 
 
 def _precision(predictions, targets, weights=None):
-  return metric_ops.streaming_precision(predictions, targets, weights=weights)
+  return metrics.precision(
+      labels=targets, predictions=predictions, weights=weights)
 
 
 def _precision_at_thresholds(predictions, targets, weights=None):
-  return metric_ops.streaming_precision_at_thresholds(
-      array_ops.slice(predictions, [0, 1], [-1, 1]),
-      targets,
-      np.arange(
-          0, 1, 0.01, dtype=np.float32),
+  return metrics.precision_at_thresholds(
+      labels=targets,
+      predictions=array_ops.slice(predictions, [0, 1], [-1, 1]),
+      thresholds=np.arange(0, 1, 0.01, dtype=np.float32),
       weights=weights)
 
 
 def _recall(predictions, targets, weights=None):
-  return metric_ops.streaming_recall(predictions, targets, weights=weights)
+  return metrics.recall(
+      labels=targets, predictions=predictions, weights=weights)
 
 
 def _recall_at_thresholds(predictions, targets, weights=None):
-  return metric_ops.streaming_recall_at_thresholds(
-      array_ops.slice(predictions, [0, 1], [-1, 1]),
-      targets,
-      np.arange(
-          0, 1, 0.01, dtype=np.float32),
+  return metrics.recall_at_thresholds(
+      labels=targets,
+      predictions=array_ops.slice(predictions, [0, 1], [-1, 1]),
+      thresholds=np.arange(0, 1, 0.01, dtype=np.float32),
       weights=weights)
 
 
 def _auc(probs, targets, weights=None):
-  return metric_ops.streaming_auc(array_ops.slice(probs, [0, 1], [-1, 1]),
-                                  targets, weights=weights)
+  return metrics.auc(
+      labels=targets,
+      predictions=array_ops.slice(probs, [0, 1], [-1, 1]),
+      weights=weights)
 
 
 _EVAL_METRICS = {
-- 
GitLab


From 96788111224e05de619ac2049fb696ae39f1c257 Mon Sep 17 00:00:00 2001
From: Martin Zeitler <syslogic@users.noreply.github.com>
Date: Mon, 4 Jun 2018 06:59:42 +0200
Subject: [PATCH 224/610] Update WORKSPACE (#19638)

* Update WORKSPACE

There was a typo in the comment; the comment was also rewritten as a complete sentence.

* Update WORKSPACE
---
 WORKSPACE | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/WORKSPACE b/WORKSPACE
index 4ddfb9a383..44baf78f49 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -37,9 +37,9 @@ load("//tensorflow:workspace.bzl", "tf_workspace")
 #    name="androidndk",
 #    path="<PATH_TO_NDK>",
 #    # This needs to be 14 or higher to compile TensorFlow.
-#    # Please specify API level to >= 21 to build for 64-bit
-#    # archtectures or the Android NDK will automatically select biggest
-#    # API level that it supports without notice.
+#    # Please specify API level >= 21 to build for 64-bit architecture
+#    # otherwise the Android NDK will automatically select the latest
+#    # API level it does support without notice.
 #    # Note that the NDK version is not the API level.
 #    api_level=14)
 
-- 
GitLab


From 44c191906d1e4041b490512facc028a23585717b Mon Sep 17 00:00:00 2001
From: Lukas Geiger <lgeiger@users.noreply.github.com>
Date: Mon, 4 Jun 2018 07:00:10 +0200
Subject: [PATCH 225/610] Support session config in tf.contrib.predictor
 (#19542)

* Support session config in tf.contrib.predictor

This PR allows users to supply a custom session config used by the predictor.

This can be essential for some GPU setups in order to play nicely with other processes running on the same GPU.

* Test passing session config to tf.contrib.predictor
---
 .../predictor/contrib_estimator_predictor.py  |  5 +++-
 .../predictor/core_estimator_predictor.py     |  5 +++-
 .../contrib/predictor/predictor_factories.py  | 24 ++++++++++++++-----
 .../predictor/predictor_factories_test.py     | 19 +++++++++++++++
 .../predictor/saved_model_predictor.py        |  6 +++--
 5 files changed, 49 insertions(+), 10 deletions(-)

diff --git a/tensorflow/contrib/predictor/contrib_estimator_predictor.py b/tensorflow/contrib/predictor/contrib_estimator_predictor.py
index b7a98c68e2..af3b2ad1b5 100644
--- a/tensorflow/contrib/predictor/contrib_estimator_predictor.py
+++ b/tensorflow/contrib/predictor/contrib_estimator_predictor.py
@@ -34,7 +34,8 @@ class ContribEstimatorPredictor(predictor.Predictor):
                prediction_input_fn,
                input_alternative_key=None,
                output_alternative_key=None,
-               graph=None):
+               graph=None,
+               config=None):
     """Initialize a `ContribEstimatorPredictor`.
 
     Args:
@@ -48,6 +49,7 @@ class ContribEstimatorPredictor(predictor.Predictor):
         multi-headed models.
       graph: Optional. The Tensorflow `graph` in which prediction should be
         done.
+      config: `ConfigProto` proto used to configure the session.
     """
     self._graph = graph or ops.Graph()
     with self._graph.as_default():
@@ -58,6 +60,7 @@ class ContribEstimatorPredictor(predictor.Predictor):
       checkpoint_path = saver.latest_checkpoint(estimator.model_dir)
       self._session = monitored_session.MonitoredSession(
           session_creator=monitored_session.ChiefSessionCreator(
+              config=config,
               checkpoint_filename_with_path=checkpoint_path))
 
     input_alternative_key = (
diff --git a/tensorflow/contrib/predictor/core_estimator_predictor.py b/tensorflow/contrib/predictor/core_estimator_predictor.py
index d78d94c269..a725072e72 100644
--- a/tensorflow/contrib/predictor/core_estimator_predictor.py
+++ b/tensorflow/contrib/predictor/core_estimator_predictor.py
@@ -51,7 +51,8 @@ class CoreEstimatorPredictor(predictor.Predictor):
                estimator,
                serving_input_receiver_fn,
                output_key=None,
-               graph=None):
+               graph=None,
+               config=None):
     """Initialize a `CoreEstimatorPredictor`.
 
     Args:
@@ -62,6 +63,7 @@ class CoreEstimatorPredictor(predictor.Predictor):
         `None`, then `DEFAULT_SERVING_SIGNATURE_DEF_KEY` is used.
       graph: Optional. The Tensorflow `graph` in which prediction should be
         done.
+      config: `ConfigProto` proto used to configure the session.
     """
     self._graph = graph or ops.Graph()
     with self._graph.as_default():
@@ -71,6 +73,7 @@ class CoreEstimatorPredictor(predictor.Predictor):
       checkpoint_dir = estimator.model_dir
       self._session = monitored_session.MonitoredSession(
           session_creator=monitored_session.ChiefSessionCreator(
+              config=config,
               checkpoint_dir=checkpoint_dir))
 
     feed_tensor_info = signature_def.inputs
diff --git a/tensorflow/contrib/predictor/predictor_factories.py b/tensorflow/contrib/predictor/predictor_factories.py
index 6e77e934fe..f275bc15ad 100644
--- a/tensorflow/contrib/predictor/predictor_factories.py
+++ b/tensorflow/contrib/predictor/predictor_factories.py
@@ -30,7 +30,8 @@ def from_contrib_estimator(estimator,
                            prediction_input_fn,
                            input_alternative_key=None,
                            output_alternative_key=None,
-                           graph=None):
+                           graph=None,
+                           config=None):
   """Constructs a `Predictor` from a `tf.contrib.learn.Estimator`.
 
   Args:
@@ -44,6 +45,7 @@ def from_contrib_estimator(estimator,
       multi-headed models.
     graph: Optional. The Tensorflow `graph` in which prediction should be
       done.
+    config: `ConfigProto` proto used to configure the session.
 
   Returns:
     An initialized `Predictor`.
@@ -62,13 +64,15 @@ def from_contrib_estimator(estimator,
       prediction_input_fn,
       input_alternative_key=input_alternative_key,
       output_alternative_key=output_alternative_key,
-      graph=graph)
+      graph=graph,
+      config=config)
 
 
 def from_estimator(estimator,
                    serving_input_receiver_fn,
                    output_key=None,
-                   graph=None):
+                   graph=None,
+                   config=None):
   """Constructs a `Predictor` from a `tf.python.estimator.Estimator`.
 
   Args:
@@ -79,6 +83,7 @@ def from_estimator(estimator,
       `None`, then `DEFAULT_SERVING_SIGNATURE_DEF_KEY` is used.
     graph: Optional. The Tensorflow `graph` in which prediction should be
       done.
+    config: `ConfigProto` proto used to configure the session.
 
   Returns:
     An initialized `Predictor`.
@@ -93,14 +98,19 @@ def from_estimator(estimator,
                     'tf.contrib.learn.Estimator. You likely want to call '
                     'from_contrib_estimator.')
   return core_estimator_predictor.CoreEstimatorPredictor(
-      estimator, serving_input_receiver_fn, output_key=output_key, graph=graph)
+      estimator,
+      serving_input_receiver_fn,
+      output_key=output_key,
+      graph=graph,
+      config=config)
 
 
 def from_saved_model(export_dir,
                      signature_def_key=None,
                      signature_def=None,
                      tags=None,
-                     graph=None):
+                     graph=None,
+                     config=None):
   """Constructs a `Predictor` from a `SavedModel` on disk.
 
   Args:
@@ -115,6 +125,7 @@ def from_saved_model(export_dir,
       `SignatureDef`. Defaults to `DEFAULT_TAGS`.
     graph: Optional. The Tensorflow `graph` in which prediction should be
       done.
+    config: `ConfigProto` proto used to configure the session.
 
   Returns:
     An initialized `Predictor`.
@@ -128,4 +139,5 @@ def from_saved_model(export_dir,
       signature_def_key=signature_def_key,
       signature_def=signature_def,
       tags=tags,
-      graph=graph)
+      graph=graph,
+      config=config)
diff --git a/tensorflow/contrib/predictor/predictor_factories_test.py b/tensorflow/contrib/predictor/predictor_factories_test.py
index 578d9424b2..a2ef1dc3af 100644
--- a/tensorflow/contrib/predictor/predictor_factories_test.py
+++ b/tensorflow/contrib/predictor/predictor_factories_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 from tensorflow.contrib.predictor import predictor_factories
 from tensorflow.contrib.predictor import testing_common
+from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.platform import test
 
 MODEL_DIR_NAME = 'contrib/predictor/test_export_dir'
@@ -41,6 +42,11 @@ class PredictorFactoriesTest(test.TestCase):
     """Test loading from_saved_model with tags."""
     predictor_factories.from_saved_model(self._export_dir, tags='serve')
 
+  def testFromSavedModelWithSessionConfig(self):
+    """Test loading from_saved_model with session config."""
+    predictor_factories.from_saved_model(
+        self._export_dir, config=config_pb2.ConfigProto())
+
   def testFromSavedModelWithBadTags(self):
     """Test that loading fails for bad tags."""
     bad_tags_regex = ('.*? could not be found in SavedModel')
@@ -53,6 +59,13 @@ class PredictorFactoriesTest(test.TestCase):
     predictor_factories.from_contrib_estimator(
         estimator, input_fn, output_alternative_key='sum')
 
+  def testFromContribEstimatorWithSessionConfig(self):
+    estimator = testing_common.get_arithmetic_estimator(core=False)
+    input_fn = testing_common.get_arithmetic_input_fn(core=False)
+    predictor_factories.from_contrib_estimator(
+        estimator, input_fn, output_alternative_key='sum',
+        config=config_pb2.ConfigProto())
+
   def testFromContribEstimatorWithCoreEstimatorRaises(self):
     estimator = testing_common.get_arithmetic_estimator(core=True)
     input_fn = testing_common.get_arithmetic_input_fn(core=True)
@@ -64,6 +77,12 @@ class PredictorFactoriesTest(test.TestCase):
     input_fn = testing_common.get_arithmetic_input_fn(core=True)
     predictor_factories.from_estimator(estimator, input_fn)
 
+  def testFromCoreEstimatorWithSessionConfig(self):
+    estimator = testing_common.get_arithmetic_estimator(core=True)
+    input_fn = testing_common.get_arithmetic_input_fn(core=True)
+    predictor_factories.from_estimator(
+        estimator, input_fn, config=config_pb2.ConfigProto())
+
   def testFromCoreEstimatorWithContribEstimatorRaises(self):
     estimator = testing_common.get_arithmetic_estimator(core=False)
     input_fn = testing_common.get_arithmetic_input_fn(core=False)
diff --git a/tensorflow/contrib/predictor/saved_model_predictor.py b/tensorflow/contrib/predictor/saved_model_predictor.py
index 0dbca0f813..95da6d04ed 100644
--- a/tensorflow/contrib/predictor/saved_model_predictor.py
+++ b/tensorflow/contrib/predictor/saved_model_predictor.py
@@ -121,7 +121,8 @@ class SavedModelPredictor(predictor.Predictor):
                input_names=None,
                output_names=None,
                tags=None,
-               graph=None):
+               graph=None,
+               config=None):
     """Initialize a `CoreEstimatorPredictor`.
 
     Args:
@@ -142,6 +143,7 @@ class SavedModelPredictor(predictor.Predictor):
         the correct `SignatureDef`. Defaults to `DEFAULT_TAGS`.
       graph: Optional. The Tensorflow `graph` in which prediction should be
         done.
+      config: `ConfigProto` proto used to configure the session.
     Raises:
       ValueError: If more than one of signature_def_key OR signature_def OR
         (input_names AND output_names) is specified.
@@ -152,7 +154,7 @@ class SavedModelPredictor(predictor.Predictor):
     self._graph = graph or ops.Graph()
 
     with self._graph.as_default():
-      self._session = session.Session()
+      self._session = session.Session(config=config)
       loader.load(self._session, tags.split(','), export_dir)
 
     if input_names is None:
-- 
GitLab


From 59cdb17757a66cd328dd71048ddcd9237218f53a Mon Sep 17 00:00:00 2001
From: Roland Zimmermann <FlashTek@users.noreply.github.com>
Date: Mon, 4 Jun 2018 07:00:42 +0200
Subject: [PATCH 226/610] Add batch support for random tf.image.random_flip_*
 (#19537)

* Add batch support for random tf.image.random_flip_*

* Fixed imports

* Added tests for batched data for tf.image.random_flip_*

* Fixed typo

* Fixed dimension/shape error in _random_flip

* Added unit tests (not benchmarks) for batched data for tf.image.random_flip_*

* Refactored the tf.image.random_flip_* unit tests

* Updated testPartialShapes unit test in image_ops_test

* Fixed formatting

* Fixed imports
---
 tensorflow/python/ops/image_ops_impl.py |  50 ++++---
 tensorflow/python/ops/image_ops_test.py | 178 ++++++++++++++++++------
 2 files changed, 167 insertions(+), 61 deletions(-)

diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py
index e907fc470b..4a32f2351b 100644
--- a/tensorflow/python/ops/image_ops_impl.py
+++ b/tensorflow/python/ops/image_ops_impl.py
@@ -28,6 +28,7 @@ from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import gen_image_ops
 from tensorflow.python.ops import gen_nn_ops
 from tensorflow.python.ops import math_ops
@@ -258,14 +259,14 @@ def random_flip_up_down(image, seed=None):
   dimension, which is `height`.  Otherwise output the image as-is.
 
   Args:
-    image: A 3-D tensor of shape `[height, width, channels].`
+    image: 4-D Tensor of shape `[batch, height, width, channels]` or
+           3-D Tensor of shape `[height, width, channels]`.
     seed: A Python integer. Used to create a random seed. See
       @{tf.set_random_seed}
       for behavior.
 
   Returns:
-    A 3-D tensor of the same type and shape as `image`.
-
+    A tensor of the same type and shape as `image`.
   Raises:
     ValueError: if the shape of `image` not supported.
   """
@@ -280,13 +281,14 @@ def random_flip_left_right(image, seed=None):
   second dimension, which is `width`.  Otherwise output the image as-is.
 
   Args:
-    image: A 3-D tensor of shape `[height, width, channels].`
+    image: 4-D Tensor of shape `[batch, height, width, channels]` or
+           3-D Tensor of shape `[height, width, channels]`.
     seed: A Python integer. Used to create a random seed. See
       @{tf.set_random_seed}
       for behavior.
 
   Returns:
-    A 3-D tensor of the same type and shape as `image`.
+    A tensor of the same type and shape as `image`.
 
   Raises:
     ValueError: if the shape of `image` not supported.
@@ -297,7 +299,8 @@ def random_flip_left_right(image, seed=None):
 def _random_flip(image, flip_index, seed, scope_name):
   """Randomly (50% chance) flip an image along axis `flip_index`.
     Args:
-      image: A 3-D tensor of shape `[height, width, channels].`
+      image: 4-D Tensor of shape `[batch, height, width, channels]` or
+             3-D Tensor of shape `[height, width, channels]`.
       flip_index: The dimension along which to flip the image.
                   Vertical: 0, Horizontal: 1
       seed: A Python integer. Used to create a random seed. See
@@ -306,22 +309,37 @@ def _random_flip(image, flip_index, seed, scope_name):
       scope_name: Name of the scope in which the ops are added.
 
     Returns:
-      A 3-D tensor of the same type and shape as `image`.
+      A tensor of the same type and shape as `image`.
 
     Raises:
       ValueError: if the shape of `image` not supported.
   """
   with ops.name_scope(None, scope_name, [image]) as scope:
     image = ops.convert_to_tensor(image, name='image')
-    image = _Assert3DImage(image)
-    uniform_random = random_ops.random_uniform([], 0, 1.0, seed=seed)
-    mirror_cond = math_ops.less(uniform_random, .5)
-    result = control_flow_ops.cond(
-        mirror_cond,
-        lambda: array_ops.reverse(image, [flip_index]),
-        lambda: image,
-        name=scope)
-    return fix_image_flip_shape(image, result)
+    image = _AssertAtLeast3DImage(image)
+    shape = image.get_shape()
+    if shape.ndims == 3 or shape.ndims is None:
+      uniform_random = random_ops.random_uniform([], 0, 1.0, seed=seed)
+      mirror_cond = math_ops.less(uniform_random, .5)
+      result = control_flow_ops.cond(
+          mirror_cond,
+          lambda: array_ops.reverse(image, [flip_index]),
+          lambda: image,
+          name=scope
+      )
+      return fix_image_flip_shape(image, result)
+    elif shape.ndims == 4:
+      uniform_random = random_ops.random_uniform(
+          [array_ops.shape(image)[0]], 0, 1.0, seed=seed
+      )
+      mirror_cond = math_ops.less(uniform_random, .5)
+      return array_ops.where(
+          mirror_cond,
+          image,
+          functional_ops.map_fn(lambda x: array_ops.reverse(x, [flip_index]), image, dtype=image.dtype)
+      )
+    else:
+      raise ValueError('\'image\' must have either 3 or 4 dimensions.')
 
 
 @tf_export('image.flip_left_right')
diff --git a/tensorflow/python/ops/image_ops_test.py b/tensorflow/python/ops/image_ops_test.py
index 72c889a2e6..d50ff3fb60 100644
--- a/tensorflow/python/ops/image_ops_test.py
+++ b/tensorflow/python/ops/image_ops_test.py
@@ -533,6 +533,37 @@ class FlipImageBenchmark(test.Benchmark):
         iters=benchmark_rounds,
         wall_time=step_time)
 
+  def _benchmarkBatchedRandomFlipLeftRight(self, device, cpu_count):
+    image_shape = [16, 299, 299, 3]
+    warmup_rounds = 100
+    benchmark_rounds = 1000
+    config = config_pb2.ConfigProto()
+    if cpu_count is not None:
+      config.inter_op_parallelism_threads = 1
+      config.intra_op_parallelism_threads = cpu_count
+    with session.Session("", graph=ops.Graph(), config=config) as sess:
+      with ops.device(device):
+        inputs = variables.Variable(
+            random_ops.random_uniform(image_shape, dtype=dtypes.float32) * 255,
+            trainable=False,
+            dtype=dtypes.float32)
+        run_op = image_ops.random_flip_left_right(inputs)
+        sess.run(variables.global_variables_initializer())
+        for i in xrange(warmup_rounds + benchmark_rounds):
+          if i == warmup_rounds:
+            start = time.time()
+          sess.run(run_op)
+    end = time.time()
+    step_time = (end - start) / benchmark_rounds
+    tag = device + "_%s" % (cpu_count if cpu_count is not None else "_all")
+    print("benchmarkBatchedRandomFlipLeftRight_16_299_299_3_%s step_time: "
+          "%.2f us" %
+          (tag, step_time * 1e6))
+    self.report_benchmark(
+        name="benchmarkBatchedRandomFlipLeftRight_16_299_299_3_%s" % (tag),
+        iters=benchmark_rounds,
+        wall_time=step_time)
+
   def benchmarkFlipLeftRightCpu1(self):
     self._benchmarkFlipLeftRight("/cpu:0", 1)
 
@@ -551,6 +582,15 @@ class FlipImageBenchmark(test.Benchmark):
   def benchmarkRandomFlipLeftRightGpu(self):
     self._benchmarkRandomFlipLeftRight(test.gpu_device_name(), None)
 
+  def benchmarkBatchedRandomFlipLeftRightCpu1(self):
+    self._benchmarkBatchedRandomFlipLeftRight("/cpu:0", 1)
+
+  def benchmarkBatchedRandomFlipLeftRightCpuAll(self):
+    self._benchmarkBatchedRandomFlipLeftRight("/cpu:0", None)
+
+  def benchmarkBatchedRandomFlipLeftRightGpu(self):
+    self._benchmarkBatchedRandomFlipLeftRight(test.gpu_device_name(), None)
+
 
 class AdjustHueBenchmark(test.Benchmark):
 
@@ -987,7 +1027,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
 
     with self.test_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
-      y = image_ops.random_flip_left_right(x_tf)
+      y = image_ops.random_flip_left_right(x_tf, seed=seed)
       self.assertTrue(y.op.name.startswith("random_flip_left_right"))
 
       count_flipped = 0
@@ -1008,6 +1048,50 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
       self.assertGreaterEqual(count_flipped, 20)
       self.assertGreaterEqual(count_unflipped, 20)
 
+  def testRandomFlipLeftRightWithBatch(self):
+    batch_size = 16
+    seed = 42
+
+    # create single item of test data
+    x_np_raw = np.array(
+        [[1, 2, 3], [1, 2, 3]], dtype=np.uint8
+    ).reshape([1, 2, 3, 1])
+    y_np_raw = np.array(
+        [[3, 2, 1], [3, 2, 1]], dtype=np.uint8
+    ).reshape([1, 2, 3, 1])
+
+    # create batched test data
+    x_np = np.vstack([x_np_raw for _ in range(batch_size)])
+    y_np = np.vstack([y_np_raw for _ in range(batch_size)])
+
+    with self.test_session(use_gpu=True):
+      x_tf = constant_op.constant(x_np, shape=x_np.shape)
+      y = image_ops.random_flip_left_right(x_tf, seed=seed)
+      self.assertTrue(y.op.name.startswith("random_flip_left_right"))
+
+      count_flipped = 0
+      count_unflipped = 0
+      for _ in range(100):
+        y_tf = y.eval()
+
+        # check every element of the batch
+        for i in range(batch_size):
+          if y_tf[i][0][0] == 1:
+            self.assertAllEqual(y_tf[i], x_np[i])
+            count_unflipped += 1
+          else:
+            self.assertAllEqual(y_tf[i], y_np[i])
+            count_flipped += 1
+
+      # 100 trials, each containing batch_size elements
+      # Mean: 50 * batch_size
+      # Std Dev: ~5 * sqrt(batch_size)
+      # Six Sigma: 50 * batch_size - (5 * 6 * sqrt(batch_size))
+      #          = 50 * batch_size - 30 * sqrt(batch_size) = 800 - 30 * 4 = 680
+      six_sigma = 50 * batch_size - 30 * np.sqrt(batch_size)
+      self.assertGreaterEqual(count_flipped, six_sigma)
+      self.assertGreaterEqual(count_unflipped, six_sigma)
+
   def testInvolutionUpDown(self):
     x_np = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.uint8).reshape([2, 3, 1])
 
@@ -1057,9 +1141,11 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
     x_np = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.uint8).reshape([2, 3, 1])
     y_np = np.array([[4, 5, 6], [1, 2, 3]], dtype=np.uint8).reshape([2, 3, 1])
 
+    seed = 42
+
     with self.test_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
-      y = image_ops.random_flip_up_down(x_tf, seed=42)
+      y = image_ops.random_flip_up_down(x_tf, seed=seed)
       self.assertTrue(y.op.name.startswith("random_flip_up_down"))
       count_flipped = 0
       count_unflipped = 0
@@ -1079,6 +1165,50 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
       self.assertGreaterEqual(count_flipped, 20)
       self.assertGreaterEqual(count_unflipped, 20)
 
+  def testRandomFlipUpDownWithBatch(self):
+    batch_size = 16
+    seed = 42
+
+    # create single item of test data
+    x_np_raw = np.array(
+        [[1, 2, 3], [4, 5, 6]], dtype=np.uint8
+    ).reshape([1, 2, 3, 1])
+    y_np_raw = np.array(
+        [[4, 5, 6], [1, 2, 3]], dtype=np.uint8
+    ).reshape([1, 2, 3, 1])
+
+    # create batched test data
+    x_np = np.vstack([x_np_raw for _ in range(batch_size)])
+    y_np = np.vstack([y_np_raw for _ in range(batch_size)])
+
+    with self.test_session(use_gpu=True):
+      x_tf = constant_op.constant(x_np, shape=x_np.shape)
+      y = image_ops.random_flip_up_down(x_tf, seed=seed)
+      self.assertTrue(y.op.name.startswith("random_flip_up_down"))
+
+      count_flipped = 0
+      count_unflipped = 0
+      for _ in range(100):
+        y_tf = y.eval()
+
+        # check every element of the batch
+        for i in range(batch_size):
+          if y_tf[i][0][0] == 1:
+            self.assertAllEqual(y_tf[i], x_np[i])
+            count_unflipped += 1
+          else:
+            self.assertAllEqual(y_tf[i], y_np[i])
+            count_flipped += 1
+
+      # 100 trials, each containing batch_size elements
+      # Mean: 50 * batch_size
+      # Std Dev: ~5 * sqrt(batch_size)
+      # Six Sigma: 50 * batch_size - (5 * 6 * sqrt(batch_size))
+      #          = 50 * batch_size - 30 * sqrt(batch_size) = 800 - 30 * 4 = 680
+      six_sigma = 50 * batch_size - 30 * np.sqrt(batch_size)
+      self.assertGreaterEqual(count_flipped, six_sigma)
+      self.assertGreaterEqual(count_unflipped, six_sigma)
+
   def testInvolutionTranspose(self):
     x_np = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.uint8).reshape([2, 3, 1])
 
@@ -1156,6 +1286,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
     #Ops that support 4D input
     for op in [
         image_ops.flip_left_right, image_ops.flip_up_down,
+        image_ops.random_flip_left_right, image_ops.random_flip_up_down,
         image_ops.transpose_image, image_ops.rot90
     ]:
       transformed_unknown_dims_4 = op(p_unknown_dims_4)
@@ -1166,14 +1297,6 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
                                    "must be at least three-dimensional"):
         op(p_wrong_rank)
 
-    for op in [
-        image_ops.random_flip_left_right,
-        image_ops.random_flip_up_down,
-    ]:
-      with self.assertRaisesRegexp(ValueError, "must be three-dimensional"):
-        op(p_wrong_rank)
-
-
   def testRot90GroupOrder(self):
     image = np.arange(24, dtype=np.uint8).reshape([2, 4, 3])
     with self.test_session(use_gpu=True):
@@ -1208,41 +1331,6 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
         y_np = np.rot90(image, k=k, axes=(1, 2))
         self.assertAllEqual(y_np, y_tf.eval({k_placeholder: k}))
 
-class RandomFlipTest(test_util.TensorFlowTestCase):
-
-  def testRandomLeftRight(self):
-    x_np = np.array([0, 1], dtype=np.uint8).reshape([1, 2, 1])
-    num_iterations = 500
-
-    hist = [0, 0]
-    with self.test_session(use_gpu=True):
-      x_tf = constant_op.constant(x_np, shape=x_np.shape)
-      y = image_ops.random_flip_left_right(x_tf)
-      for _ in xrange(num_iterations):
-        y_np = y.eval().flatten()[0]
-        hist[y_np] += 1
-
-    # Ensure that each entry is observed within 4 standard deviations.
-    four_stddev = 4.0 * np.sqrt(num_iterations / 2.0)
-    self.assertAllClose(hist, [num_iterations / 2.0] * 2, atol=four_stddev)
-
-  def testRandomUpDown(self):
-    x_np = np.array([0, 1], dtype=np.uint8).reshape([2, 1, 1])
-    num_iterations = 500
-
-    hist = [0, 0]
-    with self.test_session(use_gpu=True):
-      x_tf = constant_op.constant(x_np, shape=x_np.shape)
-      y = image_ops.random_flip_up_down(x_tf)
-      for _ in xrange(num_iterations):
-        y_np = y.eval().flatten()[0]
-        hist[y_np] += 1
-
-    # Ensure that each entry is observed within 4 standard deviations.
-    four_stddev = 4.0 * np.sqrt(num_iterations / 2.0)
-    self.assertAllClose(hist, [num_iterations / 2.0] * 2, atol=four_stddev)
-
-
 class AdjustContrastTest(test_util.TensorFlowTestCase):
 
   def _testContrast(self, x_np, y_np, contrast_factor):
-- 
GitLab


From 75cc287d71d12ef5e9284c8b1ac44856b8c220c3 Mon Sep 17 00:00:00 2001
From: leiiwang <u2takey@gmail.com>
Date: Mon, 4 Jun 2018 13:03:15 +0800
Subject: [PATCH 227/610] check grpc_testlib_server before start subprocess
 (#19356)

---
 .../core/distributed_runtime/rpc/grpc_testlib.cc       | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_testlib.cc b/tensorflow/core/distributed_runtime/rpc/grpc_testlib.cc
index 89f83f9f24..a8508d2d4f 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_testlib.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_testlib.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include "tensorflow/core/distributed_runtime/rpc/grpc_session.h"
 #include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/util/device_name_utils.h"
 
 namespace tensorflow {
@@ -50,9 +51,14 @@ Status TestCluster::MakeTestCluster(const SessionOptions& options, int n,
   }
 
   for (int i = 0; i < n; ++i) {
+    string server_file =
+        strings::StrCat(testing::TensorFlowSrcRoot(),
+                        "/core/distributed_runtime/rpc/grpc_testlib_server");
+    if (!options.env->FileExists(server_file).ok()) {
+      return errors::Internal("Could not find grpc_testlib_server");
+    }
     const std::vector<string> argv(
-        {strings::StrCat(testing::TensorFlowSrcRoot(),
-                         "/core/distributed_runtime/rpc/grpc_testlib_server"),
+        {server_file,
          /* see grpc_testlib_server.cc for flags */
          tf_jobs, "--tf_job=localhost", strings::StrCat("--tf_task=", i),
          strings::StrCat("--num_cpus=", num_cpus),
-- 
GitLab


From 62cf06bfa94af4c97cdd8ca5c4faab49accc7a12 Mon Sep 17 00:00:00 2001
From: ManHyuk <manhyuk@kw.ac.kr>
Date: Mon, 4 Jun 2018 14:04:07 +0900
Subject: [PATCH 228/610] fix typo (#19689)

---
 tensorflow/contrib/metrics/python/ops/metric_ops.py | 2 +-
 tensorflow/python/ops/nn_impl.py                    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops.py b/tensorflow/contrib/metrics/python/ops/metric_ops.py
index 00a933e5e0..91df5cb07b 100644
--- a/tensorflow/contrib/metrics/python/ops/metric_ops.py
+++ b/tensorflow/contrib/metrics/python/ops/metric_ops.py
@@ -2496,7 +2496,7 @@ def _compute_recall_at_precision(tp, fp, fn, precision, name):
     name: An optional variable_scope name.
 
   Returns:
-    The recall at a the given `precision`.
+    The recall at a given `precision`.
   """
   precisions = math_ops.div(tp, tp + fp + _EPSILON)
   tf_index = math_ops.argmin(
diff --git a/tensorflow/python/ops/nn_impl.py b/tensorflow/python/ops/nn_impl.py
index e2ef1f66b1..f47f38e29e 100644
--- a/tensorflow/python/ops/nn_impl.py
+++ b/tensorflow/python/ops/nn_impl.py
@@ -621,7 +621,7 @@ def normalize_moments(counts, mean_ss, variance_ss, shift, name=None):
   """Calculate the mean and variance of based on the sufficient statistics.
 
   Args:
-    counts: A `Tensor` containing a the total count of the data (one value).
+    counts: A `Tensor` containing the total count of the data (one value).
     mean_ss: A `Tensor` containing the mean sufficient statistics: the (possibly
       shifted) sum of the elements to average over.
     variance_ss: A `Tensor` containing the variance sufficient statistics: the
-- 
GitLab


From 32dcb8f0141468c0d93e0c7166e549c9f36db4a1 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Sun, 3 Jun 2018 22:05:37 -0700
Subject: [PATCH 229/610] Fix broken link in tfmobile documentation (#16993)

This fix fixes the broken link in tfmobile documentation as
android/ had been moved to android/tfmobile/ in the repo
googlecodelabs/tensorflow-for-poets-2

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/docs_src/mobile/linking_libs.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/docs_src/mobile/linking_libs.md b/tensorflow/docs_src/mobile/linking_libs.md
index cf0db59021..efef5dd0da 100644
--- a/tensorflow/docs_src/mobile/linking_libs.md
+++ b/tensorflow/docs_src/mobile/linking_libs.md
@@ -27,7 +27,7 @@ called `libandroid_tensorflow_inference_java.jar`. There are three ways to
 include this functionality in your program:
 
 1. Include the jcenter AAR which contains it, as in this
- [example app](https://github.com/googlecodelabs/tensorflow-for-poets-2/blob/master/android/build.gradle#L59-L65)
+ [example app](https://github.com/googlecodelabs/tensorflow-for-poets-2/blob/master/android/tfmobile/build.gradle#L59-L65)
 
 2. Download the nightly precompiled version from
 [ci.tensorflow.org](http://ci.tensorflow.org/view/Nightly/job/nightly-android/lastSuccessfulBuild/artifact/out/).
-- 
GitLab


From e753372d01124824aadf030d36e06ac5a986516c Mon Sep 17 00:00:00 2001
From: ImSheridan <xiaoyudong0512@gmail.com>
Date: Mon, 4 Jun 2018 13:08:20 +0800
Subject: [PATCH 230/610] Fix the quantized table order by float value for easy
 reading (#17898)

---
 tensorflow/docs_src/performance/quantization.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/docs_src/performance/quantization.md b/tensorflow/docs_src/performance/quantization.md
index 2fea02d861..c97f74139c 100644
--- a/tensorflow/docs_src/performance/quantization.md
+++ b/tensorflow/docs_src/performance/quantization.md
@@ -227,8 +227,8 @@ of 30.0f, and an 8-bit array, the quantized values represent the following:
   <table>
     <tr><th>Quantized</th><th>Float</th></tr>
     <tr><td>0</td><td>-10.0</td></tr>
-    <tr><td>255</td><td>30.0</td></tr>
     <tr><td>128</td><td>10.0</td></tr>
+    <tr><td>255</td><td>30.0</td></tr>
   </table>
   <figcaption>
     <b>Table 2</b>: Example quantized value range
-- 
GitLab


From 506eaaaee694a19d271eba87a8e3f9023931a384 Mon Sep 17 00:00:00 2001
From: ImSheridan <xiaoyudong0512@gmail.com>
Date: Mon, 4 Jun 2018 13:11:34 +0800
Subject: [PATCH 231/610] Fix some minor incorrect anchor links (#18348)

* Fix the incorrect link of PrepareLinux or PrepareMacOS

* Fix incorrect link of common_installation_problems also

* Fix the non-working PrepareLinux anchor
---
 tensorflow/docs_src/install/install_sources.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tensorflow/docs_src/install/install_sources.md b/tensorflow/docs_src/install/install_sources.md
index 5ba522b436..cc29074757 100644
--- a/tensorflow/docs_src/install/install_sources.md
+++ b/tensorflow/docs_src/install/install_sources.md
@@ -81,7 +81,7 @@ or
 [macOS](#PrepareMac)
 
 
-<a name="#PrepareLinux"></a>
+<a name="PrepareLinux"></a>
 ## Prepare environment for Linux
 
 Before building TensorFlow on Linux, install the following build
@@ -373,9 +373,9 @@ The build and installation problems you encounter typically depend on the
 operating system.  See the "Common installation problems" section
 of one of the following guides:
 
-  * @{$install_linux#CommonInstallationProblems$Installing TensorFlow on Linux}
-  * @{$install_mac#CommonInstallationProblems$Installing TensorFlow on Mac OS}
-  * @{$install_windows#CommonInstallationProblems$Installing TensorFlow on Windows}
+  * @{$install_linux#common_installation_problems$Installing TensorFlow on Linux}
+  * @{$install_mac#common_installation_problems$Installing TensorFlow on Mac OS}
+  * @{$install_windows#common_installation_problems$Installing TensorFlow on Windows}
 
 Beyond the errors documented in those two guides, the following table
 notes additional errors specific to building TensorFlow.  Note that we
-- 
GitLab


From b933be02b97cdb42a86548f73697654d4c5d0f56 Mon Sep 17 00:00:00 2001
From: Sergei Lebedev <superbobry@gmail.com>
Date: Mon, 4 Jun 2018 07:12:36 +0200
Subject: [PATCH 232/610] Fallback to dynamic loader even if HADOOP_HDFS_HOME
 is not defined (#19336)

* Fallback to dynamic loader even if HADOOP_HDFS_HOME is not defined

Prior to this commit HadoopFileSystem required HADOOP_HDFS_HOME to be
defined to initialize the filesystem, even if libhdfs.so is located
outside of the standard location. This limitation is unnecessary and
can be safely removed.

As a nice side-effect, the error message is now more informative.

Before:

    Environment variable HADOOP_HDFS_HOME not set

After:

    libhdfs.so: cannot open shared object file: No such file or directory

Change-Id: Ief6a8679d7ef353003aa387f7767ebaa8ef290ce

* Addressed review comments

Change-Id: I703d57e022744e26d1b47732beeaa48c073bd5fc
---
 .../platform/hadoop/hadoop_file_system.cc     | 21 +++++++++----------
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/tensorflow/core/platform/hadoop/hadoop_file_system.cc b/tensorflow/core/platform/hadoop/hadoop_file_system.cc
index 72c12318ca..ff4b4436bb 100644
--- a/tensorflow/core/platform/hadoop/hadoop_file_system.cc
+++ b/tensorflow/core/platform/hadoop/hadoop_file_system.cc
@@ -115,18 +115,17 @@ class LibHDFS {
     const char* kLibHdfsDso = "libhdfs.so";
 #endif
     char* hdfs_home = getenv("HADOOP_HDFS_HOME");
-    if (hdfs_home == nullptr) {
-      status_ = errors::FailedPrecondition(
-          "Environment variable HADOOP_HDFS_HOME not set");
-      return;
-    }
-    string path = io::JoinPath(hdfs_home, "lib", "native", kLibHdfsDso);
-    status_ = TryLoadAndBind(path.c_str(), &handle_);
-    if (!status_.ok()) {
-      // try load libhdfs.so using dynamic loader's search path in case
-      // libhdfs.so is installed in non-standard location
-      status_ = TryLoadAndBind(kLibHdfsDso, &handle_);
+    if (hdfs_home != nullptr) {
+      string path = io::JoinPath(hdfs_home, "lib", "native", kLibHdfsDso);
+      status_ = TryLoadAndBind(path.c_str(), &handle_);
+      if (status_.ok()) {
+        return;
+      }
     }
+
+    // Try to load the library dynamically in case it has been installed
+    // to a non-standard location.
+    status_ = TryLoadAndBind(kLibHdfsDso, &handle_);
   }
 
   Status status_;
-- 
GitLab


From a8ae26ae1aa7a33b48cca8bf12c42ab7503a45cf Mon Sep 17 00:00:00 2001
From: Evgeniy Zheltonozhskiy <zheltonozhskiy@gmail.com>
Date: Mon, 4 Jun 2018 08:12:47 +0300
Subject: [PATCH 233/610] Fix fake quantization link (#19278)

---
 tensorflow/contrib/quantize/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/quantize/README.md b/tensorflow/contrib/quantize/README.md
index c83623ec94..27a933c0f9 100644
--- a/tensorflow/contrib/quantize/README.md
+++ b/tensorflow/contrib/quantize/README.md
@@ -6,7 +6,7 @@ inference. The details of the transformation implemented in this package is
 described here [1].
 
 This is done using the
-[fake quantization op](https://www.tensorflow.org/versions/r0.12/api_docs/python/array_ops/fake_quantization).
+[fake quantization op](https://www.tensorflow.org/api_guides/python/array_ops#Fake_quantization).
 
 Literature has shown that fixed point networks provide comparable performance to
 floating point networks [2]. This is achieved by modeling the quantization
-- 
GitLab


From c36bda171673884c0f3829fac3a342733d6040f8 Mon Sep 17 00:00:00 2001
From: jsawruk <jeremy.sawruk@gmail.com>
Date: Mon, 4 Jun 2018 01:40:23 -0400
Subject: [PATCH 234/610] Update mobile prepare models documentation: correct
 location of freeze_graph (#18968)

---
 tensorflow/docs_src/mobile/prepare_models.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/docs_src/mobile/prepare_models.md b/tensorflow/docs_src/mobile/prepare_models.md
index 8b22c04d87..2b84dbb973 100644
--- a/tensorflow/docs_src/mobile/prepare_models.md
+++ b/tensorflow/docs_src/mobile/prepare_models.md
@@ -105,8 +105,8 @@ inline constants so everything’s in one file.  To handle the conversion, you
 need the `freeze_graph.py` script, that’s held in
 [`tensorflow/python/tools/freeze_graph.py`](https://www.tensorflow.org/code/tensorflow/python/tools/freeze_graph.py). You’ll run it like this:
 
-    bazel build tensorflow/tools:freeze_graph
-    bazel-bin/tensorflow/tools/freeze_graph \
+    bazel build tensorflow/python/tools:freeze_graph
+    bazel-bin/tensorflow/python/tools/freeze_graph \
     --input_graph=/tmp/model/my_graph.pb \
     --input_checkpoint=/tmp/model/model.ckpt-1000 \
     --output_graph=/tmp/frozen_graph.pb \
-- 
GitLab


From a0fd55070bb83e369d1d73e777fc1ea9f1c3a6ae Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Sun, 3 Jun 2018 22:41:13 -0700
Subject: [PATCH 235/610] Replace direct download link with bazel mirror
 (mirror.bazel.build) (#19713)

* Replace direct download link with bazel mirror (mirror.bazel.build)

Since the download package for gemmlowp has been propagated
to the bazel mirror (mirror.bazel.build), this fix replaced the
direct link with the mirrored one, and removed the related TODO.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Remove TODO in tensorflow/contrib/lite/download_dependencies.sh

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/contrib/lite/download_dependencies.sh     | 4 +---
 tensorflow/contrib/makefile/download_dependencies.sh | 4 +---
 2 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/tensorflow/contrib/lite/download_dependencies.sh b/tensorflow/contrib/lite/download_dependencies.sh
index 436c3e1d4c..840015a7fa 100755
--- a/tensorflow/contrib/lite/download_dependencies.sh
+++ b/tensorflow/contrib/lite/download_dependencies.sh
@@ -30,9 +30,7 @@ if [ ! -f $BZL_FILE_PATH ]; then
 fi
 
 EIGEN_URL="$(grep -o 'http.*bitbucket.org/eigen/eigen/get/.*tar\.gz' "${BZL_FILE_PATH}" | grep -v mirror.bazel | head -n1)"
-# TODO (yongtang): Replace the following with 'https://mirror.bazel.build/github.com/google/gemmlowp/.*zip' once
-# the archive has been propagated in mirror.bazel.build.
-GEMMLOWP_URL="$(grep -o 'https://github.com/google/gemmlowp/.*zip' "${BZL_FILE_PATH}" | head -n1)"
+GEMMLOWP_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/gemmlowp/.*zip' "${BZL_FILE_PATH}" | head -n1)"
 GOOGLETEST_URL="https://github.com/google/googletest/archive/release-1.8.0.tar.gz"
 ABSL_URL="$(grep -o 'https://github.com/abseil/abseil-cpp/.*tar.gz' "${BZL_FILE_PATH}" | head -n1)"
 NEON_2_SSE_URL="https://github.com/intel/ARM_NEON_2_x86_SSE/archive/master.zip"
diff --git a/tensorflow/contrib/makefile/download_dependencies.sh b/tensorflow/contrib/makefile/download_dependencies.sh
index eff9081e35..48953e2e38 100755
--- a/tensorflow/contrib/makefile/download_dependencies.sh
+++ b/tensorflow/contrib/makefile/download_dependencies.sh
@@ -27,9 +27,7 @@ if [ ! -f $BZL_FILE_PATH ]; then
 fi
 
 EIGEN_URL="$(grep -o 'http.*bitbucket.org/eigen/eigen/get/.*tar\.gz' "${BZL_FILE_PATH}" | grep -v mirror.bazel | head -n1)"
-# TODO (yongtang): Replace the following with 'https://mirror.bazel.build/github.com/google/gemmlowp/.*zip' once
-# the archive has been propagated in mirror.bazel.build.
-GEMMLOWP_URL="$(grep -o 'https://github.com/google/gemmlowp/.*zip' "${BZL_FILE_PATH}" | head -n1)"
+GEMMLOWP_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/gemmlowp/.*zip' "${BZL_FILE_PATH}" | head -n1)"
 GOOGLETEST_URL="https://github.com/google/googletest/archive/release-1.8.0.tar.gz"
 NSYNC_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/nsync/.*tar\.gz' "${BZL_FILE_PATH}" | head -n1)"
 PROTOBUF_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/protobuf/.*tar\.gz' "${BZL_FILE_PATH}" | head -n1)"
-- 
GitLab


From 5d44932cda0e88537eb2526c7a420ee4ba320619 Mon Sep 17 00:00:00 2001
From: "William D. Irons" <wdirons@us.ibm.com>
Date: Mon, 4 Jun 2018 00:42:12 -0500
Subject: [PATCH 236/610] fix iris example to work with python3 (#19335)

iris.py did not work with python3 as urllib.urlopen is not in python3.
Switched to urlretrieve from six. Same was done in:
tensorflow/examples/image_retraining/retrain.py
---
 tensorflow/examples/learn/iris.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/tensorflow/examples/learn/iris.py b/tensorflow/examples/learn/iris.py
index 03e60972aa..86f5204ec3 100644
--- a/tensorflow/examples/learn/iris.py
+++ b/tensorflow/examples/learn/iris.py
@@ -21,7 +21,8 @@ from __future__ import division
 from __future__ import print_function
 
 import os
-import urllib
+
+from six.moves.urllib.request import urlretrieve
 
 import tensorflow as tf
 
@@ -38,9 +39,7 @@ FEATURE_KEYS = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
 def maybe_download_iris_data(file_name, download_url):
   """Downloads the file and returns the number of data."""
   if not os.path.exists(file_name):
-    raw = urllib.urlopen(download_url).read()
-    with open(file_name, 'w') as f:
-      f.write(raw)
+    urlretrieve(download_url, file_name)
 
   # The first line is a comma-separated string. The first one is the number of
   # total data in the file.
-- 
GitLab


From 869dc9165e9d58c6a6f49c2ff54a837346fa9b1d Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 4 Jun 2018 01:07:18 -0700
Subject: [PATCH 237/610] Add debug output to CHECK for compatible shapes of
 multi-output fusions.

PiperOrigin-RevId: 199091580
---
 tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc b/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc
index 0728ccfff7..dc2934a34c 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/loop_emitter.cc
@@ -83,7 +83,9 @@ LoopEmitter::LoopEmitter(const ElementGenerator& target_element_generator,
   // Sanity check: In multi-output fusion, all shapes produced must have the
   // same dimensions.
   for (const IrArray& array : target_arrays) {
-    CHECK(ShapeUtil::SameDimensions(shape_, array.GetShape()));
+    CHECK(ShapeUtil::SameDimensions(shape_, array.GetShape()))
+        << ": '" << shape_.ShortDebugString() << "' does not match '"
+        << array.GetShape().ShortDebugString() << "'";
   }
 }
 
-- 
GitLab


From 5b498d5d759aa0545990e20778884b465eeb1ad3 Mon Sep 17 00:00:00 2001
From: Benjamin Kramer <kramerb@google.com>
Date: Mon, 4 Jun 2018 03:57:01 -0700
Subject: [PATCH 238/610] [XLA] Remove unnecessary std::vector copies

We can just pass along the original ArraySlice.

PiperOrigin-RevId: 199109815
---
 .../compiler/xla/service/llvm_ir/llvm_util.cc      | 14 +++-----------
 1 file changed, 3 insertions(+), 11 deletions(-)

diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc
index bd45f83fb1..ff64da87e9 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc
@@ -87,18 +87,10 @@ llvm::Value* EmitCallToIntrinsic(
     tensorflow::gtl::ArraySlice<llvm::Value*> operands,
     tensorflow::gtl::ArraySlice<llvm::Type*> overloaded_types,
     llvm::IRBuilder<>* ir_builder) {
-  std::vector<llvm::Type*> types;
-  for (auto type : overloaded_types) {
-    types.push_back(type);
-  }
   llvm::Module* module = ModuleFromIRBuilder(ir_builder);
-  llvm::Function* intrinsic =
-      llvm::Intrinsic::getDeclaration(module, intrinsic_id, types);
-  std::vector<llvm::Value*> operands_vec;
-  for (auto operand : operands) {
-    operands_vec.push_back(operand);
-  }
-  return ir_builder->CreateCall(intrinsic, operands_vec);
+  llvm::Function* intrinsic = llvm::Intrinsic::getDeclaration(
+      module, intrinsic_id, AsArrayRef(overloaded_types));
+  return ir_builder->CreateCall(intrinsic, AsArrayRef(operands));
 }
 
 llvm::Value* EmitFloatMax(llvm::Value* lhs_value, llvm::Value* rhs_value,
-- 
GitLab


From 92415c09b8d00f200429e994b08e302f4ca85e67 Mon Sep 17 00:00:00 2001
From: Vikram Tankasali <tvikram@google.com>
Date: Mon, 4 Jun 2018 05:40:33 -0700
Subject: [PATCH 239/610] Update README.md for tf.contrib.kfac and add
 deprecation warning.

PiperOrigin-RevId: 199119904
---
 tensorflow/contrib/kfac/README.md               | 5 +++++
 tensorflow/contrib/kfac/python/ops/optimizer.py | 6 ++++++
 2 files changed, 11 insertions(+)

diff --git a/tensorflow/contrib/kfac/README.md b/tensorflow/contrib/kfac/README.md
index 762a2f0b57..102626925d 100644
--- a/tensorflow/contrib/kfac/README.md
+++ b/tensorflow/contrib/kfac/README.md
@@ -1,5 +1,10 @@
 # K-FAC: Kronecker-Factored Approximate Curvature
 
+# <font color="red", size=10><u>WARNING: </u></font>
+# ==third_party/tensorflow/contrib/kfac is deprecated. This will be==
+# ==removed on 15-07-2018. <!-- STY:begin_strip_and_replace -->Please import third_party/tensorflow_kfac.==
+# ==<!-- STY:end_strip_and_replace Please check https://github.com/tensorflow/kfac. -->==
+
 **K-FAC in TensorFlow** is an implementation of [K-FAC][kfac-paper], an
 approximate second-order optimization method, in TensorFlow. When applied to
 feedforward and convolutional neural networks, K-FAC can converge `>3.5x`
diff --git a/tensorflow/contrib/kfac/python/ops/optimizer.py b/tensorflow/contrib/kfac/python/ops/optimizer.py
index b7f63d8d94..03b9da7933 100644
--- a/tensorflow/contrib/kfac/python/ops/optimizer.py
+++ b/tensorflow/contrib/kfac/python/ops/optimizer.py
@@ -18,6 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import warnings
+
 # pylint disable=long-line
 from tensorflow.contrib.kfac.python.ops import curvature_matrix_vector_products as cmvp
 from tensorflow.contrib.kfac.python.ops import estimator as est
@@ -107,6 +109,10 @@ class KfacOptimizer(gradient_descent.GradientDescentOptimizer):
       ValueError: If momentum is non-zero and momentum_type is not 'regular'
           or 'adam'.
     """
+    warnings.warn(
+        "third_party.tensorflow.contrib.kfac is deprecated."
+        "This will be removed on 15-07-2018. Check README for further details.",
+        DeprecationWarning)
     # Parameters to be passed to the Fisher estimator:
     self._variables = var_list or tf_variables.trainable_variables
     self._cov_ema_decay = cov_ema_decay
-- 
GitLab


From 256ef4232d6551c2d1099eb2b932737e83f33f77 Mon Sep 17 00:00:00 2001
From: Tom Hennigan <tomhennigan@google.com>
Date: Mon, 4 Jun 2018 06:47:07 -0700
Subject: [PATCH 240/610] Add stored eager variables to graph collections.

PiperOrigin-RevId: 199125920
---
 tensorflow/python/framework/ops.py            | 17 +++---------
 .../kernel_tests/variable_scope_test.py       | 26 +++++++++++++++++++
 .../python/ops/resource_variable_ops.py       |  3 +++
 tensorflow/python/ops/variable_scope.py       | 10 ++++++-
 4 files changed, 42 insertions(+), 14 deletions(-)

diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py
index 6f3bb5563b..eceea5276a 100644
--- a/tensorflow/python/framework/ops.py
+++ b/tensorflow/python/framework/ops.py
@@ -3882,7 +3882,6 @@ class Graph(object):
         contains many standard names for collections.
       value: The value to add to the collection.
     """  # pylint: disable=g-doc-exception
-    _assert_collection_is_ok(name)
     self._check_not_finalized()
     with self._lock:
       if name not in self._collections:
@@ -3929,7 +3928,6 @@ class Graph(object):
       The list of values in the collection with the given `name`, or an empty
       list if no value has been added to that collection.
     """  # pylint: disable=g-doc-exception
-    _assert_collection_is_ok(name)
     with self._lock:
       coll_list = self._collections.get(name, None)
       if coll_list is None:
@@ -3959,7 +3957,6 @@ class Graph(object):
       list contains the values in the order under which they were
       collected.
     """  # pylint: disable=g-doc-exception
-    _assert_collection_is_ok(name)
     with self._lock:
       collection = self._collections.get(name, None)
       if collection is None:
@@ -5822,7 +5819,8 @@ def add_to_collection(name, value):
     value: The value to add to the collection.
 
   @compatibility(eager)
-  Collections are not supported when eager execution is enabled.
+  Collections are only supported in eager when variables are created inside an
+  EagerVariableStore (e.g. as part of a layer or template).
   @end_compatibility
   """
   get_default_graph().add_to_collection(name, value)
@@ -5840,7 +5838,8 @@ def add_to_collections(names, value):
     value: The value to add to the collections.
 
   @compatibility(eager)
-  Collections are not supported when eager execution is enabled.
+  Collections are only supported in eager when variables are created inside an
+  EagerVariableStore (e.g. as part of a layer or template).
   @end_compatibility
   """
   get_default_graph().add_to_collections(names, value)
@@ -6133,14 +6132,6 @@ def get_from_proto_function(collection_name):
     return None
 
 
-def _assert_collection_is_ok(collection_name):
-  if context.executing_eagerly():
-    if collection_name in GraphKeys._VARIABLE_COLLECTIONS:  # pylint: disable=protected-access
-      raise ValueError(
-          "variable collections are not supported when eager execution is enabled."
-      )
-
-
 def _operation_conversion_error(op, dtype=None, name=None, as_ref=False):
   """Produce a nice error if someone converts an Operation to a Tensor."""
   raise TypeError(("Can't convert Operation '%s' to Tensor "
diff --git a/tensorflow/python/kernel_tests/variable_scope_test.py b/tensorflow/python/kernel_tests/variable_scope_test.py
index 9dc4ec0f96..2ee53df931 100644
--- a/tensorflow/python/kernel_tests/variable_scope_test.py
+++ b/tensorflow/python/kernel_tests/variable_scope_test.py
@@ -197,6 +197,32 @@ class VariableScopeTest(test.TestCase):
         self.assertAllEqual([v1, v2], [v3, v4])
       f()
 
+  @test_util.run_in_graph_and_eager_modes()
+  def testEagerVariablesStoreAddsToCollections(self):
+    store = variable_scope.EagerVariableStore()
+    with store.as_default():
+      trainable = variable_scope.get_variable("v1", [], trainable=True)
+      not_trainable = variable_scope.get_variable("v2", [], trainable=False)
+      concat = variable_scope.get_variable(
+          "v3", [], collections=[ops.GraphKeys.CONCATENATED_VARIABLES])
+      self.assertEqual(
+          ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES),
+          [trainable, not_trainable])
+      self.assertEqual(
+          ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES),
+          [trainable, concat])
+      self.assertEqual(
+          ops.get_collection(ops.GraphKeys.CONCATENATED_VARIABLES), [concat])
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testEagerVariablesOutsideStoreNotAddedToCollections(self):
+    if not context.executing_eagerly():
+      return
+    variable_scope.get_variable("v1", [], trainable=True)
+    variable_scope.get_variable("v2", [], trainable=False)
+    self.assertFalse(ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES))
+    self.assertFalse(ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES))
+
   @test_util.run_in_graph_and_eager_modes()
   def testInitFromNonTensorValue(self):
     v = variable_scope.get_variable("v4", initializer=4, dtype=dtypes.int32)
diff --git a/tensorflow/python/ops/resource_variable_ops.py b/tensorflow/python/ops/resource_variable_ops.py
index 7061b32808..c137bfacb2 100644
--- a/tensorflow/python/ops/resource_variable_ops.py
+++ b/tensorflow/python/ops/resource_variable_ops.py
@@ -507,6 +507,9 @@ class ResourceVariable(variables.Variable):
           else:
             self._cached_value = None
         if not context.executing_eagerly():
+          # Eager variables are only added to collections if they are part of an
+          # eager variable store (otherwise in an interactive session they would
+          # hog memory and cause OOM). This is done in ops/variable_scope.py.
           ops.add_to_collections(collections, self)
         elif ops.GraphKeys.GLOBAL_STEP in collections:
           ops.add_to_collections(ops.GraphKeys.GLOBAL_STEP, self)
diff --git a/tensorflow/python/ops/variable_scope.py b/tensorflow/python/ops/variable_scope.py
index fa34774622..23234e2e61 100644
--- a/tensorflow/python/ops/variable_scope.py
+++ b/tensorflow/python/ops/variable_scope.py
@@ -1,4 +1,4 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+ # Copyright 2015 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -794,6 +794,14 @@ class _VariableStore(object):
         validate_shape=validate_shape,
         constraint=constraint,
         use_resource=use_resource)
+    if context.executing_eagerly() and self._store_eager_variables:
+      if collections:
+        ops.add_to_collections(collections, v)
+      else:
+        ops.add_to_collection(ops.GraphKeys.GLOBAL_VARIABLES, v)
+      if trainable:
+        ops.add_to_collection(ops.GraphKeys.TRAINABLE_VARIABLES, v)
+
     if not context.executing_eagerly() or self._store_eager_variables:
       # In eager mode we do not want to keep default references to Variable
       # objects as this will prevent their memory from being released.
-- 
GitLab


From edd936e4ea1bd9f1f9ee05af92efc3bae5f1515a Mon Sep 17 00:00:00 2001
From: Dan Moldovan <mdan@google.com>
Date: Mon, 4 Jun 2018 07:43:19 -0700
Subject: [PATCH 241/610] Temporary patch: properly handle expressions in
 subscripts. The long term fix is either of: (a) dropping support for tracking
 specific slices of a symbol (b) track slices along with the symbols on which
 they depend.

Background:
So far we tracked symbols like `a[b]` and allowed conversions of the kind `if <cond>: a[b] = c` -> `a[b] = ag__.if_stmt(<cond>, lambda: c, lambda: a[b])`. That construct allowed `a` to be anything, including e.g. Python lists, objects, etc.
This is incomplete and will in the future become obsolete as we override the slice operator. In effect the statement above will be converted to `a = ag__.if_stmt(<cond>, lambda: ag__.set_item(a, b, c), lambda: a)`. However, this latter form does not support objects, so there is a tradeoff.
PiperOrigin-RevId: 199131573
---
 tensorflow/contrib/autograph/pyct/qual_names.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/autograph/pyct/qual_names.py b/tensorflow/contrib/autograph/pyct/qual_names.py
index 583cf7ecd7..da07013cf4 100644
--- a/tensorflow/contrib/autograph/pyct/qual_names.py
+++ b/tensorflow/contrib/autograph/pyct/qual_names.py
@@ -205,6 +205,7 @@ class QnResolver(gast.NodeTransformer):
     return node
 
   def visit_Subscript(self, node):
+    # TODO(mdan): This may no longer apply if we overload getitem.
     node = self.generic_visit(node)
     s = node.slice
     if not isinstance(s, gast.Index):
@@ -216,7 +217,11 @@ class QnResolver(gast.NodeTransformer):
     elif isinstance(s.value, gast.Str):
       subscript = QN(StringLiteral(s.value.s))
     else:
-      subscript = anno.getanno(node.slice.value, anno.Basic.QN)
+      # The index may be an expression, case in which a name doesn't make sense.
+      if anno.hasanno(node.slice.value, anno.Basic.QN):
+        subscript = anno.getanno(node.slice.value, anno.Basic.QN)
+      else:
+        return node
     if anno.hasanno(node.value, anno.Basic.QN):
       anno.setanno(node, anno.Basic.QN,
                    QN(anno.getanno(node.value, anno.Basic.QN),
-- 
GitLab


From 01c4773f435c556712c5465792f2936b5c762a1e Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 4 Jun 2018 07:52:01 -0700
Subject: [PATCH 242/610] [XLA:GPU] Add error message to CHECK for
 preconditions to lower fusions with multiple reduce outputs.

PiperOrigin-RevId: 199132442
---
 tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
index 0f5c003341..b40b557cab 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
@@ -2443,8 +2443,11 @@ StatusOr<std::unique_ptr<Thunk>> IrEmitterUnnested::BuildInitializerThunk(
       case HloOpcode::kReduce:
         return inst->operand(1);
       case HloOpcode::kTuple:
-        CHECK(hlo->IsMultiOutputFusion() &&
-              inst->operand(index.back())->opcode() == HloOpcode::kReduce);
+        CHECK(hlo->IsMultiOutputFusion())
+            << ": " << hlo->ToString() << " is not a multi-output fusion.";
+        CHECK(inst->operand(index.back())->opcode() == HloOpcode::kReduce)
+            << ": Found '" << inst->operand(index.back())->opcode() << "' in "
+            << inst->ToString() << " but expected 'reduce'.";
         // For multi-output fusion look through the tuple.
         return inst->operand(index.back())->operand(1);
       default:
-- 
GitLab


From 1b4336cd5ab851404d18976169d396247ec40f10 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 4 Jun 2018 08:12:37 -0700
Subject: [PATCH 243/610] Add LRN as unchanged rf layer operations for the
 receptive field calculator.

PiperOrigin-RevId: 199134753
---
 .../receptive_field/python/util/parse_layer_parameters.py       | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/receptive_field/python/util/parse_layer_parameters.py b/tensorflow/contrib/receptive_field/python/util/parse_layer_parameters.py
index bc383a8034..0e3c46f17d 100644
--- a/tensorflow/contrib/receptive_field/python/util/parse_layer_parameters.py
+++ b/tensorflow/contrib/receptive_field/python/util/parse_layer_parameters.py
@@ -27,7 +27,7 @@ from tensorflow.python.platform import tf_logging as logging
 _UNCHANGED_RF_LAYER_OPS = [
     "Add", "BiasAdd", "Cast", "Ceil", "ConcatV2", "Const", "Floor",
     "FusedBatchNorm", "Identity", "Log", "Mul", "Pow", "RealDiv", "Relu",
-    "Relu6", "Round", "Rsqrt", "Softplus", "Sub", "VariableV2"
+    "Relu6", "Round", "Rsqrt", "Softplus", "Sub", "VariableV2", "LRN"
 ]
 
 # Different ways in which padding modes may be spelled.
-- 
GitLab


From 1a9f69583876c50c98fc3ccd9ded1f81731a9492 Mon Sep 17 00:00:00 2001
From: Peter Hawkins <phawkins@google.com>
Date: Mon, 4 Jun 2018 09:00:06 -0700
Subject: [PATCH 244/610] Disable flaky test
 tensorflow/contrib/distribute/python:minimize_loss_test_gpu from continuous
 builds.

PiperOrigin-RevId: 199140117
---
 tensorflow/contrib/distribute/python/BUILD | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/contrib/distribute/python/BUILD b/tensorflow/contrib/distribute/python/BUILD
index 3118deaa47..a91c54153f 100644
--- a/tensorflow/contrib/distribute/python/BUILD
+++ b/tensorflow/contrib/distribute/python/BUILD
@@ -311,6 +311,7 @@ cuda_py_test(
     tags = [
         "multi_and_single_gpu",
         "no_pip",
+        "noguitar",  # TODO(b/109653107): test is flaky.
     ],
 )
 
-- 
GitLab


From 33c84aa99fab76ddce7e0a8a5420e8cd63cd2a76 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Mon, 4 Jun 2018 16:04:12 +0000
Subject: [PATCH 245/610] Expose `tf.broadcast_to` op

This fix is a follow up of 15243 to expose `tf.broadcast_to`.
Previously the op was exposed as `tf.contrib.framework.broadcast_to`.
This fix unhides the BroadcastTo op so that it is exposed as `tf.broadcast_to`.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/core/api_def/python_api/api_def_BroadcastTo.pbtxt | 4 ----
 1 file changed, 4 deletions(-)
 delete mode 100644 tensorflow/core/api_def/python_api/api_def_BroadcastTo.pbtxt

diff --git a/tensorflow/core/api_def/python_api/api_def_BroadcastTo.pbtxt b/tensorflow/core/api_def/python_api/api_def_BroadcastTo.pbtxt
deleted file mode 100644
index 083eeced81..0000000000
--- a/tensorflow/core/api_def/python_api/api_def_BroadcastTo.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "BroadcastTo"
-  visibility: HIDDEN
-}
-- 
GitLab


From af3c646a03033db3074b5d6f6f40d2ead430a53d Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Mon, 4 Jun 2018 16:06:19 +0000
Subject: [PATCH 246/610] Remove exposure of tf.contrib.framework.broadcast_to

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/contrib/framework/__init__.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tensorflow/contrib/framework/__init__.py b/tensorflow/contrib/framework/__init__.py
index 10d1ecc738..dc49383c5c 100644
--- a/tensorflow/contrib/framework/__init__.py
+++ b/tensorflow/contrib/framework/__init__.py
@@ -119,14 +119,13 @@ from tensorflow.python.framework.smart_cond import smart_cond
 from tensorflow.python.framework.smart_cond import smart_constant_value
 from tensorflow.python.framework.tensor_spec import BoundedTensorSpec
 from tensorflow.python.framework.tensor_spec import TensorSpec
-from tensorflow.python.ops.array_ops import broadcast_to
 from tensorflow.python.ops.init_ops import convolutional_delta_orthogonal
 from tensorflow.python.ops.init_ops import convolutional_orthogonal_1d
 from tensorflow.python.ops.init_ops import convolutional_orthogonal_2d
 from tensorflow.python.ops.init_ops import convolutional_orthogonal_3d
 from tensorflow.python.util.all_util import remove_undocumented
 
-_allowed_symbols = ['nest', 'broadcast_to']
+_allowed_symbols = ['nest']
 _nest_allowed_symbols = [
     'assert_same_structure',
     'is_sequence',
-- 
GitLab


From a1e24ebca75ff21188c131f28952401d9708dd5e Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 4 Jun 2018 09:00:08 -0700
Subject: [PATCH 247/610] Internal change

PiperOrigin-RevId: 199140124
---
 tensorflow/core/kernels/resize_area_op_test.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tensorflow/core/kernels/resize_area_op_test.cc b/tensorflow/core/kernels/resize_area_op_test.cc
index a7e06ef15a..84ff090b54 100644
--- a/tensorflow/core/kernels/resize_area_op_test.cc
+++ b/tensorflow/core/kernels/resize_area_op_test.cc
@@ -124,7 +124,8 @@ class ResizeAreaOpTest : public OpsTestBase {
                                   ? (j + 1 > in_x1 ? width_scale : j + 1 - in_x)
                                   : (j + 1 > in_x1 ? in_x1 - j : 1.0);
               for (int64 c = 0; c < channels; ++c) {
-#define BOUND(val, limit) std::min(((limit)-1ll), (std::max(0ll, (val))))
+#define BOUND(val, limit) \
+  std::min(((limit)-int64{1}), (std::max(int64{0}, (val))))
                 sum_data(c) +=
                     static_cast<float>(input_data(b, BOUND(i, in_height),
                                                   BOUND(j, in_width), c)) *
-- 
GitLab


From 736e8fa3b83ca801af64c1bbc8afabdf8a00436b Mon Sep 17 00:00:00 2001
From: Peter Hawkins <phawkins@google.com>
Date: Mon, 4 Jun 2018 09:09:32 -0700
Subject: [PATCH 248/610] Enable cross-device dependency grouping optimization
 in non-AGGRESSIVE modes.

PiperOrigin-RevId: 199141605
---
 .../optimizers/dependency_optimizer.cc        | 24 +++++++++++--------
 .../optimizers/dependency_optimizer_test.cc   |  2 +-
 2 files changed, 15 insertions(+), 11 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/dependency_optimizer.cc b/tensorflow/core/grappler/optimizers/dependency_optimizer.cc
index fb2aea3b3d..78a6d0d835 100644
--- a/tensorflow/core/grappler/optimizers/dependency_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/dependency_optimizer.cc
@@ -581,7 +581,8 @@ void DependencyOptimizer::GroupCrossDeviceControlEdges() {
     for (int j = 0; j < node->input_size(); ++j) {
       if (IsControlInput(node->input(j))) {
         const NodeDef* input = node_map_->GetNode(node->input(j));
-        if (!input->device().empty() && input->device() != node->device()) {
+        if (input != nullptr && !input->device().empty() &&
+            input->device() != node->device()) {
           auto emplace_result = noops.emplace(input->device(), nullptr);
           if (!emplace_result.second &&
               emplace_result.first->second == nullptr) {
@@ -615,14 +616,19 @@ void DependencyOptimizer::GroupCrossDeviceControlEdges() {
       const string& input_name = node->input(pos);
       if (IsControlInput(input_name)) {
         NodeDef* input = node_map_->GetNode(input_name);
-        auto it = noops.find(input->device());
-        if (it == noops.end() || it->second == nullptr) {
+        if (input == nullptr) {
           ++pos;
         } else {
-          node->mutable_input()->SwapElements(pos, node->input_size() - 1);
-          node->mutable_input()->RemoveLast();
-          it->second->add_input(AsControlDependency(*input));
-          node_map_->UpdateOutput(input_name, node->name(), it->second->name());
+          auto it = noops.find(input->device());
+          if (it == noops.end() || it->second == nullptr) {
+            ++pos;
+          } else {
+            node->mutable_input()->SwapElements(pos, node->input_size() - 1);
+            node->mutable_input()->RemoveLast();
+            it->second->add_input(AsControlDependency(*input));
+            node_map_->UpdateOutput(input_name, node->name(),
+                                    it->second->name());
+          }
         }
       } else {
         ++pos;
@@ -669,9 +675,7 @@ Status DependencyOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
     // Dedup control inputs.
     CleanControlInputs();
 
-    if (opt_level_ == RewriterConfig::AGGRESSIVE) {
-      GroupCrossDeviceControlEdges();
-    }
+    GroupCrossDeviceControlEdges();
   }
 
   return Status::OK();
diff --git a/tensorflow/core/grappler/optimizers/dependency_optimizer_test.cc b/tensorflow/core/grappler/optimizers/dependency_optimizer_test.cc
index 931d073cd3..0ae3b4ec34 100644
--- a/tensorflow/core/grappler/optimizers/dependency_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/dependency_optimizer_test.cc
@@ -774,7 +774,7 @@ TEST_F(DependencyOptimizerTest, GroupCrossDeviceControlDeps) {
     TF_CHECK_OK(s.ToGraphDef(&expected));
   }
 
-  DependencyOptimizer optimizer(RewriterConfig::AGGRESSIVE);
+  DependencyOptimizer optimizer;
   GraphDef output;
   TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
   CompareGraphs(expected, output);
-- 
GitLab


From 077612963303c428a1effb9a8791537c131308c3 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 4 Jun 2018 09:14:49 -0700
Subject: [PATCH 249/610] Update the distributed SDCA test.

PiperOrigin-RevId: 199142338
---
 .../python/kernel_tests/sdca_ops_test.py      | 47 +++++++++++--------
 1 file changed, 27 insertions(+), 20 deletions(-)

diff --git a/tensorflow/contrib/linear_optimizer/python/kernel_tests/sdca_ops_test.py b/tensorflow/contrib/linear_optimizer/python/kernel_tests/sdca_ops_test.py
index d0c32b43cc..ef0e08a777 100644
--- a/tensorflow/contrib/linear_optimizer/python/kernel_tests/sdca_ops_test.py
+++ b/tensorflow/contrib/linear_optimizer/python/kernel_tests/sdca_ops_test.py
@@ -377,7 +377,10 @@ class SdcaWithLogisticLossTest(SdcaModelTest):
         train_op.run()
 
   def testDistributedSimple(self):
-    # Setup test data
+    # Distributed SDCA may not converge if the workers update concurrently the
+    # same example. In this test the examples are partitioned across workers.
+    # The examples are the same for all workers, just the example_ids are
+    # different.
     example_protos = [
         make_example_proto({
             'age': [0],
@@ -389,13 +392,19 @@ class SdcaWithLogisticLossTest(SdcaModelTest):
         }, 1),
     ]
     example_weights = [1.0, 1.0]
+    examples = make_example_dict(example_protos, example_weights)
+    example_ids = array_ops.placeholder(
+        dtypes.string, shape=(len(example_weights),))
+    examples['example_ids'] = example_ids
+    variables = make_variable_dict(1, 1)
     for num_shards in _SHARD_NUMBERS:
       for num_loss_partitions in _NUM_LOSS_PARTITIONS:
         with self._single_threaded_test_session():
-          examples = make_example_dict(example_protos, example_weights)
-          variables = make_variable_dict(1, 1)
           options = dict(
-              symmetric_l2_regularization=1,
+              # Keep the same solution as for TestSimple: since the number of
+              # examples is multplied by num_loss_partitions, multiply also
+              # L2 by the same value.
+              symmetric_l2_regularization=num_loss_partitions,
               symmetric_l1_regularization=0,
               loss_type='logistic_loss',
               num_table_shards=num_shards,
@@ -411,32 +420,30 @@ class SdcaWithLogisticLossTest(SdcaModelTest):
 
           train_op = lr.minimize()
 
-          def minimize():
+          def minimize(worker_id):
             with self._single_threaded_test_session():
+              feed_dict = {example_ids: [
+                  str(i + worker_id*len(example_weights)) for i in range(
+                      len(example_weights))]}
               for _ in range(_MAX_ITERATIONS):
-                train_op.run()  # pylint: disable=cell-var-from-loop
+                train_op.run(feed_dict=feed_dict)  # pylint: disable=cell-var-from-loop
 
           threads = []
-          for _ in range(num_loss_partitions):
-            threads.append(threading.Thread(target=minimize))
+          for worker_id in range(num_loss_partitions):
+            threads.append(threading.Thread(target=minimize, args=(worker_id,)))
             threads[-1].start()
 
           for t in threads:
             t.join()
-          lr.update_weights(train_op).run()
-
-          # The high tolerance in unregularized_loss comparisons is due to the
-          # fact that it's possible to trade off unregularized_loss vs.
-          # regularization and still have a sum that is quite close to the
-          # optimal regularized_loss value.  SDCA's duality gap only ensures
-          # that the regularized_loss is within 0.01 of optimal.
-          # 0.525457 is the optimal regularized_loss.
-          # 0.411608 is the unregularized_loss at that optimum.
-          self.assertAllClose(0.411608, unregularized_loss.eval(), atol=0.05)
-          self.assertAllClose(0.525457, loss.eval(), atol=0.01)
+          lr.update_weights(train_op).run(feed_dict={
+              example_ids: [str(i) for i in range(len(example_weights))]})
+
+          # Test only the unregularized loss because the optimal value of the
+          # regularized loss depends on num_loss_partitions.
+          self.assertAllClose(0.411608, unregularized_loss.eval(), atol=0.02)
           predicted_labels = get_binary_predictions_for_logistic(predictions)
           self.assertAllEqual([0, 1], predicted_labels.eval())
-          self.assertTrue(lr.approximate_duality_gap().eval() < 0.02)
+          self.assertNear(0.0, lr.approximate_duality_gap().eval(), 0.02)
 
   def testSimpleNoL2(self):
     # Same as test above (so comments from above apply) but without an L2.
-- 
GitLab


From 52f3f70b8bd6953e3f2437289ac078d5a1f439d0 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 4 Jun 2018 09:39:17 -0700
Subject: [PATCH 250/610] Build TF on Windows with --config=opt

--config=opt will enable /arch:AVX cc option on Windows

-c opt is already specified in tools/bazel.rc, so it's OK to remove it here

PiperOrigin-RevId: 199145562
---
 tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh b/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
index 73520bb2ac..1b1c3815d8 100644
--- a/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
+++ b/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
@@ -77,7 +77,7 @@ echo "import %workspace%/${TMP_BAZELRC}" >> .bazelrc
 
 run_configure_for_cpu_build
 
-bazel build --announce_rc -c opt tensorflow/tools/pip_package:build_pip_package || exit $?
+bazel build --announce_rc --config=opt tensorflow/tools/pip_package:build_pip_package || exit $?
 
 if [[ "$skip_test" == 1 ]]; then
   exit 0
@@ -98,7 +98,7 @@ N_JOBS="${NUMBER_OF_PROCESSORS}"
 
 # Define no_tensorflow_py_deps=true so that every py_test has no deps anymore,
 # which will result testing system installed tensorflow
-bazel test -c opt -k --test_output=errors \
+bazel test --announce_rc --config=opt -k --test_output=errors \
   --define=no_tensorflow_py_deps=true --test_lang_filters=py \
   --test_tag_filters=-no_pip,-no_windows,-no_oss \
   --build_tag_filters=-no_pip,-no_windows,-no_oss --build_tests_only \
-- 
GitLab


From dc14f35972c8757ab65cdb54f0797e548fe3a579 Mon Sep 17 00:00:00 2001
From: mrTsjolder <mrtsjolder@gmail.com>
Date: Mon, 4 Jun 2018 18:42:33 +0200
Subject: [PATCH 251/610] Fix variance initialisers (#18854)

* Fix std in variance_scaling initialiser

* style improvement variance fix

* clean up (own) tests

* revert irrelevant changes to tests

* fix keras initializers_test
---
 tensorflow/python/keras/initializers_test.py  | 26 +++++++++---------
 .../python/kernel_tests/init_ops_test.py      | 27 +++++++++++++++++++
 tensorflow/python/ops/init_ops.py             |  3 ++-
 3 files changed, 42 insertions(+), 14 deletions(-)

diff --git a/tensorflow/python/keras/initializers_test.py b/tensorflow/python/keras/initializers_test.py
index a54d6da839..c519e194bd 100644
--- a/tensorflow/python/keras/initializers_test.py
+++ b/tensorflow/python/keras/initializers_test.py
@@ -71,7 +71,7 @@ class KerasInitializersTest(test.TestCase):
                                                       stddev=1,
                                                       seed=126),
                    tensor_shape,
-                   target_mean=0., target_std=None, target_max=2)
+                   target_mean=0., target_max=2, target_min=-2)
 
   def test_constant(self):
     tensor_shape = (5, 6, 4)
@@ -83,49 +83,49 @@ class KerasInitializersTest(test.TestCase):
     tensor_shape = (5, 6, 4, 2)
     with self.test_session():
       fan_in, _ = init_ops._compute_fans(tensor_shape)
-      scale = np.sqrt(3. / fan_in)
+      std = np.sqrt(1. / fan_in)
       self._runner(keras.initializers.lecun_uniform(seed=123), tensor_shape,
-                   target_mean=0., target_max=scale, target_min=-scale)
+                   target_mean=0., target_std=std)
 
   def test_glorot_uniform(self):
     tensor_shape = (5, 6, 4, 2)
     with self.test_session():
       fan_in, fan_out = init_ops._compute_fans(tensor_shape)
-      scale = np.sqrt(6. / (fan_in + fan_out))
+      std = np.sqrt(2. / (fan_in + fan_out))
       self._runner(keras.initializers.glorot_uniform(seed=123), tensor_shape,
-                   target_mean=0., target_max=scale, target_min=-scale)
+                   target_mean=0., target_std=std)
 
   def test_he_uniform(self):
     tensor_shape = (5, 6, 4, 2)
     with self.test_session():
       fan_in, _ = init_ops._compute_fans(tensor_shape)
-      scale = np.sqrt(6. / fan_in)
+      std = np.sqrt(2. / fan_in)
       self._runner(keras.initializers.he_uniform(seed=123), tensor_shape,
-                   target_mean=0., target_max=scale, target_min=-scale)
+                   target_mean=0., target_std=std)
 
   def test_lecun_normal(self):
     tensor_shape = (5, 6, 4, 2)
     with self.test_session():
       fan_in, _ = init_ops._compute_fans(tensor_shape)
-      scale = np.sqrt(1. / fan_in)
+      std = np.sqrt(1. / fan_in)
       self._runner(keras.initializers.lecun_normal(seed=123), tensor_shape,
-                   target_mean=0., target_std=None, target_max=2 * scale)
+                   target_mean=0., target_std=std)
 
   def test_glorot_normal(self):
     tensor_shape = (5, 6, 4, 2)
     with self.test_session():
       fan_in, fan_out = init_ops._compute_fans(tensor_shape)
-      scale = np.sqrt(2. / (fan_in + fan_out))
+      std = np.sqrt(2. / (fan_in + fan_out))
       self._runner(keras.initializers.glorot_normal(seed=123), tensor_shape,
-                   target_mean=0., target_std=None, target_max=2 * scale)
+                   target_mean=0., target_std=std)
 
   def test_he_normal(self):
     tensor_shape = (5, 6, 4, 2)
     with self.test_session():
       fan_in, _ = init_ops._compute_fans(tensor_shape)
-      scale = np.sqrt(2. / fan_in)
+      std = np.sqrt(2. / fan_in)
       self._runner(keras.initializers.he_normal(seed=123), tensor_shape,
-                   target_mean=0., target_std=None, target_max=2 * scale)
+                   target_mean=0., target_std=std)
 
   def test_orthogonal(self):
     tensor_shape = (20, 20)
diff --git a/tensorflow/python/kernel_tests/init_ops_test.py b/tensorflow/python/kernel_tests/init_ops_test.py
index a9b55854f1..795aa67248 100644
--- a/tensorflow/python/kernel_tests/init_ops_test.py
+++ b/tensorflow/python/kernel_tests/init_ops_test.py
@@ -362,6 +362,33 @@ class UniformUnitScalingInitializationTest(test.TestCase):
         dtype=dtypes.string)
 
 
+class VarianceScalingInitializationTest(test.TestCase):
+
+  def testNormalDistribution(self):
+    shape = [100, 100]
+    expect_mean = 0.
+    expect_var = 1. / shape[0]
+    init = init_ops.variance_scaling_initializer(distribution='normal')
+
+    with self.test_session(use_gpu=True):
+      x = init(shape).eval()
+
+    self.assertNear(np.mean(x), expect_mean, err=1e-2)
+    self.assertNear(np.var(x), expect_var, err=1e-2)
+
+  def testUniformDistribution(self):
+    shape = [100, 100]
+    expect_mean = 0.
+    expect_var = 1. / shape[0]
+    init = init_ops.variance_scaling_initializer(distribution='uniform')
+
+    with self.test_session(use_gpu=True):
+      x = init(shape).eval()
+
+    self.assertNear(np.mean(x), expect_mean, err=1e-2)
+    self.assertNear(np.var(x), expect_var, err=1e-2)
+
+
 # TODO(vrv): move to sequence_ops_test?
 class RangeTest(test.TestCase):
 
diff --git a/tensorflow/python/ops/init_ops.py b/tensorflow/python/ops/init_ops.py
index 1f8d8dc4f3..055d42815c 100644
--- a/tensorflow/python/ops/init_ops.py
+++ b/tensorflow/python/ops/init_ops.py
@@ -463,7 +463,8 @@ class VarianceScaling(Initializer):
     else:
       scale /= max(1., (fan_in + fan_out) / 2.)
     if self.distribution == "normal":
-      stddev = math.sqrt(scale)
+      # constant taken from scipy.stats.truncnorm.std(a=-2, b=2, loc=0., scale=1.)
+      stddev = math.sqrt(scale) / .87962566103423978
       return random_ops.truncated_normal(
           shape, 0.0, stddev, dtype, seed=self.seed)
     else:
-- 
GitLab


From 301e800623b3a463267c09e8be43972af609d710 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20Branchaud-Charron?=
 <frederic.branchaud-charron@usherbrooke.ca>
Date: Mon, 4 Jun 2018 12:42:48 -0400
Subject: [PATCH 252/610] Add globs from Lambda before calling it (#18926)

---
 tensorflow/python/estimator/keras_test.py | 14 ++++++------
 tensorflow/python/keras/layers/core.py    | 26 ++++++++++++++++++++++-
 2 files changed, 32 insertions(+), 8 deletions(-)

diff --git a/tensorflow/python/estimator/keras_test.py b/tensorflow/python/estimator/keras_test.py
index 6688a84130..5e094ae92b 100644
--- a/tensorflow/python/estimator/keras_test.py
+++ b/tensorflow/python/estimator/keras_test.py
@@ -31,10 +31,10 @@ from tensorflow.python.estimator import run_config as run_config_lib
 from tensorflow.python.estimator.inputs import numpy_io
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
-from tensorflow.python.keras import backend as K
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.keras.applications import mobilenet
 from tensorflow.python.keras.optimizers import SGD
+from tensorflow.python.ops.parsing_ops import gen_parsing_ops
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
 from tensorflow.python.summary.writer import writer_cache
@@ -146,13 +146,13 @@ def randomize_io_type(array, name):
 def multi_inputs_multi_outputs_model():
   a = keras.layers.Input(shape=(16,), name='input_a')
   b = keras.layers.Input(shape=(16,), name='input_b')
-  m = keras.layers.Input(shape=(8,), dtype='bool', name='input_m')
+  m = keras.layers.Input(shape=(8,), dtype='string', name='input_m')
   dense = keras.layers.Dense(8, name='dense_1')
 
   a_2 = dense(a)
-  # Apply a mask
-  s_2 = keras.layers.Lambda(lambda k:
-                            K.switch(k[0], k[1], K.zeros_like(k[1])))([m, a_2])
+  # Read m
+  m_2 = keras.layers.Lambda(gen_parsing_ops.string_to_number)(m)
+  s_2 = keras.layers.Lambda(lambda k: k[0] * k[1])([m_2, a_2])
   b_2 = dense(b)
   merged = keras.layers.concatenate([s_2, b_2], name='merge')
   c = keras.layers.Dense(3, activation='softmax', name='dense_2')(merged)
@@ -372,13 +372,13 @@ class TestKerasEstimator(test_util.TensorFlowTestCase):
 
     def train_input_fn():
       input_dict = {'input_a': a_train, 'input_b': b_train,
-                    'input_m': input_m_train > 0}
+                    'input_m': input_m_train.astype(np.str)}
       output_dict = {'dense_2': c_train, 'dense_3': d_train}
       return input_dict, output_dict
 
     def eval_input_fn():
       input_dict = {'input_a': a_test, 'input_b': b_test,
-                    'input_m': input_m_test > 0}
+                    'input_m': input_m_test.astype(np.str)}
       output_dict = {'dense_2': c_test, 'dense_3': d_test}
       return input_dict, output_dict
 
diff --git a/tensorflow/python/keras/layers/core.py b/tensorflow/python/keras/layers/core.py
index df4c3915a3..db0c220380 100644
--- a/tensorflow/python/keras/layers/core.py
+++ b/tensorflow/python/keras/layers/core.py
@@ -19,7 +19,9 @@ from __future__ import division
 from __future__ import print_function
 
 import copy
+import sys
 import types as python_types
+import warnings
 
 import numpy as np
 
@@ -714,6 +716,7 @@ class Lambda(Layer):
     return self.mask
 
   def get_config(self):
+    module = self.function.__module__
     if isinstance(self.function, python_types.LambdaType):
       function = generic_utils.func_dump(self.function)
       function_type = 'lambda'
@@ -721,21 +724,26 @@ class Lambda(Layer):
       function = self.function.__name__
       function_type = 'function'
 
+    output_shape_module = None
     if isinstance(self._output_shape, python_types.LambdaType):
       output_shape = generic_utils.func_dump(self._output_shape)
       output_shape_type = 'lambda'
+      output_shape_module = self._output_shape.__module__
     elif callable(self._output_shape):
       output_shape = self._output_shape.__name__
       output_shape_type = 'function'
+      output_shape_module = self._output_shape.__module__
     else:
       output_shape = self._output_shape
       output_shape_type = 'raw'
 
     config = {
         'function': function,
+        'module': module,
         'function_type': function_type,
         'output_shape': output_shape,
         'output_shape_type': output_shape_type,
+        'output_shape_module': output_shape_module,
         'arguments': self.arguments
     }
     base_config = super(Lambda, self).get_config()
@@ -745,8 +753,16 @@ class Lambda(Layer):
   def from_config(cls, config, custom_objects=None):
     config = config.copy()
     globs = globals()
+    module = config.pop('module', None)
+    if module in sys.modules:
+      globs.update(sys.modules[module].__dict__)
+    elif module is not None:
+      # Note: we don't know the name of the function if it's a lambda.
+      warnings.warn('{} is not loaded, but a Lambda layer uses it. '
+                    'It may cause errors.'.format(module)
+                    , UserWarning)
     if custom_objects:
-      globs = dict(list(globs.items()) + list(custom_objects.items()))
+      globs.update(custom_objects)
     function_type = config.pop('function_type')
     if function_type == 'function':
       # Simple lookup in custom objects
@@ -760,6 +776,14 @@ class Lambda(Layer):
     else:
       raise TypeError('Unknown function type:', function_type)
 
+    output_shape_module = config.pop('output_shape_module', None)
+    if output_shape_module in sys.modules:
+      globs.update(sys.modules[output_shape_module].__dict__)
+    elif output_shape_module is not None:
+      # Note: we don't know the name of the function if it's a lambda.
+      warnings.warn('{} is not loaded, but a Lambda layer uses it. '
+                    'It may cause errors.'.format(output_shape_module)
+                    , UserWarning)
     output_shape_type = config.pop('output_shape_type')
     if output_shape_type == 'function':
       # Simple lookup in custom objects
-- 
GitLab


From a3b9e75063201c78c75e2f717a2ff24b0ffa6f44 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Yan=20Facai=20=28=E9=A2=9C=E5=8F=91=E6=89=8D=29?=
 <facai.yan@gmail.com>
Date: Tue, 5 Jun 2018 00:43:00 +0800
Subject: [PATCH 253/610] DOC: add more explanation for auxiliary_name_scope
 (#18948)

---
 tensorflow/python/ops/variable_scope.py | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/ops/variable_scope.py b/tensorflow/python/ops/variable_scope.py
index fa34774622..9c969d61c0 100644
--- a/tensorflow/python/ops/variable_scope.py
+++ b/tensorflow/python/ops/variable_scope.py
@@ -1778,6 +1778,23 @@ class variable_scope(object):
           assert v.name == "foo/bar/v:0"
   ```
 
+  Simple example of how to reenter a premade variable scope safely:
+
+  ```python
+  with tf.variable_scope("foo") as vs:
+    pass
+
+  # Re-enter the variable scope.
+  with tf.variable_scope(vs,
+                         auxiliary_name_scope=False) as vs1:
+    # Restore the original name_scope.
+    with tf.name_scope(vs1.original_name_scope):
+        v = tf.get_variable("v", [1])
+        assert v.name == "foo/v:0"
+        c = tf.constant([1], name="c")
+        assert c.name == "foo/c:0"
+  ```
+
   Basic example of sharing a variable AUTO_REUSE:
 
   ```python
@@ -1915,7 +1932,9 @@ class variable_scope(object):
         (which must have the same shape). Constraints are not safe to
         use when doing asynchronous distributed training.
       auxiliary_name_scope: If `True`, we create an auxiliary name scope with
-        the scope. If `False`, we don't touch name scope.
+        the scope. If `False`, we don't create it. Note that the argument is
+        not inherited, and it only takes effect for once when creating. You
+        should only use it for re-entering a premade variable scope.
 
     Returns:
       A scope that can be captured and reused.
-- 
GitLab


From 440e3850bd197332876f391e79cf06c723d69885 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Mon, 4 Jun 2018 09:44:20 -0700
Subject: [PATCH 254/610] Fix issue in Keras model compile with float64 mode
 (#19328)

* Fix issue in Keras model compile with float64 mode

This fix tries to address the issue raised in 19318 where
Keras model compile for `model.compile('rmsprop', 'mse')`
does not work in float64 mode.

The issue comes from `placeholder_with_default([1.]...`, which
returns dtype float32 by default (as `[1.]` was interpreted
as float32). Since placeholder does not have an output_dtype to pass,
this fix converts `[1.]` to float64 first before passing in.

This fix fixes 19318.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Fix pylint issue

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Add test case for float64 and model compile

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/python/keras/engine/training.py |  7 +++++--
 tensorflow/python/keras/models_test.py     | 14 ++++++++++++++
 2 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/keras/engine/training.py b/tensorflow/python/keras/engine/training.py
index 04a2aa7664..aca63f822b 100644
--- a/tensorflow/python/keras/engine/training.py
+++ b/tensorflow/python/keras/engine/training.py
@@ -24,6 +24,7 @@ import numpy as np
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
@@ -409,11 +410,13 @@ class Model(Network):
         else:
           if sample_weight_mode == 'temporal':
             sample_weights.append(array_ops.placeholder_with_default(
-                [[1.]], shape=[None, None], name=name + '_sample_weights'))
+                constant_op.constant([[1.]], dtype=K.floatx()),
+                shape=[None, None], name=name + '_sample_weights'))
             sample_weight_modes.append('temporal')
           else:
             sample_weights.append(array_ops.placeholder_with_default(
-                [1.], shape=[None], name=name + '_sample_weights'))
+                constant_op.constant([1.], dtype=K.floatx()),
+                shape=[None], name=name + '_sample_weights'))
             sample_weight_modes.append(None)
     self.sample_weight_modes = sample_weight_modes
     self._feed_sample_weight_modes = []
diff --git a/tensorflow/python/keras/models_test.py b/tensorflow/python/keras/models_test.py
index c616d8f24f..e6e45902a8 100644
--- a/tensorflow/python/keras/models_test.py
+++ b/tensorflow/python/keras/models_test.py
@@ -144,5 +144,19 @@ class CheckpointingTests(test.TestCase):
     model.load_weights(save_prefix)
     self.assertEqual(12., self.evaluate(beta1_power))
 
+class TestModelBackend(test.TestCase):
+
+  def test_model_backend_float64_use_cases(self):
+    # Test case for GitHub issue 19318
+    floatx = keras.backend.floatx()
+    keras.backend.set_floatx('float64')
+
+    x = keras.Input((5,))
+    y = keras.layers.Dense(1)(x)
+    model = keras.models.Model(x, y)
+    model.compile('rmsprop', 'mse')
+
+    keras.backend.set_floatx(floatx)
+
 if __name__ == '__main__':
   test.main()
-- 
GitLab


From b940fb6ac1234d73fbb50053edf21600bacdda18 Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Mon, 4 Jun 2018 16:46:03 +0000
Subject: [PATCH 255/610] Update golden API

The golden API is updated with:
```
bazel-bin/tensorflow/tools/api/tests/api_compatibility_test \
          --update_goldens True
```

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/tools/api/golden/tensorflow.pbtxt | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tensorflow/tools/api/golden/tensorflow.pbtxt b/tensorflow/tools/api/golden/tensorflow.pbtxt
index 3051c4437e..01b8058118 100644
--- a/tensorflow/tools/api/golden/tensorflow.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.pbtxt
@@ -792,6 +792,10 @@ tf_module {
     name: "broadcast_static_shape"
     argspec: "args=[\'shape_x\', \'shape_y\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "broadcast_to"
+    argspec: "args=[\'input\', \'shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "case"
     argspec: "args=[\'pred_fn_pairs\', \'default\', \'exclusive\', \'strict\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'False\', \'case\'], "
-- 
GitLab


From b5f1ba290053893376bea31b8c4629b7efcd8c0a Mon Sep 17 00:00:00 2001
From: Yanan Cao <ycao@google.com>
Date: Mon, 4 Jun 2018 09:56:21 -0700
Subject: [PATCH 256/610] Minor error message fix in TPUEstimator.

PiperOrigin-RevId: 199148136
---
 tensorflow/contrib/tpu/python/tpu/tpu_estimator.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
index a155de3844..f63e9e8bda 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
@@ -2641,7 +2641,7 @@ class _CapturedObject(object):
   def capture(self, o):
     if self._captured:
       raise RuntimeError(
-          'InternalError: Object can be captured only. Please file bug .')
+          'InternalError: Object can capture only once. Please file bug.')
 
     self._captured = True
     self._object = o
@@ -2650,7 +2650,7 @@ class _CapturedObject(object):
     if not self._captured:
       raise RuntimeError(
           'InternalError: Object is not captured properly before `get`. '
-          'Please file bug .')
+          'Please file bug.')
     return self._object
 
 
-- 
GitLab


From f277fb608d5e278d04e81b82f57b69afe723d973 Mon Sep 17 00:00:00 2001
From: Blake Hechtman <blakehechtman@google.com>
Date: Mon, 4 Jun 2018 10:24:33 -0700
Subject: [PATCH 257/610] [TF2XLA] Change to resize bilinear to better match a
 BackpropInput convolution by swapping the kernel input and output feature
 dimension.

PiperOrigin-RevId: 199153010
---
 tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc b/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc
index 91bff995a1..79d3a6979c 100644
--- a/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc
@@ -197,8 +197,8 @@ xla::XlaOp ResizeUsingDilationAndConvolution(xla::XlaBuilder* builder,
     dimension_numbers.add_output_spatial_dimensions(1 + i);
     dimension_numbers.add_kernel_spatial_dimensions(i);
   }
-  dimension_numbers.set_kernel_input_feature_dimension(num_spatial_dims);
-  dimension_numbers.set_kernel_output_feature_dimension(num_spatial_dims + 1);
+  dimension_numbers.set_kernel_input_feature_dimension(num_spatial_dims + 1);
+  dimension_numbers.set_kernel_output_feature_dimension(num_spatial_dims);
 
   ResizeConvolutionDims dims =
       ComputeResizeConvolutionParameters(in_size, out_size);
-- 
GitLab


From 4a1197c4c09ca4383cf7fc24c08d83a1641c7735 Mon Sep 17 00:00:00 2001
From: G K <klambauer@bioinf.jku.at>
Date: Mon, 4 Jun 2018 19:30:17 +0200
Subject: [PATCH 258/610] added crucial documentation on SELU activation
 (#15337)

* added crucial documentation on SELU activation

* changed from layers. to tf.
---
 tensorflow/core/api_def/base_api/api_def_Selu.pbtxt | 4 ++++
 tensorflow/go/op/wrappers.go                        | 6 +++---
 tensorflow/python/keras/activations.py              | 2 ++
 3 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/tensorflow/core/api_def/base_api/api_def_Selu.pbtxt b/tensorflow/core/api_def/base_api/api_def_Selu.pbtxt
index cbe76de415..985f09312f 100644
--- a/tensorflow/core/api_def/base_api/api_def_Selu.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_Selu.pbtxt
@@ -4,6 +4,10 @@ op {
   description: <<END
 if < 0, `scale * features` otherwise.
 
+To be used together with
+`initializer = tf.variance_scaling_initializer(factor=1.0, mode='FAN_IN')`.
+For correct dropout, use `tf.contrib.nn.alpha_dropout`.
+
 See [Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515)
 END
 }
diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go
index f082d84858..62a2e01c8a 100644
--- a/tensorflow/go/op/wrappers.go
+++ b/tensorflow/go/op/wrappers.go
@@ -18101,9 +18101,10 @@ func SparseFillEmptyRowsGrad(scope *Scope, reverse_index_map tf.Output, grad_val
 }
 
 // Computes scaled exponential linear: `scale * alpha * (exp(features) - 1)`
-//
 // if < 0, `scale * features` otherwise.
 //
+// Assumes weights to have zero mean and variance 1.0 / fan_in.
+//
 // See [Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515)
 func Selu(scope *Scope, features tf.Output) (activations tf.Output) {
 	if scope.Err() != nil {
@@ -24359,8 +24360,7 @@ type DecodeProtoV2Attr func(optionalAttr)
 // If not specified, defaults to "local://"
 func DecodeProtoV2DescriptorSource(value string) DecodeProtoV2Attr {
 	return func(m optionalAttr) {
-		m["descriptor_source"] = value
-	}
+		m["descriptor_source"] = value	}
 }
 
 // DecodeProtoV2MessageFormat sets the optional message_format attribute to value.
diff --git a/tensorflow/python/keras/activations.py b/tensorflow/python/keras/activations.py
index a62dadb830..92ad7c7e36 100644
--- a/tensorflow/python/keras/activations.py
+++ b/tensorflow/python/keras/activations.py
@@ -71,6 +71,8 @@ def selu(x):
       - To be used together with the initialization "lecun_normal".
       - To be used together with the dropout variant "AlphaDropout".
 
+  References:
+      - [Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515)
   """
   alpha = 1.6732632423543772848170429916717
   scale = 1.0507009873554804934193349852946
-- 
GitLab


From f4048e5e4942137ee1a4103f54935538450e1a0a Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 4 Jun 2018 10:25:23 -0700
Subject: [PATCH 259/610] Computing the volume of the set of correlation
 matrices with bounded determinant.

This is useful for testing the LKJ distribution on correlation matrices.

PiperOrigin-RevId: 199153115
---
 .../python/kernel_tests/util/BUILD            |  48 +++
 .../util/correlation_matrix_volumes.py        |  98 ++++++
 .../util/correlation_matrix_volumes_lib.py    | 323 ++++++++++++++++++
 .../util/correlation_matrix_volumes_test.py   | 150 ++++++++
 4 files changed, 619 insertions(+)
 create mode 100644 tensorflow/contrib/distributions/python/kernel_tests/util/BUILD
 create mode 100644 tensorflow/contrib/distributions/python/kernel_tests/util/correlation_matrix_volumes.py
 create mode 100644 tensorflow/contrib/distributions/python/kernel_tests/util/correlation_matrix_volumes_lib.py
 create mode 100644 tensorflow/contrib/distributions/python/kernel_tests/util/correlation_matrix_volumes_test.py

diff --git a/tensorflow/contrib/distributions/python/kernel_tests/util/BUILD b/tensorflow/contrib/distributions/python/kernel_tests/util/BUILD
new file mode 100644
index 0000000000..03e26b198e
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/kernel_tests/util/BUILD
@@ -0,0 +1,48 @@
+# Description:
+#   Internal testing utilities, e.g., computing the correct answer to
+#   put in a unit test.
+
+licenses(["notice"])  # Apache 2.0
+
+py_library(
+    name = "correlation_matrix_volumes_py",
+    srcs = [
+        "correlation_matrix_volumes_lib.py",
+    ],
+    deps = [
+        "//tensorflow/contrib/distributions:distributions_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:math_ops",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_binary(
+    name = "correlation_matrix_volumes",
+    srcs = [
+        "correlation_matrix_volumes.py",
+    ],
+    deps = [
+        ":correlation_matrix_volumes_py",
+    ],
+)
+
+py_test(
+    name = "correlation_matrix_volumes_test",
+    size = "medium",
+    srcs = ["correlation_matrix_volumes_test.py"],
+    tags = ["no_pip"],
+    deps = [
+        ":correlation_matrix_volumes_py",
+        # For statistical testing
+        "//tensorflow/contrib/distributions:distributions_py",
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:check_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+    ],
+)
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/util/correlation_matrix_volumes.py b/tensorflow/contrib/distributions/python/kernel_tests/util/correlation_matrix_volumes.py
new file mode 100644
index 0000000000..2eab51cd30
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/kernel_tests/util/correlation_matrix_volumes.py
@@ -0,0 +1,98 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Executable to estimate the volume of various sets of correlation matrices.
+
+See correlation_matrix_volumes_lib.py for purpose and methodology.
+
+Invocation example:
+```
+python correlation_matrix_volumes.py --num_samples 1e7
+```
+
+This will compute 10,000,000-sample confidence intervals for the
+volumes of several sets of correlation matrices.  Which sets, and the
+desired statistical significance, are hard-coded in this source file.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import pprint
+
+from absl import app
+from absl import flags
+
+from tensorflow.contrib.distributions.python.kernel_tests.util import correlation_matrix_volumes_lib as corr
+
+FLAGS = flags.FLAGS
+
+# Float to support giving the number of samples in scientific notation.
+# The production run used for the LKJ test used 1e7 samples.
+flags.DEFINE_float('num_samples', 1e4, 'Number of samples to use.')
+
+
+def ctv_debatched(det_bounds, dim, num_samples, error_rate=1e-6, seed=42):
+  # This wrapper undoes the batching in compute_true_volumes, because
+  # apparently several 5x5x9x1e7 Tensors of float32 can strain RAM.
+  bounds = {}
+  for db in det_bounds:
+    bounds[db] = corr.compute_true_volumes(
+        [db], dim, num_samples, error_rate=error_rate, seed=seed)[db]
+  return bounds
+
+
+# The particular bounds in all three of these functions were chosen by
+# a somewhat arbitrary walk through an empirical tradeoff, for the
+# purpose of testing the LKJ distribution.  Setting the determinant
+# bound lower
+# - Covers more of the testee's sample space, and
+# - Increases the probability that the rejection sampler will hit, thus
+# - Decreases the relative error (at a fixed sample count) in the
+#   rejection-based volume estimate;
+# but also
+# - Increases the variance of the estimator used in the LKJ test.
+# This latter variance is also affected by the dimension and the
+# tested concentration parameter, and can be compensated for with more
+# compute (expensive) or a looser discrepancy limit (unsatisfying).
+# The values here are the projection of the points in that test design
+# space that ended up getting chosen.
+def compute_3x3_volumes(num_samples):
+  det_bounds = [0.01, 0.25, 0.3, 0.35, 0.4, 0.45]
+  return ctv_debatched(
+      det_bounds, 3, num_samples, error_rate=5e-7, seed=46)
+
+
+def compute_4x4_volumes(num_samples):
+  det_bounds = [0.01, 0.25, 0.3, 0.35, 0.4, 0.45]
+  return ctv_debatched(
+      det_bounds, 4, num_samples, error_rate=5e-7, seed=47)
+
+
+def compute_5x5_volumes(num_samples):
+  det_bounds = [0.01, 0.2, 0.25, 0.3, 0.35, 0.4]
+  return ctv_debatched(
+      det_bounds, 5, num_samples, error_rate=5e-7, seed=48)
+
+
+def main(_):
+  full_bounds = {}
+  full_bounds[3] = compute_3x3_volumes(int(FLAGS.num_samples))
+  full_bounds[4] = compute_4x4_volumes(int(FLAGS.num_samples))
+  full_bounds[5] = compute_5x5_volumes(int(FLAGS.num_samples))
+  pprint.pprint(full_bounds)
+
+if __name__ == '__main__':
+  app.run(main)
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/util/correlation_matrix_volumes_lib.py b/tensorflow/contrib/distributions/python/kernel_tests/util/correlation_matrix_volumes_lib.py
new file mode 100644
index 0000000000..455e71f00c
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/kernel_tests/util/correlation_matrix_volumes_lib.py
@@ -0,0 +1,323 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Estimating the volume of the correlation matrices with bounded determinant.
+
+Why?  Because lkj_test.py tests the sampler for the LKJ distribution
+by estimating the same volume another way.
+
+How?  Rejection sampling.  Or, more precisely, importance sampling,
+proposing from the uniform distribution on symmetric matrices with
+diagonal 1s and entries in [-1, 1].  Such a matrix is a correlation
+matrix if and only if it is also positive semi-definite.
+
+The samples can then be converted into a confidence interval on the
+volume in question by the [Clopper-Pearson
+method](https://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval),
+also implemented here.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import importlib
+import sys
+
+import numpy as np
+
+from tensorflow.python.client import session
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import linalg_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import uniform
+from tensorflow.python.ops.distributions import util
+from tensorflow.python.platform import tf_logging
+
+__all__ = [
+    "correlation_matrix_volume_rejection_samples",
+    "compute_true_volumes",
+]
+
+
+def try_import(name):  # pylint: disable=invalid-name
+  """Returns module `name`, or None after a warning on ImportError."""
+  try:
+    return importlib.import_module(name)
+  except ImportError as e:
+    tf_logging.warning("Could not import %s: %s" % (name, str(e)))
+    return None
+
+optimize = try_import("scipy.optimize")
+stats = try_import("scipy.stats")
+
+
+def _psd_mask(x):
+  """Flags which square matrices in a batch are positive semi-definite.
+
+  Args:
+    x: A floating-point `Tensor` of shape `[B1, ..., Bn, M, M]`.
+
+  Returns:
+    mask: A `Tensor` of shape `[B1, ..., Bn]` and the dtype of `x`,
+      holding 1 where the matching matrix is PSD and 0 elsewhere.
+  """
+  # A Cholesky-based test is reportedly cheaper, see
+  # https://scicomp.stackexchange.com/questions/12979/testing-if-a-matrix-is-positive-semi-definite
+  # (the matrix is PSD if the factorization succeeds, not PSD if it
+  # fails).  TensorFlow's Cholesky, however, raises as soon as _any_
+  # matrix in the batch is not PSD, without reporting _which_ ones
+  # failed.  So instead all the eigenvalues are computed explicitly,
+  # and a matrix passes when its smallest eigenvalue is non-negative.
+  #
+  # The linked answer also warns that treating SPD-ness as a binary
+  # property is risky in floating-point arithmetic: a Cholesky
+  # factorization can complete and 'look' fine (e.g., O(1) entries
+  # and a diagonal of all ones) for a matrix with an exponential
+  # condition number.
+  # `self_adjoint_eig` returns a pair whose first item holds the
+  # eigenvalues; the second item is not needed here.
+  spectrum = linalg_ops.self_adjoint_eig(x)[0]
+  smallest = math_ops.reduce_min(spectrum, axis=-1)
+  return math_ops.cast(smallest >= 0, dtype=x.dtype)
+
+
+def _det_large_enough_mask(x, det_bounds):
+  """Flags which matrices have determinant above the given lower bound.
+
+  Args:
+    x: A floating-point `Tensor` of shape `[B1, ..., Bn, M, M]`.
+    det_bounds: A floating-point `Tensor` that must broadcast to shape
+      `[B1, ..., Bn]`, holding the lower bound each determinant in `x`
+      is compared against.
+
+  Returns:
+    mask: A `Tensor` of shape `[B1, ..., Bn]` and the dtype of `x`,
+      holding 1 where the determinant is strictly greater than its
+      bound and 0 elsewhere.
+  """
+  # Open question: every matrix whose determinant matters here is
+  # PSD, so a Cholesky-based determinant might be possible and
+  # preferable.  It was not clear how to write that in TensorFlow.
+  #
+  # Expert opinion is that it would be roughly twice as fast, since
+  # Cholesky costs about half as much as Gaussian Elimination with
+  # Partial Pivoting.  That gain is still smaller than the one from
+  # the analogous switch in _psd_mask.
+  # The comparison is strict: equality with the bound yields 0.
+  determinants = linalg_ops.matrix_determinant(x)
+  return math_ops.cast(determinants > det_bounds, dtype=x.dtype)
+
+
+def _uniform_correlation_like_matrix(num_rows, batch_shape, dtype, seed):
+  """Returns a uniformly random `Tensor` of "correlation-like" matrices.
+
+  A "correlation-like" matrix is a symmetric square matrix with all entries
+  between -1 and 1 (inclusive) and 1s on the main diagonal.  Of these,
+  the ones that are positive semi-definite are exactly the correlation
+  matrices.
+
+  Args:
+    num_rows: Python `int` dimension of the correlation-like matrices.
+    batch_shape: `Tensor` or Python `tuple` of `int` shape of the
+      batch to return.
+    dtype: `dtype` of the `Tensor` to return.
+    seed: Random seed.
+
+  Returns:
+    matrices: A `Tensor` of shape `batch_shape + [num_rows, num_rows]`
+      and dtype `dtype`.  Each entry is in [-1, 1], and each matrix
+      along the bottom two dimensions is symmetric and has 1s on the
+      main diagonal.
+  """
+  # Floor division: this file enables true division, and a shape
+  # should be built from an `int`, not a `float`.
+  num_entries = num_rows * (num_rows + 1) // 2
+  ones = array_ops.ones(shape=[num_entries], dtype=dtype)
+  # Generating random values for the diagonal looks wasteful, since
+  # they get overwritten below, but `fill_triangular` fills the
+  # diagonal too.  Filling the whole matrix with random values and
+  # masking with `matrix_band_part` might be more efficient.
+  unifs = uniform.Uniform(-ones, ones).sample(batch_shape, seed=seed)
+  tril = util.fill_triangular(unifs)
+  # The doubled diagonal of `tril + tril^T` is replaced by 1s below.
+  symmetric = tril + array_ops.matrix_transpose(tril)
+  diagonal_ones = array_ops.ones(
+      shape=util.pad(batch_shape, axis=0, back=True, value=num_rows),
+      dtype=dtype)
+  return array_ops.matrix_set_diag(symmetric, diagonal_ones)
+
+
+def correlation_matrix_volume_rejection_samples(
+    det_bounds, dim, sample_shape, dtype, seed):
+  """Draws weighted rejection samples for correlation matrix volumes.
+
+  Proposals come from the uniform distribution on "correlation-like"
+  matrices: symmetric square matrices with 1s on the main diagonal
+  and every other entry between -1 and 1 (inclusive).  The positive
+  semi-definite ones among them are exactly the correlation
+  matrices.
+
+  A `Tensor` of `sample_shape` such matrices, each `dim` by `dim`, is
+  sampled, and every matrix is checked for (i) being a correlation
+  matrix (i.e., PSD), and (ii) having a determinant greater than
+  the corresponding entry of `det_bounds`.  A matrix failing either
+  check is rejected, i.e., gets weight 0.
+
+  Args:
+    det_bounds: A `Tensor` of lower bounds on the determinants of
+      acceptable matrices.  The shape must broadcast with `sample_shape`.
+    dim: A Python `int` dimension of correlation matrices to sample.
+    sample_shape: Python `tuple` of `int` shape of the samples to
+      compute, excluding the two matrix dimensions.
+    dtype: The `dtype` in which to do the computation.
+    seed: Random seed.
+
+  Returns:
+    weights: A `Tensor` of shape `sample_shape`.  An entry is 0 when
+      the corresponding proposal was rejected.  Otherwise it equals
+      `volume`, which is the multiplicative inverse of the density
+      with which the uniform proposal generates any one
+      correlation-like matrix.
+    volume: The volume of the set of `dim` by `dim` correlation-like
+      matrices, as a Python `float`.
+  """
+  with ops.name_scope("rejection_sampler"):
+    proposals = _uniform_correlation_like_matrix(
+        dim, sample_shape, dtype, seed=seed)
+    # Each of the dim * (dim - 1) / 2 free entries ranges over [-1, 1].
+    volume = 2. ** (dim * (dim - 1) / 2.)
+    is_psd = _psd_mask(proposals)
+    det_ok = _det_large_enough_mask(proposals, det_bounds)
+    # A proposal has density 1 / volume, so an accepted one is
+    # weighted by 1 / density = volume.
+    weights = volume * is_psd * det_ok
+    return weights, volume
+
+
+def _clopper_pearson_confidence_interval(samples, error_rate):
+  """Computes a confidence interval for the mean of the given 1-D distribution.
+
+  Assumes (and checks) that the given distribution is Bernoulli, i.e.,
+  takes only two values.  This licenses using the CDF of the binomial
+  distribution for the confidence, which is tighter (for extreme
+  probabilities) than the DKWM inequality.  The method is known as the
+  [Clopper-Pearson method]
+  (https://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval).
+
+  Assumes:
+
+  - The given samples were drawn iid from the distribution of interest.
+
+  - The given distribution is a Bernoulli, i.e., supported only on
+    low and high.
+
+  Guarantees:
+
+  - The probability (over the randomness of drawing the given sample)
+    that the true mean is outside the returned interval is no more
+    than the given error_rate.
+
+  Args:
+    samples: `np.ndarray` of samples drawn iid from the distribution
+      of interest.
+    error_rate: Python `float` admissible rate of mistakes.
+
+  Returns:
+    low: Lower bound of confidence interval.
+    high: Upper bound of confidence interval.
+
+  Raises:
+    ValueError: If `samples` has rank other than 1 (batch semantics
+      are not implemented), if scipy is unavailable, or if `samples`
+      does not consist of exactly two distinct values, `low` and `high`.
+  """
+  # TODO(b/78025336) Migrate this confidence interval function
+  # to statistical_testing.py.  In order to do that
+  # - Get the binomial CDF from the Binomial distribution
+  # - Implement scalar root finding in TF.  Batch bisection search
+  #   shouldn't be too hard, and is definitely good enough for this
+  #   problem.  Batching the Brent algorithm (from scipy) that is used
+  #   here may be more involved, but may also not be necessary---it's
+  #   only used here because scipy made it convenient.  In particular,
+  #   robustness is more important than speed here, which may make
+  #   bisection search actively better.
+  # - The rest is just a matter of rewriting in the appropriate style.
+  if optimize is None or stats is None:
+    raise ValueError(
+        "Scipy is required for computing Clopper-Pearson confidence intervals")
+  if len(samples.shape) != 1:
+    raise ValueError("Batch semantics not implemented")
+  n = len(samples)
+  low = np.amin(samples)
+  high = np.amax(samples)
+  successes = np.count_nonzero(samples - low)
+  failures = np.count_nonzero(samples - high)
+  if successes + failures != n:
+    uniques = np.unique(samples)
+    if len(uniques) < 3:  # E.g., every sample is equal (low == high).
+      raise ValueError("Purportedly Bernoulli distribution lacked two "
+                       "distinct sample values: {}".format(uniques.tolist()))
+    msg = ("Purportedly Bernoulli distribution had distinct samples"
+           " {}, {}, and {}".format(uniques[0], uniques[1], uniques[2]))
+    raise ValueError(msg)
+  def p_small_enough(p):
+    return stats.binom.logcdf(successes, n, p) - np.log(error_rate / 2.)
+  def p_big_enough(p):
+    return stats.binom.logsf(successes, n, p) - np.log(error_rate / 2.)
+  high_p = optimize.brentq(
+      p_small_enough, float(successes) / n, 1., rtol=1e-9)
+  low_p = optimize.brentq(
+      p_big_enough, 0., float(successes) / n, rtol=1e-9)
+  # Map the bounds on the success probability onto [low, high].
+  return (low + (high - low) * low_p, low + (high - low) * high_p)
+
+
+def compute_true_volumes(
+    det_bounds, dim, num_samples, error_rate=1e-6, seed=42):
+  """Estimates correlation matrix volumes as confidence intervals.
+
+  The confidence intervals come from the [Clopper-Pearson method]
+  (https://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval).
+
+  Args:
+    det_bounds: A rank-1 numpy array of lower bounds on the
+      determinants of acceptable matrices.  Entries must be unique.
+    dim: A Python `int` dimension of correlation matrices to sample.
+    num_samples: The number of samples to draw for each bound.
+    error_rate: The statistical significance of the returned
+      confidence intervals.  It applies to each interval separately:
+      any single interval may be incorrect with probability (under
+      the sample of correlation-like matrices drawn internally) at
+      most `error_rate`.
+    seed: Random seed.
+
+  Returns:
+    bounds: A Python `dict` mapping each determinant bound to the
+      `(low, high)` tuple giving its confidence interval.
+  """
+  template = ("Estimating volume of {}x{} correlation "
+              "matrices with determinant >= {}.")
+  with session.Session() as sess:
+    weights_op, _ = correlation_matrix_volume_rejection_samples(
+        det_bounds, dim, [num_samples, len(det_bounds)], np.float32, seed=seed)
+    weights = sess.run(weights_op)
+    intervals = {}
+    for det, column in zip(det_bounds, np.rollaxis(weights, 1)):
+      print(template.format(dim, dim, det))
+      sys.stdout.flush()
+      intervals[det] = _clopper_pearson_confidence_interval(
+          column, error_rate=error_rate)
+    return intervals
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/util/correlation_matrix_volumes_test.py b/tensorflow/contrib/distributions/python/kernel_tests/util/correlation_matrix_volumes_test.py
new file mode 100644
index 0000000000..8f99300e63
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/kernel_tests/util/correlation_matrix_volumes_test.py
@@ -0,0 +1,150 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for correlation_matrix_volumes_lib.py."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.distributions.python.kernel_tests.util import correlation_matrix_volumes_lib as corr
+from tensorflow.contrib.distributions.python.ops import statistical_testing as st
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.platform import test
+
+
+# NxN correlation matrices are determined by the N*(N-1)/2
+# lower-triangular entries.  In addition to being between -1 and 1,
+# they must also obey the constraint that the determinant of the
+# resulting symmetric matrix is non-negative.  In 2x2, we can even
+# analytically compute the volume when the determinant is bounded to >
+# epsilon, as that boils down to the one lower-triangular entry being
+# less than sqrt(1 - epsilon) in absolute value.
+def two_by_two_volume(det_bound):
+  return 2 * np.sqrt(1.0 - det_bound)
+
+
+# The post
+# https://psychometroscar.com/the-volume-of-a-3-x-3-correlation-matrix/
+# derives (with elementary calculus) that the volume (with respect to
+# Lebesgue^3 measure) of the set of 3x3 correlation matrices is
+# pi^2/2.  The same result is also obtained by [1].
+def three_by_three_volume():
+  return np.pi**2 / 2.
+
+
+# The volume of the unconstrained set of correlation matrices is also
+# the normalization constant of the LKJ distribution from [2].  As
+# part of defining the distribution, that reference derives a general
+# formula for this volume for all dimensions.  A TensorFlow
+# computation thereof gave the below result for 4x4:
+def four_by_four_volume():
+  # This constant computed as math_ops.exp(lkj.log_norm_const(4, [1.0]))
+  return 11.6973076
+
+# [1] Rousseeuw, P. J., & Molenberghs, G. (1994). "The shape of
+# correlation matrices." The American Statistician, 48(4), 276-279.
+
+# [2] Daniel Lewandowski, Dorota Kurowicka, and Harry Joe, "Generating
+# random correlation matrices based on vines and extended onion
+# method," Journal of Multivariate Analysis 100 (2009), pp 1989-2001.
+
+
+class CorrelationMatrixVolumesTest(test.TestCase):
+
+  def testRejection2D(self):
+    num_samples = int(1e5)  # Chosen for a small min detectable discrepancy
+    det_bounds = np.array(
+        [0.01, 0.02, 0.03, 0.04, 0.05, 0.3, 0.35, 0.4, 0.5], dtype=np.float32)
+    exact_volumes = two_by_two_volume(det_bounds)
+    (rej_weights,
+     rej_proposal_volume) = corr.correlation_matrix_volume_rejection_samples(
+         det_bounds, 2, [num_samples, 9], dtype=np.float32, seed=43)
+    # shape of rej_weights: [num_samples, 9]
+    chk1 = st.assert_true_mean_equal_by_dkwm(
+        rej_weights, low=0., high=rej_proposal_volume, expected=exact_volumes,
+        false_fail_rate=1e-6)
+    chk2 = check_ops.assert_less(
+        st.min_discrepancy_of_true_means_detectable_by_dkwm(
+            num_samples, low=0., high=rej_proposal_volume,
+            # Correct the false fail rate due to different broadcasting
+            false_fail_rate=1.1e-7, false_pass_rate=1e-6),
+        0.036)
+    with ops.control_dependencies([chk1, chk2]):
+      rej_weights = array_ops.identity(rej_weights)
+    self.evaluate(rej_weights)
+
+  def testRejection3D(self):
+    num_samples = int(1e5)  # Chosen for a small min detectable discrepancy
+    det_bounds = np.array([0.0], dtype=np.float32)
+    exact_volumes = np.array([three_by_three_volume()], dtype=np.float32)
+    (rej_weights,
+     rej_proposal_volume) = corr.correlation_matrix_volume_rejection_samples(
+         det_bounds, 3, [num_samples, 1], dtype=np.float32, seed=44)
+    # shape of rej_weights: [num_samples, 1]
+    chk1 = st.assert_true_mean_equal_by_dkwm(
+        rej_weights, low=0., high=rej_proposal_volume, expected=exact_volumes,
+        false_fail_rate=1e-6)
+    chk2 = check_ops.assert_less(
+        st.min_discrepancy_of_true_means_detectable_by_dkwm(
+            num_samples, low=0., high=rej_proposal_volume,
+            false_fail_rate=1e-6, false_pass_rate=1e-6),
+        # Going for about a 3% relative error
+        0.15)
+    with ops.control_dependencies([chk1, chk2]):
+      rej_weights = array_ops.identity(rej_weights)
+    self.evaluate(rej_weights)
+
+  def testRejection4D(self):
+    num_samples = int(1e5)  # Chosen for a small min detectable discrepancy
+    det_bounds = np.array([0.0], dtype=np.float32)
+    exact_volumes = [four_by_four_volume()]
+    (rej_weights,
+     rej_proposal_volume) = corr.correlation_matrix_volume_rejection_samples(
+         det_bounds, 4, [num_samples, 1], dtype=np.float32, seed=45)
+    # shape of rej_weights: [num_samples, 1]
+    chk1 = st.assert_true_mean_equal_by_dkwm(
+        rej_weights, low=0., high=rej_proposal_volume, expected=exact_volumes,
+        false_fail_rate=1e-6)
+    chk2 = check_ops.assert_less(
+        st.min_discrepancy_of_true_means_detectable_by_dkwm(
+            num_samples, low=0., high=rej_proposal_volume,
+            false_fail_rate=1e-6, false_pass_rate=1e-6),
+        # Going for about a 10% relative error
+        1.1)
+    with ops.control_dependencies([chk1, chk2]):
+      rej_weights = array_ops.identity(rej_weights)
+    self.evaluate(rej_weights)
+
+  def testVolumeEstimation2D(self):
+    # Test that the confidence intervals produced by
+    # corr.compute_true_volumes are sound, in the sense of containing
+    # the exact volume.
+    num_samples = int(1e5)  # Chosen by symmetry with testRejection2D
+    det_bounds = np.array(
+        [0.01, 0.02, 0.03, 0.04, 0.05, 0.3, 0.35, 0.4, 0.5], dtype=np.float32)
+    volume_bounds = corr.compute_true_volumes(
+        det_bounds, 2, num_samples, error_rate=1e-6, seed=47)
+    exact_volumes = two_by_two_volume(det_bounds)
+    for det, volume in zip(det_bounds, exact_volumes):
+      computed_low, computed_high = volume_bounds[det]
+      self.assertLess(computed_low, volume)
+      self.assertGreater(computed_high, volume)
+
+if __name__ == "__main__":
+  test.main()
-- 
GitLab


From 5f315a292a65bd898a736cd305152f348846718a Mon Sep 17 00:00:00 2001
From: Mark Daoust <markdaoust@google.com>
Date: Mon, 4 Jun 2018 11:11:06 -0700
Subject: [PATCH 260/610] Fix visibility for tf.keras.__version__

PiperOrigin-RevId: 199161696
---
 tensorflow/python/keras/__init__.py         | 4 ++++
 tensorflow/python/keras/integration_test.py | 3 +++
 2 files changed, 7 insertions(+)

diff --git a/tensorflow/python/keras/__init__.py b/tensorflow/python/keras/__init__.py
index 197f306097..3493069a5b 100644
--- a/tensorflow/python/keras/__init__.py
+++ b/tensorflow/python/keras/__init__.py
@@ -41,8 +41,12 @@ from tensorflow.python.keras.layers import Input
 from tensorflow.python.keras.models import Model
 from tensorflow.python.keras.models import Sequential
 
+from tensorflow.python.util.tf_export import tf_export
+
 __version__ = '2.1.6-tf'
 
+tf_export('keras.__version__').export_constant(__name__, '__version__')
+
 del absolute_import
 del division
 del print_function
diff --git a/tensorflow/python/keras/integration_test.py b/tensorflow/python/keras/integration_test.py
index 2e83544d97..2a05699407 100644
--- a/tensorflow/python/keras/integration_test.py
+++ b/tensorflow/python/keras/integration_test.py
@@ -29,6 +29,9 @@ from tensorflow.python.platform import test
 
 class KerasIntegrationTest(test.TestCase):
 
+  def test_version(self):
+    self.assertTrue(keras.__version__.endswith('-tf'))
+
   def test_vector_classification_sequential(self):
     with self.test_session():
       np.random.seed(1337)
-- 
GitLab


From add0043e9d6233d9fabf2676e449d26ecd257ec5 Mon Sep 17 00:00:00 2001
From: Yunxing Dai <yunxing@google.com>
Date: Mon, 4 Jun 2018 11:25:24 -0700
Subject: [PATCH 261/610] - Fix typo in evaluator

PiperOrigin-RevId: 199164433
---
 tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h
index b1b58642ec..13f46407e3 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h
+++ b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h
@@ -1962,7 +1962,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
 
     // TODO(b/74360564): This is implementation defined behavior, but is
     // currently respected by all implementations. Change this if we ever decide
-    // to oficially document different behavior.
+    // to officially document different behavior.
     for (int64 i = 0; i < start.size(); ++i) {
       start[i] = std::min<int64>(
           std::max(int64{0}, start[i]),
-- 
GitLab


From afb0950cf4acf1ec920287066154cc1b21b2a7bf Mon Sep 17 00:00:00 2001
From: Dan Moldovan <mdan@google.com>
Date: Mon, 4 Jun 2018 11:45:53 -0700
Subject: [PATCH 262/610] Add a special functions module that contains
 non-Python abstractions, like the list stack operation.

PiperOrigin-RevId: 199167953
---
 tensorflow/contrib/autograph/__init__.py      | 16 +++++-
 tensorflow/contrib/autograph/impl/BUILD       | 11 ++++
 .../autograph/impl/special_functions.py       | 48 ++++++++++++++++++
 .../autograph/impl/special_functions_test.py  | 50 +++++++++++++++++++
 4 files changed, 123 insertions(+), 2 deletions(-)
 create mode 100644 tensorflow/contrib/autograph/impl/special_functions.py
 create mode 100644 tensorflow/contrib/autograph/impl/special_functions_test.py

diff --git a/tensorflow/contrib/autograph/__init__.py b/tensorflow/contrib/autograph/__init__.py
index 3386c4eca4..310eb34a70 100644
--- a/tensorflow/contrib/autograph/__init__.py
+++ b/tensorflow/contrib/autograph/__init__.py
@@ -29,12 +29,24 @@ from tensorflow.contrib.autograph.impl.api import do_not_convert
 from tensorflow.contrib.autograph.impl.api import RunMode
 from tensorflow.contrib.autograph.impl.api import to_code
 from tensorflow.contrib.autograph.impl.api import to_graph
+from tensorflow.contrib.autograph.impl.special_functions import stack
 from tensorflow.contrib.autograph.pyct.transformer import AutographParseError
 from tensorflow.python.util.all_util import remove_undocumented
 
 _allowed_symbols = [
-    'utils', 'convert', 'converted_call', 'do_not_convert', 'RunMode',
-    'to_code', 'to_graph', 'AutographParseError'
+    # Main API
+    'RunMode',
+    'convert',
+    'converted_call',
+    'do_not_convert',
+    'to_code',
+    'to_graph',
+    # Special functions
+    'stack',
+    # Exceptions
+    'AutographParseError',
+    # Utilities: to be removed
+    'utils',
 ]
 
 remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/autograph/impl/BUILD b/tensorflow/contrib/autograph/impl/BUILD
index 54424e2647..91ae0b9b82 100644
--- a/tensorflow/contrib/autograph/impl/BUILD
+++ b/tensorflow/contrib/autograph/impl/BUILD
@@ -21,6 +21,7 @@ py_library(
         "config.py",
         "conversion.py",
         "naming.py",
+        "special_functions.py",
     ],
     srcs_version = "PY2AND3",
     visibility = ["//tensorflow:__subpackages__"],
@@ -69,3 +70,13 @@ py_test(
         "//tensorflow/python:client_testlib",
     ],
 )
+
+py_test(
+    name = "special_functions_test",
+    srcs = ["special_functions_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":impl",
+        "//tensorflow/python:client_testlib",
+    ],
+)
diff --git a/tensorflow/contrib/autograph/impl/special_functions.py b/tensorflow/contrib/autograph/impl/special_functions.py
new file mode 100644
index 0000000000..b7a8177c44
--- /dev/null
+++ b/tensorflow/contrib/autograph/impl/special_functions.py
@@ -0,0 +1,48 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Special functions that only make sense for AutoGraph.
+
+These functions are meant to ensure feature parity between Python and AutoGraph,
+so that the exact same code works in both modes. In general, AutoGraph will
+replace these calls.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.autograph.operators import data_structures
+
+
+def stack(list_or_tensor, element_dtype=None):
+  """Stacks the input, if it admits the notion of stacking. No-op otherwise.
+
+  For example, a list of tensors can be stacked into a larger tensor. This
+  function is similar to tf.stack, but it accepts non-lists and lists of
+  non-tensors as arguments. In the latter case, the function does nothing.
+
+  Args:
+    list_or_tensor: Any entity.
+    element_dtype: Optional dtype for the elements in the list. Required if the
+        input is stackable, and the list is untyped.
+
+  Returns:
+    If the input is stackable, a new object representing the stacked inputs.
+    Otherwise it returns list_or_tensor unchanged.
+  """
+  return data_structures.list_stack(
+      list_or_tensor,
+      data_structures.ListStackOpts(
+          element_dtype=element_dtype, original_call=lambda x: x))
diff --git a/tensorflow/contrib/autograph/impl/special_functions_test.py b/tensorflow/contrib/autograph/impl/special_functions_test.py
new file mode 100644
index 0000000000..9b52d2a59b
--- /dev/null
+++ b/tensorflow/contrib/autograph/impl/special_functions_test.py
@@ -0,0 +1,50 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for special_functions module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.autograph.impl import special_functions
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import list_ops
+from tensorflow.python.platform import test
+
+
+class SpecialFunctionsTest(test.TestCase):
+
+  def test_basic(self):
+    self.assertEqual(special_functions.stack(1), 1)
+    self.assertListEqual(special_functions.stack([1, 2, 3]), [1, 2, 3])
+    # TODO(mdan): This should probably forward to tf.stack.
+    self.assertTrue(
+        isinstance(
+            special_functions.stack(
+                [constant_op.constant(1),
+                 constant_op.constant(2)]), list))
+
+    t = constant_op.constant([1.0, 2.0])
+    l = list_ops.tensor_list_from_tensor(
+        t, element_shape=constant_op.constant([], dtype=dtypes.int32))
+    self.assertTrue(
+        tensor_util.is_tensor(
+            special_functions.stack(l, element_dtype=dtypes.float32)))
+
+
+if __name__ == '__main__':
+  test.main()
-- 
GitLab


From 008fc03ab6ec74a3b9acca1b182e243c55da0956 Mon Sep 17 00:00:00 2001
From: Sanjoy Das <sanjoy@google.com>
Date: Mon, 4 Jun 2018 11:47:29 -0700
Subject: [PATCH 263/610] [TF:XLA] Bump open source llvm revision to r333878

PiperOrigin-RevId: 199168290
---
 tensorflow/workspace.bzl | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index c072f89965..e66af3c8bc 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -452,11 +452,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   tf_http_archive(
       name = "llvm",
       urls = [
-          "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/48c1879dcedb834e95a95da8715b30897a49edbe.tar.gz",
-          "https://github.com/llvm-mirror/llvm/archive/48c1879dcedb834e95a95da8715b30897a49edbe.tar.gz",
+          "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/40c66c3d40377cf85640b3a35e6ec5c5b1cbc41f.tar.gz",
+          "https://github.com/llvm-mirror/llvm/archive/40c66c3d40377cf85640b3a35e6ec5c5b1cbc41f.tar.gz",
       ],
-      sha256 = "0e0767199c169f738718461d05d3fdada80b533a6e8e2e07c9ae852356be3c0a",
-      strip_prefix = "llvm-48c1879dcedb834e95a95da8715b30897a49edbe",
+      sha256 = "6f782a0d2e9d7946bdf20807e0fcd8f5eaed8afd93bdd610cdefbe9435ca551f",
+      strip_prefix = "llvm-40c66c3d40377cf85640b3a35e6ec5c5b1cbc41f",
       build_file = clean_dep("//third_party/llvm:llvm.BUILD"),
   )
 
-- 
GitLab


From 836fc096c77a3b1170b91242e30b6075e7805cec Mon Sep 17 00:00:00 2001
From: Akshay Modi <nareshmodi@google.com>
Date: Mon, 4 Jun 2018 12:05:14 -0700
Subject: [PATCH 264/610] Fix test user ops

PiperOrigin-RevId: 199171316
---
 tensorflow/tools/ci_build/builds/test_user_ops.sh | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tensorflow/tools/ci_build/builds/test_user_ops.sh b/tensorflow/tools/ci_build/builds/test_user_ops.sh
index c342367bac..25ecee4725 100755
--- a/tensorflow/tools/ci_build/builds/test_user_ops.sh
+++ b/tensorflow/tools/ci_build/builds/test_user_ops.sh
@@ -239,8 +239,9 @@ function run_op() {
   fi
 }
 
-run_op $("${PYTHON_BIN_PATH}" -c "import tensorflow as tf; print(tf.Session('').run(tf.load_op_library('./${USER_OP_SO}').${USER_OP}(${OP_INPUT})))")
-run_op $("${PYTHON_BIN_PATH}" -c "import tensorflow as tf; tf.enable_eager_execution(); print(tf.load_op_library('./${USER_OP_SO}').${USER_OP}(${OP_INPUT}))") " in eager mode"
+run_op "$("${PYTHON_BIN_PATH}" -c "import tensorflow as tf; print(tf.Session('').run(tf.load_op_library('./${USER_OP_SO}').${USER_OP}(${OP_INPUT})))")"
+run_op "$("${PYTHON_BIN_PATH}" -c "import tensorflow as tf; tf.enable_eager_execution(); print(tf.load_op_library('./${USER_OP_SO}').${USER_OP}(${OP_INPUT}).numpy())")" " in eager mode"
+
 
 popd
 
-- 
GitLab


From d16877ce0372df0c1ff5b8046fbe8985cfb796f9 Mon Sep 17 00:00:00 2001
From: Nupur Garg <nupurgarg@google.com>
Date: Mon, 4 Jun 2018 12:08:15 -0700
Subject: [PATCH 265/610] Fix Python API.

PiperOrigin-RevId: 199171845
---
 tensorflow/contrib/lite/python/convert_saved_model.py    | 4 ++--
 .../contrib/lite/python/convert_saved_model_test.py      | 9 +++++++--
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/tensorflow/contrib/lite/python/convert_saved_model.py b/tensorflow/contrib/lite/python/convert_saved_model.py
index b952a72aab..5dad49f1ed 100644
--- a/tensorflow/contrib/lite/python/convert_saved_model.py
+++ b/tensorflow/contrib/lite/python/convert_saved_model.py
@@ -216,9 +216,9 @@ def set_tensor_shapes(tensors, shapes):
   """
   if shapes:
     for tensor in tensors:
-      shape = shapes.get(tensor.name)
+      shape = shapes.get(tensor_name(tensor))
       if shape is not None:
-        tensor.set_shape(shapes[tensor.name])
+        tensor.set_shape(shape)
 
 
 def freeze_saved_model(saved_model_dir, input_arrays, input_shapes,
diff --git a/tensorflow/contrib/lite/python/convert_saved_model_test.py b/tensorflow/contrib/lite/python/convert_saved_model_test.py
index 80e5dc6e46..1e570d2c89 100644
--- a/tensorflow/contrib/lite/python/convert_saved_model_test.py
+++ b/tensorflow/contrib/lite/python/convert_saved_model_test.py
@@ -73,10 +73,15 @@ class TensorFunctionsTest(test_util.TensorFlowTestCase):
     tensor = array_ops.placeholder(shape=[None, 3, 5], dtype=dtypes.float32)
     self.assertEqual([None, 3, 5], tensor.shape.as_list())
 
-    convert_saved_model.set_tensor_shapes([tensor],
-                                          {"Placeholder:0": [5, 3, 5]})
+    convert_saved_model.set_tensor_shapes([tensor], {"Placeholder": [5, 3, 5]})
     self.assertEqual([5, 3, 5], tensor.shape.as_list())
 
+  def testSetTensorShapeNoneValid(self):
+    tensor = array_ops.placeholder(dtype=dtypes.float32)
+
+    convert_saved_model.set_tensor_shapes([tensor], {"Placeholder": [1, 3, 5]})
+    self.assertEqual([1, 3, 5], tensor.shape.as_list())
+
   def testSetTensorShapeInvalid(self):
     tensor = array_ops.placeholder(shape=[None, 3, 5], dtype=dtypes.float32)
     self.assertEqual([None, 3, 5], tensor.shape.as_list())
-- 
GitLab


From d88e8719833b409042c03d20a9a4acaac1d1f531 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 4 Jun 2018 12:15:47 -0700
Subject: [PATCH 266/610] added clearer description for invalid behavior when
 executing in eager mode.

PiperOrigin-RevId: 199173022
---
 tensorflow/python/keras/engine/input_layer.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/tensorflow/python/keras/engine/input_layer.py b/tensorflow/python/keras/engine/input_layer.py
index b04dc3c60b..7996110829 100644
--- a/tensorflow/python/keras/engine/input_layer.py
+++ b/tensorflow/python/keras/engine/input_layer.py
@@ -119,6 +119,12 @@ class InputLayer(base_layer.Layer):
       self.is_placeholder = False
       self._batch_input_shape = tuple(input_tensor.get_shape().as_list())
 
+      if context.executing_eagerly():
+        raise ValueError('You should not pass an input tensor when executing '
+                         'in eager mode. For example, instead of creating an '
+                         'InputLayer, you should instantiate your model and '
+                         'directly call it on your input.')
+
     # Create an input node to add to self.outbound_node
     # and set output_tensors' _keras_history.
     input_tensor._keras_history = (self, 0, 0)  # pylint: disable=protected-access
-- 
GitLab


From 48acc50c8d5ddf641e5fe0f8f3b27c9085854edd Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 4 Jun 2018 12:42:39 -0700
Subject: [PATCH 267/610] Turns on optimization to convert division of sqrt to
 multiplication of rsqrt

PiperOrigin-RevId: 199177029
---
 tensorflow/core/grappler/optimizers/arithmetic_optimizer.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
index ce3c633baf..e6fc311929 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
@@ -59,7 +59,7 @@ class ArithmeticOptimizer : public GraphOptimizer {
     bool enable_try_simplify_and_replace = true;
 
     bool combine_add_to_addn = true;
-    bool convert_sqrt_div_to_rsqrt_mul = false;
+    bool convert_sqrt_div_to_rsqrt_mul = true;
     bool dedup_computations = true;
     bool fold_multiply_into_conv = true;
     bool hoist_common_factor_out_of_aggregation = true;
-- 
GitLab


From 8c7a504699f35fb5252640d7319fe516ff0a19a0 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 4 Jun 2018 12:57:33 -0700
Subject: [PATCH 268/610] Fix a couple of doc typos.

PiperOrigin-RevId: 199179067
---
 .../api_def/base_api/api_def_QuantizeAndDequantizeV2.pbtxt   | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tensorflow/core/api_def/base_api/api_def_QuantizeAndDequantizeV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_QuantizeAndDequantizeV2.pbtxt
index 41a9cfaa27..9b500d0b58 100644
--- a/tensorflow/core/api_def/base_api/api_def_QuantizeAndDequantizeV2.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_QuantizeAndDequantizeV2.pbtxt
@@ -44,6 +44,7 @@ END
   summary: "Quantizes then dequantizes a tensor."
   description: <<END
 This op simulates the precision loss from the quantized forward pass by:
+
 1. Quantizing the tensor to fixed point numbers, which should match the target
    quantization method when it is used in inference.
 2. Dequantizing it back to floating point numbers for the following ops, most
@@ -85,9 +86,9 @@ e.g.
     10.0]: it would use a scale_factor of 127 / 10.0 = 12.7 In this case, it
     would update input_min to be 128.0 / 12.7 = -10.07874
 *   if the output is unsigned, input_min is forced to be 0, and only the
-    specifide input_max is used.
+    specified input_max is used.
 
-After determining the scale_factor and updating the input tange, it applies the
+After determining the scale_factor and updating the input range, it applies the
 following to each value in the 'input' tensor.
 
 output = round(clamp(value, input_min, input_max) * scale_factor) / scale_factor.
-- 
GitLab


From d1c2dbd99c046b6258fd9a8637df8abf1101122f Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 4 Jun 2018 13:01:31 -0700
Subject: [PATCH 269/610] Fix broken distributed_runtime/remote_device_test by
 adding missing std::shared_ptr.

PiperOrigin-RevId: 199179607
---
 tensorflow/core/distributed_runtime/remote_device_test.cc | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tensorflow/core/distributed_runtime/remote_device_test.cc b/tensorflow/core/distributed_runtime/remote_device_test.cc
index 778060daaf..a04e79328b 100644
--- a/tensorflow/core/distributed_runtime/remote_device_test.cc
+++ b/tensorflow/core/distributed_runtime/remote_device_test.cc
@@ -49,8 +49,9 @@ class RemoteDeviceTest : public ::testing::Test {
     TF_CHECK_OK(spec.AddHostPortsJob("localhost", {hostport}));
     ChannelCreationFunction channel_func =
         ConvertToChannelCreationFunction(NewHostPortGrpcChannel);
-    worker_cache_.reset(
-        NewGrpcWorkerCache(NewGrpcChannelCache(spec, channel_func)));
+    std::shared_ptr<GrpcChannelCache> channel_cache(
+        NewGrpcChannelCache(spec, channel_func));
+    worker_cache_.reset(NewGrpcWorkerCache(channel_cache));
     remote_name_ = "/job:localhost/replica:0/task:0";
     wi_ = worker_cache_->CreateWorker(remote_name_);
   }
-- 
GitLab


From 06a7049f29b0148659693ec53db530c2c895a6a6 Mon Sep 17 00:00:00 2001
From: Amit Patankar <amitpatankar@google.com>
Date: Mon, 4 Jun 2018 13:23:40 -0700
Subject: [PATCH 270/610] I've made the updates Rajat requested. Please note
 the links will not work until after we have launched.

---
 RELEASE.md | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/RELEASE.md b/RELEASE.md
index 600294478d..c1ed69bd45 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -4,8 +4,10 @@
 * Update tf.keras to the Keras 2.1.6 API.
 * `tfe.Network` is deprecated. Please inherit from `tf.keras.Model`.
 * Adding support of core feature columns and losses to gradient boosted trees estimators.
-* The Bijector API now requires 'event_ndims' passed in to the `log_det_jacobian` methods, while `event_ndims` is removed from the base class and replaced with `forward_min_event_ndims`. The signature is now `log_det_jacobian(x, event_ndims)`. The main rationale for this change is that it allows Bijectors to broadcast.
-RELNOTES: If you were using layers from `tf.keras.layers` in conjunction with custom variable scopes, your layer variable names might have changed. If you were using layers from `tf.layers` in a subclassed `tf.keras.Model` class, then your variable names have changed (you can restore the prior names by importing the same layers from `tf.keras.layers` instead of `tf.layers`).
+* The distributions.Bijector API supports broadcasting for Bijectors with new API changes. See [here](https://www.tensorflow.org/versions/r1.9/api_docs/python/tf/distributions/bijectors/Bijector) for more details.
+* Layer variable names have changed in the following conditions:
+  * Using `tf.keras.layers` with custom variable scopes.
+  * Using `tf.layers` in a subclassed `tf.keras.Model` class. See [here](https://www.tensorflow.org/versions/r1.9/api_docs/python/tf/layers) for more details.
 
 ## Breaking Chances
   * If you're opening empty variable scopes; replace `variable_scope`('', ...) by `variable_scope`(`tf.get_variable_scope()`, ...).
-- 
GitLab


From 279b899642c22734a5bd3b375a2fa9f84aa4738c Mon Sep 17 00:00:00 2001
From: Nupur Garg <nupurgarg@google.com>
Date: Mon, 4 Jun 2018 13:42:17 -0700
Subject: [PATCH 271/610] Improve TOCO error handling.

PiperOrigin-RevId: 199186109
---
 .../lite/python/convert_saved_model_test.py    |  1 +
 tensorflow/contrib/lite/python/lite.py         |  6 +++++-
 tensorflow/contrib/lite/python/lite_test.py    | 18 ++++++++++++++----
 3 files changed, 20 insertions(+), 5 deletions(-)

diff --git a/tensorflow/contrib/lite/python/convert_saved_model_test.py b/tensorflow/contrib/lite/python/convert_saved_model_test.py
index 1e570d2c89..92c4ebb246 100644
--- a/tensorflow/contrib/lite/python/convert_saved_model_test.py
+++ b/tensorflow/contrib/lite/python/convert_saved_model_test.py
@@ -78,6 +78,7 @@ class TensorFunctionsTest(test_util.TensorFlowTestCase):
 
   def testSetTensorShapeNoneValid(self):
     tensor = array_ops.placeholder(dtype=dtypes.float32)
+    self.assertEqual(None, tensor.shape)
 
     convert_saved_model.set_tensor_shapes([tensor], {"Placeholder": [1, 3, 5]})
     self.assertEqual([1, 3, 5], tensor.shape.as_list())
diff --git a/tensorflow/contrib/lite/python/lite.py b/tensorflow/contrib/lite/python/lite.py
index 253b5eadf3..2cb06e2559 100644
--- a/tensorflow/contrib/lite/python/lite.py
+++ b/tensorflow/contrib/lite/python/lite.py
@@ -254,15 +254,19 @@ class TocoConverter(object):
 
     Raises:
       ValueError:
+        Input shape is not specified.
         None value for dimension in input_tensor.
     """
     # Checks dimensions in input tensor.
     for tensor in self._input_tensors:
+      if not tensor.get_shape():
+        raise ValueError("Provide an input shape for input array '{0}'.".format(
+            tensor_name(tensor)))
       shape = tensor.get_shape().as_list()
       if None in shape[1:]:
         raise ValueError(
             "None is only supported in the 1st dimension. Tensor '{0}' has "
-            "invalid shape '{1}'.".format(tensor.name, shape))
+            "invalid shape '{1}'.".format(tensor_name(tensor), shape))
       elif shape[0] is None:
         self._set_batch_size(batch_size=1)
 
diff --git a/tensorflow/contrib/lite/python/lite_test.py b/tensorflow/contrib/lite/python/lite_test.py
index 53d1878293..5f8dfc0dc1 100644
--- a/tensorflow/contrib/lite/python/lite_test.py
+++ b/tensorflow/contrib/lite/python/lite_test.py
@@ -131,21 +131,31 @@ class FromSessionTest(test_util.TensorFlowTestCase):
         'Quantization input stats are not available for input tensors '
         '\'inputB\'.', str(error.exception))
 
-  def testBatchSizeInvalid(self):
-    in_tensor = array_ops.placeholder(
-        shape=[None, 16, 16, 3], dtype=dtypes.float32)
+  def testSizeNoneInvalid(self):
+    in_tensor = array_ops.placeholder(dtype=dtypes.float32)
     out_tensor = in_tensor + in_tensor
     sess = session.Session()
 
     # Test invalid shape. None after 1st dimension.
+    converter = lite.TocoConverter.from_session(sess, [in_tensor], [out_tensor])
+    with self.assertRaises(ValueError) as error:
+      converter.convert()
+    self.assertEqual('Provide an input shape for input array \'Placeholder\'.',
+                     str(error.exception))
+
+  def testBatchSizeInvalid(self):
     in_tensor = array_ops.placeholder(
         shape=[1, None, 16, 3], dtype=dtypes.float32)
+    out_tensor = in_tensor + in_tensor
+    sess = session.Session()
+
+    # Test invalid shape. None after 1st dimension.
     converter = lite.TocoConverter.from_session(sess, [in_tensor], [out_tensor])
     with self.assertRaises(ValueError) as error:
       converter.convert()
     self.assertEqual(
         'None is only supported in the 1st dimension. Tensor '
-        '\'Placeholder_1:0\' has invalid shape \'[1, None, 16, 3]\'.',
+        '\'Placeholder\' has invalid shape \'[1, None, 16, 3]\'.',
         str(error.exception))
 
   def testBatchSizeValid(self):
-- 
GitLab


From 204fcd9a002aa8678c42d076553e38d69e8724a6 Mon Sep 17 00:00:00 2001
From: Blake Hechtman <blakehechtman@google.com>
Date: Mon, 4 Jun 2018 14:20:46 -0700
Subject: [PATCH 272/610] [XLA:GPU] Propagate layouts in a better order for
 performance and fusion.

PiperOrigin-RevId: 199193181
---
 .../compiler/xla/service/gpu/gpu_layout_assignment.cc     | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc
index 178457721a..8bf62dde8b 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc
@@ -159,7 +159,13 @@ Status GpuLayoutAssignment::AddBackendConstraintsToDnnConvCustomCall(
 
 Status GpuLayoutAssignment::AddBackendConstraints(
     LayoutConstraints* constraints) {
-  for (auto* instruction : constraints->computation()->instructions()) {
+  // Add convolution constraints in reverse postorder so that the earliest
+  // convolution layout propagates first. This reduces the likelihood of fusion
+  // nodes with copies.
+  auto post_order = constraints->computation()->MakeInstructionPostOrder();
+  for (auto iterator = post_order.rbegin(); iterator != post_order.rend();
+       ++iterator) {
+    HloInstruction* instruction = *iterator;
     if (IsCustomCallToDnnConvolution(*instruction)) {
       TF_RETURN_IF_ERROR(
           AddBackendConstraintsToDnnConvCustomCall(instruction, constraints));
-- 
GitLab


From 3c87b99d8c8052c3b6d67190bca14ea89137221a Mon Sep 17 00:00:00 2001
From: Amit Patankar <amitpatankar@google.com>
Date: Mon, 4 Jun 2018 14:26:09 -0700
Subject: [PATCH 273/610] Remove --distinct_host_configuration=false from
 tools/bazel.rc

Don't use --distinct_host_configuration=false by default, because it would break cross-compiling builds, such as the Android and Raspberry Pi builds.

Instead, we add it only for builds that are known to have the same host and target platforms.

PiperOrigin-RevId: 199194260
---
 tensorflow/tools/ci_build/pi/build_raspberry_pi.sh          | 1 -
 .../tools/ci_build/windows/cpu/pip/build_tf_windows.sh      | 4 ++++
 tools/bazel.rc                                              | 6 ------
 3 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh b/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh
index 30ea8539aa..1bd1852ffc 100755
--- a/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh
+++ b/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh
@@ -100,7 +100,6 @@ bazel build -c opt ${PI_COPTS} \
   --copt=-fomit-frame-pointer --cpu=armeabi \
   --crosstool_top=@local_config_arm_compiler//:toolchain \
   --verbose_failures \
-  --distinct_host_configuration=true \
   //tensorflow/tools/benchmark:benchmark_model \
   //tensorflow/tools/pip_package:build_pip_package
 
diff --git a/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh b/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
index 1b1c3815d8..0b13b97209 100644
--- a/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
+++ b/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
@@ -73,6 +73,10 @@ if [[ "$release_build" != 1 ]]; then
   echo "build --define=override_eigen_strong_inline=true" >> "${TMP_BAZELRC}"
 fi
 
+# The host and target platforms are the same in the Windows build, so we don't
+# need to distinguish them. This avoids building the same targets twice.
+echo "build --distinct_host_configuration=false" >> "${TMP_BAZELRC}"
+
 echo "import %workspace%/${TMP_BAZELRC}" >> .bazelrc
 
 run_configure_for_cpu_build
diff --git a/tools/bazel.rc b/tools/bazel.rc
index 03aa52da1f..1c1e6afb65 100644
--- a/tools/bazel.rc
+++ b/tools/bazel.rc
@@ -1,14 +1,8 @@
-# By default, we don't distinct target and host platfroms.
-# When doing cross compilation, use --config=cross_compile to distinct them.
-build --distinct_host_configuration=false
-build:cross_compile --distinct_host_configuration=true
-
 # Android configs. Bazel needs to have --cpu and --fat_apk_cpu both set to the
 # target CPU to build transient dependencies correctly. See
 # https://docs.bazel.build/versions/master/user-manual.html#flag--fat_apk_cpu
 build:android --crosstool_top=//external:android/crosstool
 build:android --host_crosstool_top=@bazel_tools//tools/cpp:toolchain
-build:android --config=cross_compile
 build:android_arm --config=android
 build:android_arm --cpu=armeabi-v7a
 build:android_arm --fat_apk_cpu=armeabi-v7a
-- 
GitLab


From 6b2a088fb263af2428ca672a62088646a7f54219 Mon Sep 17 00:00:00 2001
From: Francois Chollet <fchollet@google.com>
Date: Mon, 4 Jun 2018 14:46:38 -0700
Subject: [PATCH 274/610] Add various missing aliases for symbols in tf.keras
 submodules.

PiperOrigin-RevId: 199198086
---
 tensorflow/python/keras/losses.py             | 35 ++++++++++++---
 tensorflow/python/ops/init_ops.py             | 21 +++++----
 ...nsorflow.keras.initializers.constant.pbtxt | 18 ++++++++
 ...nsorflow.keras.initializers.identity.pbtxt | 18 ++++++++
 ...tensorflow.keras.initializers.normal.pbtxt | 18 ++++++++
 .../tensorflow.keras.initializers.ones.pbtxt  | 18 ++++++++
 ...orflow.keras.initializers.orthogonal.pbtxt | 18 ++++++++
 .../tensorflow.keras.initializers.pbtxt       | 40 +++++++++++++++++
 ...low.keras.initializers.random_normal.pbtxt | 18 ++++++++
 ...ow.keras.initializers.random_uniform.pbtxt | 18 ++++++++
 ....keras.initializers.truncated_normal.pbtxt | 18 ++++++++
 ...ensorflow.keras.initializers.uniform.pbtxt | 18 ++++++++
 .../tensorflow.keras.initializers.zeros.pbtxt | 18 ++++++++
 .../api/golden/tensorflow.keras.losses.pbtxt  | 44 +++++++++++++++++++
 .../api/golden/tensorflow.keras.metrics.pbtxt | 44 +++++++++++++++++++
 15 files changed, 350 insertions(+), 14 deletions(-)
 create mode 100644 tensorflow/tools/api/golden/tensorflow.keras.initializers.constant.pbtxt
 create mode 100644 tensorflow/tools/api/golden/tensorflow.keras.initializers.identity.pbtxt
 create mode 100644 tensorflow/tools/api/golden/tensorflow.keras.initializers.normal.pbtxt
 create mode 100644 tensorflow/tools/api/golden/tensorflow.keras.initializers.ones.pbtxt
 create mode 100644 tensorflow/tools/api/golden/tensorflow.keras.initializers.orthogonal.pbtxt
 create mode 100644 tensorflow/tools/api/golden/tensorflow.keras.initializers.random_normal.pbtxt
 create mode 100644 tensorflow/tools/api/golden/tensorflow.keras.initializers.random_uniform.pbtxt
 create mode 100644 tensorflow/tools/api/golden/tensorflow.keras.initializers.truncated_normal.pbtxt
 create mode 100644 tensorflow/tools/api/golden/tensorflow.keras.initializers.uniform.pbtxt
 create mode 100644 tensorflow/tools/api/golden/tensorflow.keras.initializers.zeros.pbtxt

diff --git a/tensorflow/python/keras/losses.py b/tensorflow/python/keras/losses.py
index d82ebd9c31..9f548bfe04 100644
--- a/tensorflow/python/keras/losses.py
+++ b/tensorflow/python/keras/losses.py
@@ -30,19 +30,31 @@ from tensorflow.python.util.tf_export import tf_export
 
 
 @tf_export('keras.metrics.mean_squared_error',
-           'keras.losses.mean_squared_error')
+           'keras.metrics.mse',
+           'keras.metrics.MSE',
+           'keras.losses.mean_squared_error',
+           'keras.losses.mse',
+           'keras.losses.MSE')
 def mean_squared_error(y_true, y_pred):
   return K.mean(math_ops.square(y_pred - y_true), axis=-1)
 
 
 @tf_export('keras.metrics.mean_absolute_error',
-           'keras.losses.mean_absolute_error')
+           'keras.metrics.mae',
+           'keras.metrics.MAE',
+           'keras.losses.mean_absolute_error',
+           'keras.losses.mae',
+           'keras.losses.MAE')
 def mean_absolute_error(y_true, y_pred):
   return K.mean(math_ops.abs(y_pred - y_true), axis=-1)
 
 
 @tf_export('keras.metrics.mean_absolute_percentage_error',
-           'keras.losses.mean_absolute_percentage_error')
+           'keras.metrics.mape',
+           'keras.metrics.MAPE',
+           'keras.losses.mean_absolute_percentage_error',
+           'keras.losses.mape',
+           'keras.losses.MAPE')
 def mean_absolute_percentage_error(y_true, y_pred):
   diff = math_ops.abs(
       (y_true - y_pred) / K.clip(math_ops.abs(y_true), K.epsilon(), None))
@@ -50,7 +62,11 @@ def mean_absolute_percentage_error(y_true, y_pred):
 
 
 @tf_export('keras.metrics.mean_squared_logarithmic_error',
-           'keras.losses.mean_squared_logarithmic_error')
+           'keras.metrics.msle',
+           'keras.metrics.MSLE',
+           'keras.losses.mean_squared_logarithmic_error',
+           'keras.losses.msle',
+           'keras.losses.MSLE')
 def mean_squared_logarithmic_error(y_true, y_pred):
   first_log = math_ops.log(K.clip(y_pred, K.epsilon(), None) + 1.)
   second_log = math_ops.log(K.clip(y_true, K.epsilon(), None) + 1.)
@@ -117,7 +133,11 @@ def binary_crossentropy(y_true, y_pred):
 
 
 @tf_export('keras.metrics.kullback_leibler_divergence',
-           'keras.losses.kullback_leibler_divergence')
+           'keras.metrics.kld',
+           'keras.metrics.KLD',
+           'keras.losses.kullback_leibler_divergence',
+           'keras.losses.kld',
+           'keras.losses.KLD')
 def kullback_leibler_divergence(y_true, y_pred):
   y_true = K.clip(y_true, K.epsilon(), 1)
   y_pred = K.clip(y_pred, K.epsilon(), 1)
@@ -129,7 +149,10 @@ def poisson(y_true, y_pred):
   return K.mean(y_pred - y_true * math_ops.log(y_pred + K.epsilon()), axis=-1)
 
 
-@tf_export('keras.metrics.cosine_proximity', 'keras.losses.cosine_proximity')
+@tf_export('keras.metrics.cosine_proximity',
+           'keras.metrics.cosine',
+           'keras.losses.cosine_proximity',
+           'keras.losses.cosine')
 def cosine_proximity(y_true, y_pred):
   y_true = nn.l2_normalize(y_true, axis=-1)
   y_pred = nn.l2_normalize(y_pred, axis=-1)
diff --git a/tensorflow/python/ops/init_ops.py b/tensorflow/python/ops/init_ops.py
index 1f8d8dc4f3..2df230d470 100644
--- a/tensorflow/python/ops/init_ops.py
+++ b/tensorflow/python/ops/init_ops.py
@@ -86,7 +86,7 @@ class Initializer(object):
 
 
 @tf_export("keras.initializers.Zeros", "initializers.zeros",
-           "zeros_initializer")
+           "zeros_initializer", "keras.initializers.zeros")
 class Zeros(Initializer):
   """Initializer that generates tensors initialized to 0."""
 
@@ -102,7 +102,8 @@ class Zeros(Initializer):
     return {"dtype": self.dtype.name}
 
 
-@tf_export("keras.initializers.Ones", "initializers.ones", "ones_initializer")
+@tf_export("keras.initializers.Ones", "initializers.ones", "ones_initializer",
+           "keras.initializers.ones")
 class Ones(Initializer):
   """Initializer that generates tensors initialized to 1."""
 
@@ -119,7 +120,7 @@ class Ones(Initializer):
 
 
 @tf_export("keras.initializers.Constant", "initializers.constant",
-           "constant_initializer")
+           "constant_initializer", "keras.initializers.constant")
 class Constant(Initializer):
   """Initializer that generates tensors with constant values.
 
@@ -225,7 +226,8 @@ class Constant(Initializer):
 
 
 @tf_export("keras.initializers.RandomUniform", "initializers.random_uniform",
-           "random_uniform_initializer")
+           "random_uniform_initializer", "keras.initializers.uniform",
+           "keras.initializers.random_uniform")
 class RandomUniform(Initializer):
   """Initializer that generates tensors with a uniform distribution.
 
@@ -262,7 +264,8 @@ class RandomUniform(Initializer):
 
 
 @tf_export("keras.initializers.RandomNormal", "initializers.random_normal",
-           "random_normal_initializer")
+           "random_normal_initializer", "keras.initializers.normal",
+           "keras.initializers.random_normal")
 class RandomNormal(Initializer):
   """Initializer that generates tensors with a normal distribution.
 
@@ -299,7 +302,8 @@ class RandomNormal(Initializer):
 
 
 @tf_export("keras.initializers.TruncatedNormal",
-           "initializers.truncated_normal", "truncated_normal_initializer")
+           "initializers.truncated_normal", "truncated_normal_initializer",
+           "keras.initializers.truncated_normal")
 class TruncatedNormal(Initializer):
   """Initializer that generates a truncated normal distribution.
 
@@ -482,7 +486,7 @@ class VarianceScaling(Initializer):
 
 
 @tf_export("keras.initializers.Orthogonal", "initializers.orthogonal",
-           "orthogonal_initializer")
+           "orthogonal_initializer", "keras.initializers.orthogonal")
 class Orthogonal(Initializer):
   """Initializer that generates an orthogonal matrix.
 
@@ -1062,7 +1066,8 @@ class ConvolutionOrthogonal3D(ConvolutionOrthogonal):
     return self._dict_to_tensor(p, ksize, ksize, ksize)
 
 
-@tf_export("keras.initializers.Identity", "initializers.identity")
+@tf_export("keras.initializers.Identity", "initializers.identity",
+           "keras.initializers.identity")
 class Identity(Initializer):
   """Initializer that generates the identity matrix.
 
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.initializers.constant.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.initializers.constant.pbtxt
new file mode 100644
index 0000000000..bddc37b907
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.keras.initializers.constant.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.keras.initializers.constant"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.init_ops.Constant\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'value\', \'dtype\', \'verify_shape\'], varargs=None, keywords=None, defaults=[\'0\', \"<dtype: \'float32\'>\", \'False\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.initializers.identity.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.initializers.identity.pbtxt
new file mode 100644
index 0000000000..a4c5a61490
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.keras.initializers.identity.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.keras.initializers.identity"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.init_ops.Identity\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'gain\', \'dtype\'], varargs=None, keywords=None, defaults=[\'1.0\', \"<dtype: \'float32\'>\"], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.initializers.normal.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.initializers.normal.pbtxt
new file mode 100644
index 0000000000..7485772784
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.keras.initializers.normal.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.keras.initializers.normal"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.init_ops.RandomNormal\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'mean\', \'stddev\', \'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'0.0\', \'1.0\', \'None\', \"<dtype: \'float32\'>\"], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.initializers.ones.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.initializers.ones.pbtxt
new file mode 100644
index 0000000000..a89f78d1e1
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.keras.initializers.ones.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.keras.initializers.ones"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.init_ops.Ones\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'dtype\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\"], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.initializers.orthogonal.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.initializers.orthogonal.pbtxt
new file mode 100644
index 0000000000..ee1e9bbae2
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.keras.initializers.orthogonal.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.keras.initializers.orthogonal"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.init_ops.Orthogonal\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'gain\', \'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'1.0\', \'None\', \"<dtype: \'float32\'>\"], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.initializers.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.initializers.pbtxt
index 093c56595b..14a667870d 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.initializers.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.initializers.pbtxt
@@ -40,6 +40,46 @@ tf_module {
     name: "Zeros"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "constant"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "identity"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "normal"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "ones"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "orthogonal"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "random_normal"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "random_uniform"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "truncated_normal"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "uniform"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "zeros"
+    mtype: "<type \'type\'>"
+  }
   member_method {
     name: "deserialize"
     argspec: "args=[\'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.initializers.random_normal.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.initializers.random_normal.pbtxt
new file mode 100644
index 0000000000..a6df1e87a3
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.keras.initializers.random_normal.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.keras.initializers.random_normal"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.init_ops.RandomNormal\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'mean\', \'stddev\', \'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'0.0\', \'1.0\', \'None\', \"<dtype: \'float32\'>\"], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.initializers.random_uniform.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.initializers.random_uniform.pbtxt
new file mode 100644
index 0000000000..37a0fa0d55
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.keras.initializers.random_uniform.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.keras.initializers.random_uniform"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.init_ops.RandomUniform\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'minval\', \'maxval\', \'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'0\', \'None\', \'None\', \"<dtype: \'float32\'>\"], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.initializers.truncated_normal.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.initializers.truncated_normal.pbtxt
new file mode 100644
index 0000000000..f97e93f0b7
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.keras.initializers.truncated_normal.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.keras.initializers.truncated_normal"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.init_ops.TruncatedNormal\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'mean\', \'stddev\', \'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'0.0\', \'1.0\', \'None\', \"<dtype: \'float32\'>\"], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.initializers.uniform.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.initializers.uniform.pbtxt
new file mode 100644
index 0000000000..58186b1383
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.keras.initializers.uniform.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.keras.initializers.uniform"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.init_ops.RandomUniform\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'minval\', \'maxval\', \'seed\', \'dtype\'], varargs=None, keywords=None, defaults=[\'0\', \'None\', \'None\', \"<dtype: \'float32\'>\"], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.initializers.zeros.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.initializers.zeros.pbtxt
new file mode 100644
index 0000000000..a262390687
--- /dev/null
+++ b/tensorflow/tools/api/golden/tensorflow.keras.initializers.zeros.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.keras.initializers.zeros"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.init_ops.Zeros\'>"
+  is_instance: "<class \'tensorflow.python.ops.init_ops.Initializer\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'dtype\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\"], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.losses.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.losses.pbtxt
index ae5f6305b7..eca6b91538 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.losses.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.losses.pbtxt
@@ -1,5 +1,25 @@
 path: "tensorflow.keras.losses"
 tf_module {
+  member_method {
+    name: "KLD"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MAE"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MAPE"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MSE"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MSLE"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "binary_crossentropy"
     argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
@@ -12,6 +32,10 @@ tf_module {
     name: "categorical_hinge"
     argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "cosine"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "cosine_proximity"
     argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
@@ -28,6 +52,10 @@ tf_module {
     name: "hinge"
     argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "kld"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "kullback_leibler_divergence"
     argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
@@ -36,6 +64,14 @@ tf_module {
     name: "logcosh"
     argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "mae"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "mape"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "mean_absolute_error"
     argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
@@ -52,6 +88,14 @@ tf_module {
     name: "mean_squared_logarithmic_error"
     argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "mse"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "msle"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "poisson"
     argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/tensorflow.keras.metrics.pbtxt b/tensorflow/tools/api/golden/tensorflow.keras.metrics.pbtxt
index 42729e4237..a97a9b5758 100644
--- a/tensorflow/tools/api/golden/tensorflow.keras.metrics.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.keras.metrics.pbtxt
@@ -1,5 +1,25 @@
 path: "tensorflow.keras.metrics"
 tf_module {
+  member_method {
+    name: "KLD"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MAE"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MAPE"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MSE"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "MSLE"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "binary_accuracy"
     argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
@@ -16,6 +36,10 @@ tf_module {
     name: "categorical_crossentropy"
     argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "cosine"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "cosine_proximity"
     argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
@@ -32,10 +56,22 @@ tf_module {
     name: "hinge"
     argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "kld"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "kullback_leibler_divergence"
     argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "mae"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "mape"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "mean_absolute_error"
     argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
@@ -52,6 +88,14 @@ tf_module {
     name: "mean_squared_logarithmic_error"
     argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "mse"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "msle"
+    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "poisson"
     argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
-- 
GitLab


From 06c4fb61f269e18ca2f4b9a73d1b92e48bd095bf Mon Sep 17 00:00:00 2001
From: Vinu Rajashekhar <vinuraja@google.com>
Date: Mon, 4 Jun 2018 14:48:32 -0700
Subject: [PATCH 275/610] Fixes a cleanup bug in BatchFunction op.

PiperOrigin-RevId: 199198413
---
 .../batching/python/ops/batch_ops_test.py     | 28 +++++++++++++-
 tensorflow/core/kernels/batch_kernels.cc      | 37 +++++++++++--------
 2 files changed, 48 insertions(+), 17 deletions(-)

diff --git a/tensorflow/contrib/batching/python/ops/batch_ops_test.py b/tensorflow/contrib/batching/python/ops/batch_ops_test.py
index 68e8a88ca0..ea8339334f 100644
--- a/tensorflow/contrib/batching/python/ops/batch_ops_test.py
+++ b/tensorflow/contrib/batching/python/ops/batch_ops_test.py
@@ -24,6 +24,7 @@ import time
 from tensorflow.contrib.batching.python.ops import batch_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import function
+from tensorflow.python.framework.errors import InvalidArgumentError
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_batch_ops
 from tensorflow.python.ops import gradients_impl
@@ -208,7 +209,7 @@ class BatchOpsTest(test.TestCase):
       self.assertEqual(main_results[0], [3])
 
   def testBatchFunctionOp(self):
-    """Tests that the batch_func works."""
+    """Tests that the batch_function op works."""
     with self.test_session() as sess:
 
       @function.Defun(dtypes.int32)
@@ -237,7 +238,7 @@ class BatchOpsTest(test.TestCase):
       self.assertEqual(main_results[0], [3])
 
   def testBatchFunctionOpWithCapturedInput(self):
-    """Tests that batch_func with timeout."""
+    """Tests that batch_function op works with captured input."""
     with self.test_session() as sess:
       captured_inp0 = array_ops.placeholder_with_default(2, shape=[])
       captured_inp1 = array_ops.placeholder_with_default(1, shape=[])
@@ -270,6 +271,29 @@ class BatchOpsTest(test.TestCase):
       self.assertEqual(thread_results[0], [2])
       self.assertEqual(main_results[0], [3])
 
+  def testBatchFunctionOpWithInputError(self):
+    """Tests that batch_function op reports an error on too few inputs."""
+    with self.test_session() as sess:
+      inp = array_ops.placeholder(dtype=dtypes.int32, shape=[1])
+
+      @function.Defun(dtypes.int32, dtypes.int32)
+      def computation(in0, in1):
+        return in0 + in1
+
+      result = gen_batch_ops.batch_function(
+          [inp],  # computation actually expects 2 inputs.
+          num_batch_threads=1,
+          max_batch_size=10,
+          batch_timeout_micros=100000,  # 100ms
+          batching_queue="",
+          f=computation,
+          captured_tensors=computation.captured_inputs,
+          Tout=[o.type for o in computation.definition.signature.output_arg])
+
+      with self.assertRaisesRegexp(InvalidArgumentError,
+                                   ".*2 arguments.*but 1.*"):
+        sess.run([result], feed_dict={inp: [2]})
+
   def testBasicUnbatchDecoratedWithReshape(self):
     """Tests that the batch_function decorator works."""
     with self.test_session() as sess:
diff --git a/tensorflow/core/kernels/batch_kernels.cc b/tensorflow/core/kernels/batch_kernels.cc
index c0eef229ce..35ddda0ec0 100644
--- a/tensorflow/core/kernels/batch_kernels.cc
+++ b/tensorflow/core/kernels/batch_kernels.cc
@@ -523,21 +523,28 @@ class BatchResource : public ResourceBase {
     const auto& captured_inputs =
         batch->task(batch->num_tasks() - 1).captured_inputs;
     args.insert(args.end(), captured_inputs.begin(), captured_inputs.end());
-    flib->Run(opts, fhandle_, args, &combined_outputs,
-              [&](const Status& run_status) {
-                if (!run_status.ok()) {
-                  return;
-                }
-                const auto split_status =
-                    SplitOutputTensors(combined_outputs, batch.get());
-                // We do the cleanup here as an optimization, so that it runs in
-                // the underlying TF inter-op threadpool. Running it in the
-                // threadpool, let's the ensuing ops be scheduled faster,
-                // because the executor will add them to the front of the
-                // threadpool's task queue rather than the end.
-                cleanup_fn(split_status);
-                done.Notify();
-              });
+
+    // Releases the cleanup method here, because the callback of the function
+    // library runtime will handle it now.
+    finally.release();
+    flib->Run(
+        opts, fhandle_, args, &combined_outputs, [&](const Status& run_status) {
+          Status final_status;
+          auto run_finally = gtl::MakeCleanup([&]() {
+            // We do the cleanup here as an optimization, so that it runs in
+            // the underlying TF inter-op threadpool. Running it in the
+            // threadpool, lets the ensuing ops be scheduled faster,
+            // because the executor will add them to the front of the
+            // threadpool's task queue rather than the end.
+            cleanup_fn(final_status);
+            done.Notify();
+          });
+          final_status = run_status;
+          if (!final_status.ok()) {
+            return;
+          }
+          final_status = SplitOutputTensors(combined_outputs, batch.get());
+        });
     // By waiting for the notification we are ensuring that this thread isn't
     // used for processing other batches, which gives the batches time to
     // coalesce upstream. So overall the number of batches going through the
-- 
GitLab


From 142ccf3666e07d011aa83fdd6be8c17f721fbc99 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 4 Jun 2018 14:52:29 -0700
Subject: [PATCH 276/610] Add rip-offs of LLVM's cast, dyn_cast, cast_or_null,
 dyn_cast_or_null in preparation to split HloInstruction into subclasses. This
 initial implementation uses C++ dynamic_cast, so it also adds vtable to
 HloInstruction.

PiperOrigin-RevId: 199199109
---
 tensorflow/compiler/xla/service/BUILD         |  16 +++
 .../compiler/xla/service/hlo_casting_utils.h  | 101 ++++++++++++++++
 .../xla/service/hlo_casting_utils_test.cc     | 112 ++++++++++++++++++
 .../compiler/xla/service/hlo_instruction.h    |  11 +-
 4 files changed, 235 insertions(+), 5 deletions(-)
 create mode 100644 tensorflow/compiler/xla/service/hlo_casting_utils.h
 create mode 100644 tensorflow/compiler/xla/service/hlo_casting_utils_test.cc

diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index 0102e4f003..c5b637419c 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -3020,3 +3020,19 @@ cc_library(
         "//tensorflow/core:regexp_internal",
     ],
 )
+
+cc_library(
+    name = "hlo_casting_utils",
+    hdrs = ["hlo_casting_utils.h"],
+    deps = [":hlo"],
+)
+
+tf_cc_test(
+    name = "hlo_casting_utils_test",
+    srcs = ["hlo_casting_utils_test.cc"],
+    deps = [
+        ":hlo_casting_utils",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",  # fixdeps: keep
+        "//tensorflow/core:test",
+    ],
+)
diff --git a/tensorflow/compiler/xla/service/hlo_casting_utils.h b/tensorflow/compiler/xla/service/hlo_casting_utils.h
new file mode 100644
index 0000000000..b15f1f24c6
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_casting_utils.h
@@ -0,0 +1,101 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Casting utility functions for HLO instructions.
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_CASTING_UTILS_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_CASTING_UTILS_H_
+
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+
+namespace xla {
+
+template <class T>
+using EnableIfDerivedFromHlo =
+    typename std::enable_if<std::is_base_of<HloInstruction, T>::value>::type;
+
+// TODO(b/93238915): Switch implementation from C++'s dynamic_cast to LLVM-like
+// RTTI if it turns out to be a performance issue.
+// Casts an HloInstruction pointer to one of its subclasses, dies if argument is
+// nullptr or runtime information does not match.
+//
+// Similar to LLVM's cast.
+template <class T, EnableIfDerivedFromHlo<T>* = nullptr>
+const T* Cast(const HloInstruction* instruction) {
+  CHECK(instruction != nullptr);
+  const T* casted = dynamic_cast<const T*>(instruction);
+  CHECK(casted != nullptr);
+  return casted;
+}
+
+// Non-const overload of Cast.
+template <class T, EnableIfDerivedFromHlo<T>* = nullptr>
+T* Cast(HloInstruction* instruction) {
+  return const_cast<T*>(
+      Cast<T>(const_cast<const HloInstruction*>(instruction)));
+}
+
+// Works just like the Cast, except that it allows for a null pointer as an
+// argument which it then propagates.
+//
+// Similar to LLVM's cast_or_null.
+template <class T, EnableIfDerivedFromHlo<T>* = nullptr>
+const T* CastOrNull(const HloInstruction* instruction) {
+  return instruction != nullptr ? Cast<T>(instruction) : nullptr;
+}
+
+// Non-const overload of CastOrNull.
+template <class T, EnableIfDerivedFromHlo<T>* = nullptr>
+T* CastOrNull(HloInstruction* instruction) {
+  return const_cast<T*>(
+      CastOrNull<T>(const_cast<const HloInstruction*>(instruction)));
+}
+
+// Casts an HloInstruction pointer to one of its subclasses, dies if argument is
+// nullptr, returns nullptr if runtime information does not match.
+//
+// Similar to LLVM's dyn_cast.
+template <class T, EnableIfDerivedFromHlo<T>* = nullptr>
+const T* DynCast(const HloInstruction* instruction) {
+  CHECK(instruction != nullptr);
+  return dynamic_cast<const T*>(instruction);
+}
+
+// Non-const overload of DynCast.
+template <class T, EnableIfDerivedFromHlo<T>* = nullptr>
+T* DynCast(HloInstruction* instruction) {
+  return const_cast<T*>(
+      DynCast<T>(const_cast<const HloInstruction*>(instruction)));
+}
+
+// Works just like the DynCast, except that it allows for a null pointer as an
+// argument which it then propagates.
+//
+// Similar to LLVM's dyn_cast_or_null.
+template <class T, EnableIfDerivedFromHlo<T>* = nullptr>
+const T* DynCastOrNull(const HloInstruction* instruction) {
+  return instruction != nullptr ? DynCast<T>(instruction) : nullptr;
+}
+
+// Non-const overload of DynCastOrNull.
+template <class T, EnableIfDerivedFromHlo<T>* = nullptr>
+T* DynCastOrNull(HloInstruction* instruction) {
+  return const_cast<T*>(
+      DynCastOrNull<T>(const_cast<const HloInstruction*>(instruction)));
+}
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_CASTING_UTILS_H_
diff --git a/tensorflow/compiler/xla/service/hlo_casting_utils_test.cc b/tensorflow/compiler/xla/service/hlo_casting_utils_test.cc
new file mode 100644
index 0000000000..436a922234
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_casting_utils_test.cc
@@ -0,0 +1,112 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
+
+#include "tensorflow/core/platform/test.h"
+
+namespace xla {
+namespace {
+
+class DummyInstruction : public HloInstruction {
+ public:
+  DummyInstruction()
+      : HloInstruction(HloOpcode::kConstant, ShapeUtil::MakeShape(F32, {})) {}
+};
+
+class AnotherDummyInstruction : public HloInstruction {
+ public:
+  AnotherDummyInstruction()
+      : HloInstruction(HloOpcode::kParameter, ShapeUtil::MakeShape(F32, {})) {}
+};
+
+TEST(HloCastingUtilsTest, CastSucceeds) {
+  DummyInstruction instruction;
+  DummyInstruction* casted =
+      Cast<DummyInstruction>(static_cast<HloInstruction*>(&instruction));
+  ASSERT_EQ(casted, &instruction);
+}
+
+TEST(HloCastingUtilsTest, CastDiesForWrongType) {
+  AnotherDummyInstruction instruction;
+  ASSERT_DEATH(
+      Cast<DummyInstruction>(static_cast<HloInstruction*>(&instruction)), "");
+}
+
+TEST(HloCastingUtilsTest, CastDiesForNullptr) {
+  HloInstruction* null = nullptr;
+  ASSERT_DEATH(Cast<DummyInstruction>(null), "");
+}
+
+TEST(HloCastingUtilsTest, CastOrNullSucceeds) {
+  DummyInstruction instruction;
+  DummyInstruction* casted =
+      CastOrNull<DummyInstruction>(static_cast<HloInstruction*>(&instruction));
+  ASSERT_EQ(casted, &instruction);
+}
+
+TEST(HloCastingUtilsTest, CastOrNullDiesForWrongType) {
+  AnotherDummyInstruction instruction;
+  HloInstruction* wrong_type = &instruction;  // Not a DummyInstruction.
+  ASSERT_DEATH(CastOrNull<DummyInstruction>(wrong_type), "");
+}
+
+TEST(HloCastingUtilsTest, CastOrNullReturnsNullptrForNullptr) {
+  HloInstruction* null = nullptr;
+  DummyInstruction* casted = CastOrNull<DummyInstruction>(null);
+  ASSERT_EQ(casted, nullptr);
+}
+
+TEST(HloCastingUtilsTest, DynCastSucceeds) {
+  DummyInstruction instruction;
+  DummyInstruction* casted =
+      DynCast<DummyInstruction>(static_cast<HloInstruction*>(&instruction));
+  ASSERT_EQ(casted, &instruction);
+}
+
+TEST(HloCastingUtilsTest, DynCastReturnsNullptrForWrongType) {
+  AnotherDummyInstruction instruction;
+  DummyInstruction* casted =
+      DynCast<DummyInstruction>(static_cast<HloInstruction*>(&instruction));
+  ASSERT_EQ(casted, nullptr);
+}
+
+TEST(HloCastingUtilsTest, DynCastDiesForNullptr) {
+  HloInstruction* null = nullptr;
+  ASSERT_DEATH(DynCast<DummyInstruction>(null), "");
+}
+
+TEST(HloCastingUtilsTest, DynCastOrNullSucceeds) {
+  DummyInstruction instruction;
+  DummyInstruction* casted = DynCastOrNull<DummyInstruction>(
+      static_cast<HloInstruction*>(&instruction));
+  ASSERT_EQ(casted, &instruction);
+}
+
+TEST(HloCastingUtilsTest, DynCastOrNullReturnsNullptrForWrongType) {
+  AnotherDummyInstruction instruction;
+  DummyInstruction* casted = DynCastOrNull<DummyInstruction>(
+      static_cast<HloInstruction*>(&instruction));
+  ASSERT_EQ(casted, nullptr);
+}
+
+TEST(HloCastingUtilsTest, DynCastOrNullReturnsNullptrForNullptr) {
+  HloInstruction* null = nullptr;
+  DummyInstruction* casted = DynCastOrNull<DummyInstruction>(null);
+  ASSERT_EQ(casted, nullptr);
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h
index d47af6c018..905ea5310d 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.h
+++ b/tensorflow/compiler/xla/service/hlo_instruction.h
@@ -322,7 +322,7 @@ class HloInstruction {
     kCustom,
   };
 
-  ~HloInstruction();
+  virtual ~HloInstruction();
 
   // Creates an instruction from the given proto. Arguments:
   //
@@ -1515,6 +1515,11 @@ class HloInstruction {
   void RelayoutConstant(const Layout& new_layout,
                         const ShapeIndex& shape_index = {});
 
+ protected:
+  // Internal constructor for a given opcode/shape, other fields must be filled
+  // by factory methods.
+  HloInstruction(HloOpcode opcode, const Shape& shape);
+
  private:
   // Prints an instruction to a string.
   //
@@ -1560,10 +1565,6 @@ class HloInstruction {
   // Removes a user for this instruction.
   void RemoveUser(HloInstruction* user);
 
-  // Internal constructor for a given opcode/shape, other fields must be filled
-  // by factory methods.
-  HloInstruction(HloOpcode opcode, const Shape& shape);
-
   // Fuses the given instruction into this fusion instruction. When add_output
   // is false (which is the default), instruction_to_fuse is cloned and the
   // clone is placed in the fusion instruction. instruction_to_fuse is
-- 
GitLab


From e2d300823f410823b1b5fee4e5159a754247e219 Mon Sep 17 00:00:00 2001
From: Shashi Shekhar <shashishekhar@google.com>
Date: Mon, 4 Jun 2018 15:00:11 -0700
Subject: [PATCH 277/610] Move benchmarking code to a new directory and add
 some documentation.

PiperOrigin-RevId: 199200246
---
 .../lite/profiling/profile_summarizer.h       |   3 -
 tensorflow/contrib/lite/tools/BUILD           |  81 ---------
 tensorflow/contrib/lite/tools/benchmark/BUILD |  91 +++++++++
 .../contrib/lite/tools/benchmark/README.md    | 172 ++++++++++++++++++
 .../tools/{ => benchmark}/benchmark_main.cc   |   4 +-
 .../tools/{ => benchmark}/benchmark_model.cc  |   4 +-
 .../tools/{ => benchmark}/benchmark_model.h   |   4 +-
 .../{ => benchmark}/benchmark_tflite_model.cc |   4 +-
 .../{ => benchmark}/benchmark_tflite_model.h  |   4 +-
 .../{ => benchmark}/command_line_flags.cc     |  47 ++---
 .../{ => benchmark}/command_line_flags.h      |   2 +-
 .../command_line_flags_test.cc                |   2 +-
 .../lite/tools/{ => benchmark}/logging.h      |   3 +-
 tensorflow/core/BUILD                         |   1 -
 tensorflow/core/util/stat_summarizer.cc       |   8 +
 tensorflow/core/util/stat_summarizer.h        |   2 +-
 tensorflow/core/util/stats_calculator.cc      |  27 +--
 tensorflow/core/util/stats_calculator.h       |   3 -
 18 files changed, 321 insertions(+), 141 deletions(-)
 create mode 100644 tensorflow/contrib/lite/tools/benchmark/BUILD
 create mode 100644 tensorflow/contrib/lite/tools/benchmark/README.md
 rename tensorflow/contrib/lite/tools/{ => benchmark}/benchmark_main.cc (89%)
 rename tensorflow/contrib/lite/tools/{ => benchmark}/benchmark_model.cc (97%)
 rename tensorflow/contrib/lite/tools/{ => benchmark}/benchmark_model.h (97%)
 rename tensorflow/contrib/lite/tools/{ => benchmark}/benchmark_tflite_model.cc (98%)
 rename tensorflow/contrib/lite/tools/{ => benchmark}/benchmark_tflite_model.h (94%)
 rename tensorflow/contrib/lite/tools/{ => benchmark}/command_line_flags.cc (84%)
 rename tensorflow/contrib/lite/tools/{ => benchmark}/command_line_flags.h (98%)
 rename tensorflow/contrib/lite/tools/{ => benchmark}/command_line_flags_test.cc (98%)
 rename tensorflow/contrib/lite/tools/{ => benchmark}/logging.h (96%)

diff --git a/tensorflow/contrib/lite/profiling/profile_summarizer.h b/tensorflow/contrib/lite/profiling/profile_summarizer.h
index 6fe6ca04f5..a529ff8742 100644
--- a/tensorflow/contrib/lite/profiling/profile_summarizer.h
+++ b/tensorflow/contrib/lite/profiling/profile_summarizer.h
@@ -45,9 +45,6 @@ class ProfileSummarizer {
     return stats_calculator_->GetShortSummary();
   }
 
-  // Prints the string returned by GetOutputString().
-  void PrintStepStats() const { stats_calculator_->PrintStepStats(); }
-
  private:
   std::unique_ptr<tensorflow::StatsCalculator> stats_calculator_;
 };
diff --git a/tensorflow/contrib/lite/tools/BUILD b/tensorflow/contrib/lite/tools/BUILD
index 7fb7517600..5913847329 100644
--- a/tensorflow/contrib/lite/tools/BUILD
+++ b/tensorflow/contrib/lite/tools/BUILD
@@ -30,87 +30,6 @@ tf_cc_binary(
     ],
 )
 
-tf_cc_binary(
-    name = "benchmark_model",
-    srcs = [
-        "benchmark_main.cc",
-        "logging.h",
-    ],
-    copts = common_copts,
-    linkopts = select({
-        "//tensorflow:android": [
-            "-pie",
-            "-landroid",
-            "-lm",
-            "-z defs",
-            "-Wl,--exclude-libs,ALL",  # Exclude syms in all libs from auto export
-        ],
-        "//conditions:default": [],
-    }),
-    deps = [
-        ":benchmark_tflite_model_lib",
-        "//tensorflow/core:stats_calculator_portable",
-    ],
-)
-
-cc_library(
-    name = "command_line_flags",
-    srcs = ["command_line_flags.cc"],
-    hdrs = ["command_line_flags.h"],
-    copts = common_copts,
-    visibility = ["//visibility:private"],
-)
-
-cc_test(
-    name = "command_line_flags_test",
-    srcs = ["command_line_flags_test.cc"],
-    copts = common_copts,
-    visibility = ["//visibility:private"],
-    deps = [
-        ":command_line_flags",
-        "//tensorflow/contrib/lite/testing:util",
-        "@com_google_googletest//:gtest",
-    ],
-)
-
-cc_library(
-    name = "benchmark_tflite_model_lib",
-    srcs = [
-        "benchmark_tflite_model.cc",
-        "logging.h",
-    ],
-    hdrs = ["benchmark_tflite_model.h"],
-    copts = common_copts,
-    deps = [
-        ":benchmark_model_lib",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite:string_util",
-        "//tensorflow/contrib/lite/kernels:builtin_ops",
-        "//tensorflow/contrib/lite/profiling:profile_summarizer",
-        "//tensorflow/contrib/lite/profiling:profiler",
-    ],
-)
-
-cc_library(
-    name = "benchmark_model_lib",
-    srcs = [
-        "benchmark_model.cc",
-        "logging.h",
-    ],
-    hdrs = ["benchmark_model.h"],
-    copts = common_copts,
-    deps = [
-        ":command_line_flags",
-        "//tensorflow/contrib/lite:framework",
-        "//tensorflow/contrib/lite:string_util",
-        "//tensorflow/contrib/lite/kernels:builtin_ops",
-        "//tensorflow/contrib/lite/profiling:profile_summarizer",
-        "//tensorflow/contrib/lite/profiling:profiler",
-        "//tensorflow/contrib/lite/profiling:time",
-        "//tensorflow/core:stats_calculator_portable",
-    ],
-)
-
 cc_library(
     name = "gen_op_registration",
     srcs = ["gen_op_registration.cc"],
diff --git a/tensorflow/contrib/lite/tools/benchmark/BUILD b/tensorflow/contrib/lite/tools/benchmark/BUILD
new file mode 100644
index 0000000000..4824a4dbde
--- /dev/null
+++ b/tensorflow/contrib/lite/tools/benchmark/BUILD
@@ -0,0 +1,91 @@
+package(default_visibility = [
+    "//visibility:public",
+])
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow/contrib/lite:special_rules.bzl", "tflite_portable_test_suite")
+
+common_copts = ["-Wall"]
+
+cc_binary(
+    name = "benchmark_model",
+    srcs = [
+        "benchmark_main.cc",
+        "logging.h",
+    ],
+    copts = common_copts,
+    linkopts = select({
+        "//tensorflow:android": [
+            "-pie",
+            "-landroid",
+            "-lm",
+            "-z defs",
+            "-Wl,--exclude-libs,ALL",  # Exclude syms in all libs from auto export
+        ],
+        "//conditions:default": [],
+    }),
+    deps = [
+        ":benchmark_tflite_model_lib",
+    ],
+)
+
+cc_library(
+    name = "command_line_flags",
+    srcs = ["command_line_flags.cc"],
+    hdrs = ["command_line_flags.h"],
+    copts = common_copts,
+    visibility = ["//visibility:private"],
+)
+
+cc_test(
+    name = "command_line_flags_test",
+    srcs = ["command_line_flags_test.cc"],
+    copts = common_copts,
+    visibility = ["//visibility:private"],
+    deps = [
+        ":command_line_flags",
+        "//tensorflow/contrib/lite/testing:util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+cc_library(
+    name = "benchmark_tflite_model_lib",
+    srcs = [
+        "benchmark_tflite_model.cc",
+        "logging.h",
+    ],
+    hdrs = ["benchmark_tflite_model.h"],
+    copts = common_copts,
+    deps = [
+        ":benchmark_model_lib",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite:string_util",
+        "//tensorflow/contrib/lite/kernels:builtin_ops",
+        "//tensorflow/contrib/lite/profiling:profile_summarizer",
+        "//tensorflow/contrib/lite/profiling:profiler",
+    ],
+)
+
+cc_library(
+    name = "benchmark_model_lib",
+    srcs = [
+        "benchmark_model.cc",
+        "logging.h",
+    ],
+    hdrs = ["benchmark_model.h"],
+    copts = common_copts,
+    deps = [
+        ":command_line_flags",
+        "//tensorflow/contrib/lite:framework",
+        "//tensorflow/contrib/lite:string_util",
+        "//tensorflow/contrib/lite/kernels:builtin_ops",
+        "//tensorflow/contrib/lite/profiling:profile_summarizer",
+        "//tensorflow/contrib/lite/profiling:profiler",
+        "//tensorflow/contrib/lite/profiling:time",
+        "//tensorflow/core:stats_calculator_portable",
+    ],
+)
+
+tflite_portable_test_suite()
diff --git a/tensorflow/contrib/lite/tools/benchmark/README.md b/tensorflow/contrib/lite/tools/benchmark/README.md
new file mode 100644
index 0000000000..e6f333aa5b
--- /dev/null
+++ b/tensorflow/contrib/lite/tools/benchmark/README.md
@@ -0,0 +1,172 @@
+# TFLite Model Benchmark Tool
+
+## Description
+
+A simple C++ binary to benchmark a TFLite model and its individual operators,
+both on desktop machines and on Android.
+
+## To build/install/run
+
+### On Android:
+
+(0) Refer to https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/android for how to edit the `WORKSPACE` file to configure the Android NDK/SDK.
+
+(1) Build for your specific platform, e.g.:
+
+```
+bazel build -c opt \
+  --config=android_arm \
+  --cxxopt='--std=c++11' \
+  tensorflow/contrib/lite/tools/benchmark:benchmark_model
+```
+
+(2) Connect your phone. Push the binary to your phone with adb push
+     (make the directory if required):
+
+```
+adb push bazel-bin/tensorflow/contrib/lite/tools/benchmark/benchmark_model /data/local/tmp
+```
+
+(3) Make the binary executable.
+
+```
+adb shell chmod +x /data/local/tmp/benchmark_model
+```
+
+(4) Push the compute graph that you need to test. For example:
+
+```
+adb push mobilenet_quant_v1_224.tflite /data/local/tmp
+```
+
+(5) Run the benchmark. For example:
+
+```
+adb shell /data/local/tmp/benchmark_model \
+  --graph=/data/local/tmp/mobilenet_quant_v1_224.tflite \
+  --input_layer="Placeholder" \
+  --input_layer_shape="1,224,224,3" \
+  --input_layer_type="uint8" \
+  --output_layer="MobilenetV1/Predictions/Reshape_1" \
+  --num_threads=4
+```
+
+### On desktop:
+(1) Build the binary:
+
+```
+bazel build -c opt tensorflow/contrib/lite/tools/benchmark:benchmark_model
+```
+
+(2) Run it on your compute graph, as in the Android case but without `adb shell`.
+For example:
+
+```
+bazel-bin/tensorflow/contrib/lite/tools/benchmark/benchmark_model \
+  --graph=mobilenet_quant_v1_224.tflite \
+  --input_layer="Placeholder" \
+  --input_layer_shape="1,224,224,3" \
+  --input_layer_type="uint8" \
+  --output_layer="MobilenetV1/Predictions/Reshape_1" \
+  --num_threads=4
+```
+
+The MobileNet graph used as an example here may be downloaded from
+https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_224_android_quant_2017_11_08.zip
+
+## Profiling model operators
+The benchmark model binary can also profile operators and report the execution time of each operator. To do this,
+compile the binary with the compiler flag that enables profiling: pass **--copt=-DTFLITE_PROFILING_ENABLED**
+to build the benchmark with profiling support.
+For example, to compile with profiling support on Android, add this flag to the previous command:
+
+```
+bazel build -c opt \
+  --config=android_arm \
+  --cxxopt='--std=c++11' \
+  --copt=-DTFLITE_PROFILING_ENABLED \
+  tensorflow/contrib/lite/tools/benchmark:benchmark_model
+```
+This compiles TFLite with profiling enabled; you can now run the benchmark binary as before. The binary will produce detailed statistics for each operation, similar to those shown below:
+
+```
+
+============================== Run Order ==============================
+	             [node type]	  [start]	  [first]	 [avg ms]	     [%]	  [cdf%]	  [mem KB]	[times called]	[Name]
+	                 CONV_2D	    0.000	    9.132	    9.132	  0.121%	  0.121%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_0/Relu6]
+	       DEPTHWISE_CONV_2D	    9.135	    3.280	    3.280	  0.043%	  0.165%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_1_depthwise/Relu6]
+	                 CONV_2D	   12.419	    6.877	    6.877	  0.091%	  0.256%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_1_pointwise/Relu6]
+	       DEPTHWISE_CONV_2D	   19.299	    1.708	    1.708	  0.023%	  0.278%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_2_depthwise/Relu6]
+	                 CONV_2D	   21.012	    4.162	    4.162	  0.055%	  0.334%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_2_pointwise/Relu6]
+	       DEPTHWISE_CONV_2D	   25.177	    3.520	    3.520	  0.047%	  0.380%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_3_depthwise/Relu6]
+	                 CONV_2D	   28.701	   10.218	   10.218	  0.136%	  0.516%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_3_pointwise/Relu6]
+	       DEPTHWISE_CONV_2D	   38.922	    0.827	    0.827	  0.011%	  0.527%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_4_depthwise/Relu6]
+	                 CONV_2D	   39.752	    1.401	    1.401	  0.019%	  0.545%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_4_pointwise/Relu6]
+	       DEPTHWISE_CONV_2D	   41.156	    1.290	    1.290	  0.017%	  0.563%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_5_depthwise/Relu6]
+	                 CONV_2D	   42.448	    5.995	    5.995	  0.080%	  0.642%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_5_pointwise/Relu6]
+	       DEPTHWISE_CONV_2D	   48.445	    0.409	    0.409	  0.005%	  0.647%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_6_depthwise/Relu6]
+	                 CONV_2D	   48.856	    6.167	    6.167	  0.082%	  0.729%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_6_pointwise/Relu6]
+	       DEPTHWISE_CONV_2D	   55.026	    0.629	    0.629	  0.008%	  0.738%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_7_depthwise/Relu6]
+	                 CONV_2D	   55.656	    6.464	    6.464	  0.086%	  0.823%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_7_pointwise/Relu6]
+	       DEPTHWISE_CONV_2D	   62.124	    0.647	    0.647	  0.009%	  0.832%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_8_depthwise/Relu6]
+	                 CONV_2D	   62.774	   14.666	   14.666	  0.195%	  1.026%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_8_pointwise/Relu6]
+	       DEPTHWISE_CONV_2D	   77.444	    0.635	    0.635	  0.008%	  1.035%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_9_depthwise/Relu6]
+	                 CONV_2D	   78.081	    7.186	    7.186	  0.095%	  1.130%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_9_pointwise/Relu6]
+	       DEPTHWISE_CONV_2D	   85.270	    0.646	    0.646	  0.009%	  1.139%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_10_depthwise/Relu6]
+	                 CONV_2D	   85.918	    9.529	    9.529	  0.126%	  1.265%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_10_pointwise/Relu6]
+	       DEPTHWISE_CONV_2D	   95.451	    0.628	    0.628	  0.008%	  1.273%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_11_depthwise/Relu6]
+	                 CONV_2D	   96.081	    2.077	    2.077	  0.028%	  1.301%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_11_pointwise/Relu6]
+	       DEPTHWISE_CONV_2D	   98.162	    0.168	    0.168	  0.002%	  1.303%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_12_depthwise/Relu6]
+	                 CONV_2D	   98.332	    1.007	    1.007	  0.013%	  1.317%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_12_pointwise/Relu6]
+	       DEPTHWISE_CONV_2D	   99.342	    0.288	    0.288	  0.004%	  1.320%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_13_depthwise/Relu6]
+	                 CONV_2D	   99.632	    8.197	    8.197	  0.109%	  1.429%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_13_pointwise/Relu6]
+	         AVERAGE_POOL_2D	  107.832	    0.045	    0.045	  0.001%	  1.430%	     0.000	        0	[MobilenetV1/Logits/AvgPool_1a/AvgPool]
+	                 CONV_2D	  107.878	    0.325	    0.325	  0.004%	  1.434%	     0.000	        0	[MobilenetV1/Logits/Conv2d_1c_1x1/BiasAdd]
+	                 RESHAPE	  108.206	    0.003	    0.003	  0.000%	  1.434%	     0.000	        0	[MobilenetV1/Predictions/Reshape]
+	                 SOFTMAX	  108.211	    0.038	    0.038	  0.001%	  1.434%	     0.000	        0	[MobilenetV1/Predictions/Softmax]
+
+============================== Top by Computation Time ==============================
+	             [node type]	  [start]	  [first]	 [avg ms]	     [%]	  [cdf%]	  [mem KB]	[times called]	[Name]
+	                 CONV_2D	   62.774	   14.666	   14.666	  0.195%	  0.195%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_8_pointwise/Relu6]
+	                 CONV_2D	   28.701	   10.218	   10.218	  0.136%	  0.330%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_3_pointwise/Relu6]
+	                 CONV_2D	   85.918	    9.529	    9.529	  0.126%	  0.456%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_10_pointwise/Relu6]
+	                 CONV_2D	    0.000	    9.132	    9.132	  0.121%	  0.578%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_0/Relu6]
+	                 CONV_2D	   99.632	    8.197	    8.197	  0.109%	  0.686%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_13_pointwise/Relu6]
+	                 CONV_2D	   78.081	    7.186	    7.186	  0.095%	  0.782%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_9_pointwise/Relu6]
+	                 CONV_2D	   12.419	    6.877	    6.877	  0.091%	  0.873%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_1_pointwise/Relu6]
+	                 CONV_2D	   55.656	    6.464	    6.464	  0.086%	  0.958%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_7_pointwise/Relu6]
+	                 CONV_2D	   48.856	    6.167	    6.167	  0.082%	  1.040%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_6_pointwise/Relu6]
+	                 CONV_2D	   42.448	    5.995	    5.995	  0.080%	  1.120%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_5_pointwise/Relu6]
+
+============================== Top by Memory Use ==============================
+	             [node type]	  [start]	  [first]	 [avg ms]	     [%]	  [cdf%]	  [mem KB]	[times called]	[Name]
+	                 SOFTMAX	  108.211	    0.038	    0.038	  0.001%	  0.001%	     0.000	        0	[MobilenetV1/Predictions/Softmax]
+	                 RESHAPE	  108.206	    0.003	    0.003	  0.000%	  0.001%	     0.000	        0	[MobilenetV1/Predictions/Reshape]
+	                 CONV_2D	   78.081	    7.186	    7.186	  0.095%	  0.096%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_9_pointwise/Relu6]
+	       DEPTHWISE_CONV_2D	   77.444	    0.635	    0.635	  0.008%	  0.104%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_9_depthwise/Relu6]
+	                 CONV_2D	   62.774	   14.666	   14.666	  0.195%	  0.299%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_8_pointwise/Relu6]
+	       DEPTHWISE_CONV_2D	   62.124	    0.647	    0.647	  0.009%	  0.307%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_8_depthwise/Relu6]
+	                 CONV_2D	   55.656	    6.464	    6.464	  0.086%	  0.393%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_7_pointwise/Relu6]
+	       DEPTHWISE_CONV_2D	   55.026	    0.629	    0.629	  0.008%	  0.401%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_7_depthwise/Relu6]
+	                 CONV_2D	   48.856	    6.167	    6.167	  0.082%	  0.483%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_6_pointwise/Relu6]
+	       DEPTHWISE_CONV_2D	   48.445	    0.409	    0.409	  0.005%	  0.489%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_6_depthwise/Relu6]
+
+Number of nodes executed: 31
+============================== Summary by node type ==============================
+	             [Node type]	  [count]	  [avg ms]	    [avg %]	    [cdf %]	  [mem KB]	[times called]
+	                 CONV_2D	       15	     1.861	    86.679%	    86.679%	     0.000	        0
+	       DEPTHWISE_CONV_2D	       13	     0.286	    13.321%	   100.000%	     0.000	        0
+	                 SOFTMAX	        1	     0.000	     0.000%	   100.000%	     0.000	        0
+	                 RESHAPE	        1	     0.000	     0.000%	   100.000%	     0.000	        0
+	         AVERAGE_POOL_2D	        1	     0.000	     0.000%	   100.000%	     0.000	        0
+
+Timings (microseconds): count=50 first=108164 curr=128308 min=102850 max=197072 avg=150805 std=24368
+Memory (bytes): count=0
+31 nodes observed
+
+
+Average inference timings in us: Warmup: 135310, Init: 12123, no stats: 150988
+
+```
+
+
diff --git a/tensorflow/contrib/lite/tools/benchmark_main.cc b/tensorflow/contrib/lite/tools/benchmark/benchmark_main.cc
similarity index 89%
rename from tensorflow/contrib/lite/tools/benchmark_main.cc
rename to tensorflow/contrib/lite/tools/benchmark/benchmark_main.cc
index 1325385e32..372d31e838 100644
--- a/tensorflow/contrib/lite/tools/benchmark_main.cc
+++ b/tensorflow/contrib/lite/tools/benchmark/benchmark_main.cc
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/lite/tools/benchmark_tflite_model.h"
-#include "tensorflow/contrib/lite/tools/logging.h"
+#include "tensorflow/contrib/lite/tools/benchmark/benchmark_tflite_model.h"
+#include "tensorflow/contrib/lite/tools/benchmark/logging.h"
 
 namespace tflite {
 namespace benchmark {
diff --git a/tensorflow/contrib/lite/tools/benchmark_model.cc b/tensorflow/contrib/lite/tools/benchmark/benchmark_model.cc
similarity index 97%
rename from tensorflow/contrib/lite/tools/benchmark_model.cc
rename to tensorflow/contrib/lite/tools/benchmark/benchmark_model.cc
index 550994c662..a8a9a6112c 100644
--- a/tensorflow/contrib/lite/tools/benchmark_model.cc
+++ b/tensorflow/contrib/lite/tools/benchmark/benchmark_model.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/lite/tools/benchmark_model.h"
+#include "tensorflow/contrib/lite/tools/benchmark/benchmark_model.h"
 
 #include <time.h>
 
@@ -21,7 +21,7 @@ limitations under the License.
 #include <sstream>
 
 #include "tensorflow/contrib/lite/profiling/time.h"
-#include "tensorflow/contrib/lite/tools/logging.h"
+#include "tensorflow/contrib/lite/tools/benchmark/logging.h"
 
 namespace {
 void SleepForSeconds(double sleep_seconds) {
diff --git a/tensorflow/contrib/lite/tools/benchmark_model.h b/tensorflow/contrib/lite/tools/benchmark/benchmark_model.h
similarity index 97%
rename from tensorflow/contrib/lite/tools/benchmark_model.h
rename to tensorflow/contrib/lite/tools/benchmark/benchmark_model.h
index ef8d6a7d1e..d48f693693 100644
--- a/tensorflow/contrib/lite/tools/benchmark_model.h
+++ b/tensorflow/contrib/lite/tools/benchmark/benchmark_model.h
@@ -23,7 +23,7 @@ limitations under the License.
 #include <unordered_set>
 #include <vector>
 
-#include "tensorflow/contrib/lite/tools//command_line_flags.h"
+#include "tensorflow/contrib/lite/tools/benchmark/command_line_flags.h"
 #include "tensorflow/core/util/stats_calculator.h"
 
 namespace tflite {
@@ -158,4 +158,4 @@ class BenchmarkModel {
 }  // namespace benchmark
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_TOOLS_BENCHMARK_MODEL_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_TOOLS_BENCHMARK_BENCHMARK_MODEL_H_
diff --git a/tensorflow/contrib/lite/tools/benchmark_tflite_model.cc b/tensorflow/contrib/lite/tools/benchmark/benchmark_tflite_model.cc
similarity index 98%
rename from tensorflow/contrib/lite/tools/benchmark_tflite_model.cc
rename to tensorflow/contrib/lite/tools/benchmark/benchmark_tflite_model.cc
index be8f46f599..2e5b866273 100644
--- a/tensorflow/contrib/lite/tools/benchmark_tflite_model.cc
+++ b/tensorflow/contrib/lite/tools/benchmark/benchmark_tflite_model.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/lite/tools/benchmark_tflite_model.h"
+#include "tensorflow/contrib/lite/tools/benchmark/benchmark_tflite_model.h"
 
 #include <cstdarg>
 #include <cstdlib>
@@ -27,7 +27,7 @@ limitations under the License.
 #include "tensorflow/contrib/lite/model.h"
 #include "tensorflow/contrib/lite/op_resolver.h"
 #include "tensorflow/contrib/lite/string_util.h"
-#include "tensorflow/contrib/lite/tools/logging.h"
+#include "tensorflow/contrib/lite/tools/benchmark/logging.h"
 
 #ifdef TFLITE_CUSTOM_OPS_HEADER
 void RegisterSelectedOps(::tflite::MutableOpResolver* resolver);
diff --git a/tensorflow/contrib/lite/tools/benchmark_tflite_model.h b/tensorflow/contrib/lite/tools/benchmark/benchmark_tflite_model.h
similarity index 94%
rename from tensorflow/contrib/lite/tools/benchmark_tflite_model.h
rename to tensorflow/contrib/lite/tools/benchmark/benchmark_tflite_model.h
index e6d03d5211..e70f6de1bf 100644
--- a/tensorflow/contrib/lite/tools/benchmark_tflite_model.h
+++ b/tensorflow/contrib/lite/tools/benchmark/benchmark_tflite_model.h
@@ -22,7 +22,7 @@ limitations under the License.
 
 #include "tensorflow/contrib/lite/model.h"
 #include "tensorflow/contrib/lite/profiling/profile_summarizer.h"
-#include "tensorflow/contrib/lite/tools/benchmark_model.h"
+#include "tensorflow/contrib/lite/tools/benchmark/benchmark_model.h"
 
 namespace tflite {
 namespace benchmark {
@@ -87,4 +87,4 @@ class BenchmarkTfLiteModel : public BenchmarkModel {
 }  // namespace benchmark
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_TOOLS_BENCHMARK_TFLITE_MODEL_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_TOOLS_BENCHMARK_BENCHMARK_TFLITE_MODEL_H_
diff --git a/tensorflow/contrib/lite/tools/command_line_flags.cc b/tensorflow/contrib/lite/tools/benchmark/command_line_flags.cc
similarity index 84%
rename from tensorflow/contrib/lite/tools/command_line_flags.cc
rename to tensorflow/contrib/lite/tools/benchmark/command_line_flags.cc
index ba72f40689..723bf67e03 100644
--- a/tensorflow/contrib/lite/tools/command_line_flags.cc
+++ b/tensorflow/contrib/lite/tools/benchmark/command_line_flags.cc
@@ -10,8 +10,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/lite/tools/command_line_flags.h"
+#include "tensorflow/contrib/lite/tools/benchmark/command_line_flags.h"
 
+#include <cstring>
 #include <sstream>
 #include <string>
 #include <vector>
@@ -19,6 +20,13 @@ limitations under the License.
 namespace tflite {
 namespace {
 
+template <typename T>
+std::string ToString(T val) {  // Renders val as text using operator<<.
+  std::ostringstream stream;
+  stream << val;
+  return stream.str();
+}
+
 bool ParseFlag(const std::string& arg, const std::string& flag,
                const std::function<bool(const std::string&)>& parse_func,
                bool* value_parsing_ok) {
@@ -35,14 +43,16 @@ bool ParseFlag(const std::string& arg, const std::string& flag,
   return true;
 }
 
-bool ParseInt32Flag(const std::string& flag_value, int32_t* value) {
-  char extra;
-  return sscanf(flag_value.data(), "%d%c", value, &extra) == 1;
-}
-
-bool ParseInt64Flag(const std::string& flag_value, int64_t* value) {
-  char extra;
-  return sscanf(flag_value.data(), "%ld%c", value, &extra) == 1;
+template <typename T>
+bool ParseFlag(const std::string& flag_value, T* value) {
+  std::istringstream stream(flag_value);
+  T read_value;
+  stream >> read_value;  // Must consume all of flag_value to count as valid.
+  if (stream.fail() || !stream.eof()) {  // Empty, bad, overflow or extra text.
+    return false;
+  }
+  *value = read_value;  // Only written on success; untouched on failure.
+  return true;
 }
 
 bool ParseBoolFlag(const std::string& flag_value, bool* value) {
@@ -54,11 +64,6 @@ bool ParseBoolFlag(const std::string& flag_value, bool* value) {
   return true;
 }
 
-bool ParseFloatFlag(const std::string& flag_value, float* value) {
-  char extra;
-  return sscanf(flag_value.data(), "%f%c", value, &extra) == 1;
-}
-
 bool ParseStringFlag(const std::string& flag_value, std::string* value) {
   *value = flag_value;
   return true;
@@ -70,27 +75,27 @@ Flag::Flag(const char* name, int32_t* dst, const std::string& usage_text)
     : name_(name),
       type_(TYPE_INT32),
       value_hook_([dst](const std::string& flag_value) {
-        return ParseInt32Flag(flag_value, dst);
+        return ParseFlag<int32_t>(flag_value, dst);
       }),
-      default_for_display_(std::to_string(*dst)),
+      default_for_display_(ToString(*dst)),
       usage_text_(usage_text) {}
 
 Flag::Flag(const char* name, int64_t* dst, const std::string& usage_text)
     : name_(name),
       type_(TYPE_INT64),
       value_hook_([dst](const std::string& flag_value) {
-        return ParseInt64Flag(flag_value, dst);
+        return ParseFlag<int64_t>(flag_value, dst);
       }),
-      default_for_display_(std::to_string(*dst)),
+      default_for_display_(ToString(*dst)),
       usage_text_(usage_text) {}
 
 Flag::Flag(const char* name, float* dst, const std::string& usage_text)
     : name_(name),
       type_(TYPE_FLOAT),
       value_hook_([dst](const std::string& flag_value) {
-        return ParseFloatFlag(flag_value, dst);
+        return ParseFlag<float>(flag_value, dst);
       }),
-      default_for_display_(std::to_string(*dst)),
+      default_for_display_(ToString(*dst)),
       usage_text_(usage_text) {}
 
 Flag::Flag(const char* name, bool* dst, const std::string& usage_text)
@@ -166,7 +171,7 @@ std::string Flag::GetTypeName() const {
   }
   argv[dst++] = nullptr;
   *argc = unknown_flags.size() + 1;
-  return result && (*argc < 2 || strcmp(argv[1], "--help") != 0);
+  return result && (*argc < 2 || std::strcmp(argv[1], "--help") != 0);
 }
 
 /*static*/ std::string Flags::Usage(const std::string& cmdline,
diff --git a/tensorflow/contrib/lite/tools/command_line_flags.h b/tensorflow/contrib/lite/tools/benchmark/command_line_flags.h
similarity index 98%
rename from tensorflow/contrib/lite/tools/command_line_flags.h
rename to tensorflow/contrib/lite/tools/benchmark/command_line_flags.h
index 0605d3c9d4..36f9e64767 100644
--- a/tensorflow/contrib/lite/tools/command_line_flags.h
+++ b/tensorflow/contrib/lite/tools/benchmark/command_line_flags.h
@@ -109,4 +109,4 @@ class Flags {
 
 }  // namespace tflite
 
-#endif  // TENSORFLOW_CONTRIB_LITE_TOOLS_COMMAND_LINE_FLAGS_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_TOOLS_BENCHMARK_COMMAND_LINE_FLAGS_H_
diff --git a/tensorflow/contrib/lite/tools/command_line_flags_test.cc b/tensorflow/contrib/lite/tools/benchmark/command_line_flags_test.cc
similarity index 98%
rename from tensorflow/contrib/lite/tools/command_line_flags_test.cc
rename to tensorflow/contrib/lite/tools/benchmark/command_line_flags_test.cc
index 463647bec9..74cf59105b 100644
--- a/tensorflow/contrib/lite/tools/command_line_flags_test.cc
+++ b/tensorflow/contrib/lite/tools/benchmark/command_line_flags_test.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/contrib/lite/tools/command_line_flags.h"
+#include "tensorflow/contrib/lite/tools/benchmark/command_line_flags.h"
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
 #include "tensorflow/contrib/lite/testing/util.h"
diff --git a/tensorflow/contrib/lite/tools/logging.h b/tensorflow/contrib/lite/tools/benchmark/logging.h
similarity index 96%
rename from tensorflow/contrib/lite/tools/logging.h
rename to tensorflow/contrib/lite/tools/benchmark/logging.h
index aa1fa5b827..9e9292e2fe 100644
--- a/tensorflow/contrib/lite/tools/logging.h
+++ b/tensorflow/contrib/lite/tools/benchmark/logging.h
@@ -18,6 +18,7 @@ limitations under the License.
 
 // LOG and CHECK macros for benchmarks.
 
+#include <cstdlib>
 #include <iostream>
 #include <sstream>
 
@@ -72,4 +73,4 @@ class LoggingWrapper {
 
 #define TFLITE_BENCHMARK_CHECK_EQ(a, b) TFLITE_BENCHMARK_CHECK(a == b)
 
-#endif  // TENSORFLOW_CONTRIB_LITE_TOOLS_LOGGING_H_
+#endif  // TENSORFLOW_CONTRIB_LITE_TOOLS_BENCHMARK_LOGGING_H_
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 7e13a07e5e..6bde2a0a4a 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -876,7 +876,6 @@ cc_library(
     hdrs = [
         "util/stats_calculator.h",
     ],
-    deps = [":platform_base"],
 )
 
 cc_library(
diff --git a/tensorflow/core/util/stat_summarizer.cc b/tensorflow/core/util/stat_summarizer.cc
index 42a4801dcb..a5c1fda102 100644
--- a/tensorflow/core/util/stat_summarizer.cc
+++ b/tensorflow/core/util/stat_summarizer.cc
@@ -78,6 +78,14 @@ void StatSummarizer::Validate(const std::vector<TensorDescription>* outputs,
   }
 }
 
+void StatSummarizer::PrintStepStats() const {
+  string output = GetOutputString();
+  std::istringstream iss(output);
+  for (std::string line; std::getline(iss, line);) {
+    LOG(INFO) << line;
+  }
+}
+
 namespace {
 std::string OpType(const DeviceStepStats& ds, const NodeExecStats& ns) {
   // There is no published specification of how DeviceStats and NodeStats
diff --git a/tensorflow/core/util/stat_summarizer.h b/tensorflow/core/util/stat_summarizer.h
index 173ed5cebc..7e6d6f6372 100644
--- a/tensorflow/core/util/stat_summarizer.h
+++ b/tensorflow/core/util/stat_summarizer.h
@@ -68,7 +68,7 @@ class StatSummarizer {
   }
 
   // Prints the string returned by GetOutputString().
-  void PrintStepStats() const { stats_calculator_->PrintStepStats(); }
+  void PrintStepStats() const;
 
   // Prints the output tensor sizes and types for each node.
   void PrintOutputs() const;
diff --git a/tensorflow/core/util/stats_calculator.cc b/tensorflow/core/util/stats_calculator.cc
index 20353ec76e..c4befbdb84 100644
--- a/tensorflow/core/util/stats_calculator.cc
+++ b/tensorflow/core/util/stats_calculator.cc
@@ -21,8 +21,6 @@ limitations under the License.
 #include <sstream>
 #include <string>
 
-#include "tensorflow/core/platform/logging.h"
-
 namespace tensorflow {
 
 StatsCalculator::StatsCalculator(const StatSummarizerOptions& options)
@@ -93,7 +91,7 @@ std::string StatsCalculator::ColumnString(const Detail& detail,
 
 void StatsCalculator::OrderNodesByMetric(
     SortingMetric metric, std::vector<const Detail*>* details) const {
-  std::priority_queue<std::pair<string, const Detail*>> sorted_list;
+  std::priority_queue<std::pair<std::string, const Detail*>> sorted_list;
   const int num_nodes = details_.size();
 
   for (const auto& det : details_) {
@@ -142,7 +140,7 @@ void StatsCalculator::ComputeStatsByType(
   int64_t run_count = run_total_us_.count();
 
   for (const auto& det : details_) {
-    const string node_name = det.first;
+    const std::string node_name = det.first;
     const Detail& detail = det.second;
 
     int64_t curr_time_val =
@@ -151,7 +149,7 @@ void StatsCalculator::ComputeStatsByType(
 
     int64_t curr_memory_val = detail.mem_used.newest();
 
-    const string& node_type = detail.type;
+    const std::string& node_type = detail.type;
 
     (*node_type_map_count)[node_type] += 1;
     (*node_type_map_time)[node_type] += curr_time_val;
@@ -163,12 +161,12 @@ void StatsCalculator::ComputeStatsByType(
 std::string StatsCalculator::GetStatsByNodeType() const {
   std::stringstream stream;
 
+  stream << "Number of nodes executed: " << details_.size() << std::endl;
+
   stream << "============================== Summary by node type "
             "=============================="
          << std::endl;
 
-  LOG(INFO) << "Number of nodes executed: " << details_.size();
-
   std::map<std::string, int64_t> node_type_map_count;
   std::map<std::string, int64_t> node_type_map_time;
   std::map<std::string, int64_t> node_type_map_memory;
@@ -180,11 +178,12 @@ std::string StatsCalculator::GetStatsByNodeType() const {
                      &accumulated_us);
 
   // Sort them.
-  std::priority_queue<std::pair<int64_t, std::pair<string, int64_t>>> timings;
+  std::priority_queue<std::pair<int64_t, std::pair<std::string, int64_t>>>
+      timings;
   for (const auto& node_type : node_type_map_time) {
     const int64_t mem_used = node_type_map_memory[node_type.first];
     timings.emplace(node_type.second,
-                    std::pair<string, int64_t>(node_type.first, mem_used));
+                    std::pair<std::string, int64_t>(node_type.first, mem_used));
   }
 
   InitField(stream, 24) << "[Node type]";
@@ -201,7 +200,7 @@ std::string StatsCalculator::GetStatsByNodeType() const {
     auto entry = timings.top();
     timings.pop();
 
-    const string node_type = entry.second.first;
+    const std::string node_type = entry.second.first;
     const float memory = entry.second.second / 1000.0f;
 
     const int64_t node_type_total_us = entry.first;
@@ -273,14 +272,6 @@ std::string StatsCalculator::GetOutputString() const {
   return stream.str();
 }
 
-void StatsCalculator::PrintStepStats() const {
-  string output = GetOutputString();
-  std::istringstream iss(output);
-  for (std::string line; std::getline(iss, line);) {
-    LOG(INFO) << line;
-  }
-}
-
 void StatsCalculator::UpdateDetails(
     const std::map<std::string, Detail>& details) {
   details_.insert(details.begin(), details.end());
diff --git a/tensorflow/core/util/stats_calculator.h b/tensorflow/core/util/stats_calculator.h
index a1033465fb..39cef816f1 100644
--- a/tensorflow/core/util/stats_calculator.h
+++ b/tensorflow/core/util/stats_calculator.h
@@ -127,9 +127,6 @@ class StatsCalculator {
 
   std::string GetShortSummary() const;
 
-  // Prints the string returned by GetOutputString().
-  void PrintStepStats() const;
-
   void ComputeStatsByType(
       std::map<std::string, int64_t>* node_type_map_count,
       std::map<std::string, int64_t>* node_type_map_time,
-- 
GitLab


From d947e2c172b2eee4338e598a51d80d519907f991 Mon Sep 17 00:00:00 2001
From: Anna R <annarev@google.com>
Date: Mon, 4 Jun 2018 15:00:15 -0700
Subject: [PATCH 278/610] Remove tf_export decorator from contrib. tf_export
 decorators currently aren't supported in contrib.

PiperOrigin-RevId: 199200258
---
 tensorflow/contrib/distributions/python/ops/kumaraswamy.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tensorflow/contrib/distributions/python/ops/kumaraswamy.py b/tensorflow/contrib/distributions/python/ops/kumaraswamy.py
index 66682b2ff5..0ff989fc95 100644
--- a/tensorflow/contrib/distributions/python/ops/kumaraswamy.py
+++ b/tensorflow/contrib/distributions/python/ops/kumaraswamy.py
@@ -31,7 +31,6 @@ from tensorflow.python.ops import special_math_ops
 from tensorflow.python.ops.distributions import distribution
 from tensorflow.python.ops.distributions import transformed_distribution
 from tensorflow.python.ops.distributions import uniform
-from tensorflow.python.util.tf_export import tf_export
 
 __all__ = [
     "Kumaraswamy",
@@ -59,7 +58,6 @@ def _harmonic_number(x):
   return math_ops.digamma(x + one) - math_ops.digamma(one)
 
 
-@tf_export("distributions.Kumaraswamy")
 class Kumaraswamy(transformed_distribution.TransformedDistribution):
   """Kumaraswamy distribution.
 
-- 
GitLab


From 18995ecf1a0c4a161b296fbafe63289e90437807 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 4 Jun 2018 15:19:39 -0700
Subject: [PATCH 279/610] Adds update_ops to train_op for all heads.

PiperOrigin-RevId: 199203634
---
 tensorflow/contrib/estimator/BUILD            |  1 +
 .../estimator/python/estimator/head.py        |  1 +
 .../estimator/python/estimator/head_test.py   | 29 +++++++
 tensorflow/python/estimator/BUILD             |  1 +
 tensorflow/python/estimator/canned/head.py    | 11 +++
 .../python/estimator/canned/head_test.py      | 86 +++++++++++++++++++
 6 files changed, 129 insertions(+)

diff --git a/tensorflow/contrib/estimator/BUILD b/tensorflow/contrib/estimator/BUILD
index 47c7b7fc19..1937ffb583 100644
--- a/tensorflow/contrib/estimator/BUILD
+++ b/tensorflow/contrib/estimator/BUILD
@@ -312,6 +312,7 @@ py_test(
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:string_ops",
         "//tensorflow/python:training",
+        "//tensorflow/python:variables",
         "//tensorflow/python/estimator:metric_keys",
         "//tensorflow/python/estimator:model_fn",
         "//tensorflow/python/estimator:prediction_keys",
diff --git a/tensorflow/contrib/estimator/python/estimator/head.py b/tensorflow/contrib/estimator/python/estimator/head.py
index 8b97f86db1..b798769d2c 100644
--- a/tensorflow/contrib/estimator/python/estimator/head.py
+++ b/tensorflow/contrib/estimator/python/estimator/head.py
@@ -845,6 +845,7 @@ class _MultiLabelHead(head_lib._Head):  # pylint:disable=protected-access
         train_op = train_op_fn(regularized_training_loss)
       else:
         raise ValueError('train_op_fn and optimizer cannot both be None.')
+      train_op = head_lib._append_update_ops(train_op)  # pylint:disable=protected-access
       # Only summarize mean_loss for SUM reduction to preserve backwards
       # compatibility. Otherwise skip it to avoid unnecessary computation.
       if self._loss_reduction == losses.Reduction.SUM:
diff --git a/tensorflow/contrib/estimator/python/estimator/head_test.py b/tensorflow/contrib/estimator/python/estimator/head_test.py
index d6c158608b..b2b57fa06b 100644
--- a/tensorflow/contrib/estimator/python/estimator/head_test.py
+++ b/tensorflow/contrib/estimator/python/estimator/head_test.py
@@ -36,6 +36,7 @@ from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import string_ops
+from tensorflow.python.ops import variables
 from tensorflow.python.ops.losses import losses
 from tensorflow.python.platform import test
 from tensorflow.python.saved_model import signature_constants
@@ -989,6 +990,34 @@ class MultiLabelHead(test.TestCase):
           six.b('{0:s}{1:.3f}'.format(expected_train_result, expected_loss)),
           train_result)
 
+  def test_train_with_update_ops(self):
+    head = head_lib.multi_label_head(n_classes=2)
+
+    with ops.Graph().as_default():
+      w = variables.Variable(1)
+      update_op = w.assign_add(1)
+      ops.add_to_collection(ops.GraphKeys.UPDATE_OPS, update_op)
+
+      t = variables.Variable('')
+      expected_train_result = b'my_train_op'
+      def _train_op_fn(loss):
+        del loss
+        return t.assign(expected_train_result)
+
+      spec = head.create_estimator_spec(
+          features={'x': np.array(((42,),), dtype=np.int32)},
+          mode=model_fn.ModeKeys.TRAIN,
+          logits=np.array([[-10., 10.], [-15., 10.]], dtype=np.float32),
+          labels=np.array([[1, 0], [1, 1]], dtype=np.int64),
+          train_op_fn=_train_op_fn)
+
+      with self.test_session() as sess:
+        _initialize_variables(self, spec.scaffold)
+        sess.run(spec.train_op)
+        w_value, t_value = sess.run([w, t])
+        self.assertEqual(2, w_value)
+        self.assertEqual(expected_train_result, t_value)
+
   def test_train_with_regularization_losses(self):
     head = head_lib.multi_label_head(
         n_classes=2, loss_reduction=losses.Reduction.SUM_OVER_BATCH_SIZE)
diff --git a/tensorflow/python/estimator/BUILD b/tensorflow/python/estimator/BUILD
index 9c4d58b177..d538c6c415 100644
--- a/tensorflow/python/estimator/BUILD
+++ b/tensorflow/python/estimator/BUILD
@@ -709,6 +709,7 @@ py_test(
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:string_ops",
         "//tensorflow/python:training",
+        "//tensorflow/python:variables",
         "//tensorflow/python/feature_column",
         "//tensorflow/python/ops/losses",
         "//tensorflow/python/saved_model:signature_constants",
diff --git a/tensorflow/python/estimator/canned/head.py b/tensorflow/python/estimator/canned/head.py
index 04fe4d97e4..b74ef1015c 100644
--- a/tensorflow/python/estimator/canned/head.py
+++ b/tensorflow/python/estimator/canned/head.py
@@ -873,6 +873,7 @@ class _MultiClassHeadWithSoftmaxCrossEntropyLoss(_Head):
         train_op = train_op_fn(regularized_training_loss)
       else:
         raise ValueError('train_op_fn and optimizer cannot both be None.')
+      train_op = _append_update_ops(train_op)
       # Only summarize mean_loss for SUM reduction to preserve backwards
       # compatibility. Otherwise skip it to avoid unnecessary computation.
       if self._loss_reduction == losses.Reduction.SUM:
@@ -1244,6 +1245,7 @@ class _BinaryLogisticHeadWithSigmoidCrossEntropyLoss(_Head):
         train_op = train_op_fn(regularized_training_loss)
       else:
         raise ValueError('train_op_fn and optimizer cannot both be None.')
+      train_op = _append_update_ops(train_op)
       # Only summarize mean_loss for SUM reduction to preserve backwards
       # compatibility. Otherwise skip it to avoid unnecessary computation.
       if self._loss_reduction == losses.Reduction.SUM:
@@ -1506,6 +1508,7 @@ class _RegressionHeadWithMeanSquaredErrorLoss(_Head):
         train_op = train_op_fn(regularized_training_loss)
       else:
         raise ValueError('train_op_fn and optimizer cannot both be None.')
+      train_op = _append_update_ops(train_op)
       # Only summarize mean_loss for SUM reduction to preserve backwards
       # compatibility. Otherwise skip it to avoid unnecessary computation.
       if self._loss_reduction == losses.Reduction.SUM:
@@ -1533,6 +1536,14 @@ class _RegressionHeadWithMeanSquaredErrorLoss(_Head):
         train_op=train_op)
 
 
+def _append_update_ops(train_op):
+  """Returns `train_op` appending `UPDATE_OPS` collection if present."""
+  update_ops = ops.get_collection(ops.GraphKeys.UPDATE_OPS)
+  if update_ops:
+    return control_flow_ops.group(train_op, *update_ops)
+  return train_op
+
+
 def _assert_range(labels, n_classes, message=None):
   with ops.name_scope(None, 'assert_range', (labels,)):
     assert_less = check_ops.assert_less_equal(
diff --git a/tensorflow/python/estimator/canned/head_test.py b/tensorflow/python/estimator/canned/head_test.py
index ecca3e8b0d..08ce5ca8e8 100644
--- a/tensorflow/python/estimator/canned/head_test.py
+++ b/tensorflow/python/estimator/canned/head_test.py
@@ -39,6 +39,7 @@ from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import string_ops
+from tensorflow.python.ops import variables
 from tensorflow.python.ops.losses import losses
 from tensorflow.python.platform import test
 from tensorflow.python.saved_model import signature_constants
@@ -969,6 +970,35 @@ class MultiClassHeadWithSoftmaxCrossEntropyLoss(test.TestCase):
           six.b('{0:s}{1:.2f}'.format(expected_train_result, expected_loss)),
           train_result)
 
+  def test_train_with_update_ops(self):
+    n_classes = 3
+    head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(n_classes)
+
+    with ops.Graph().as_default():
+      w = variables.Variable(1)
+      update_op = w.assign_add(1)
+      ops.add_to_collection(ops.GraphKeys.UPDATE_OPS, update_op)
+
+      t = variables.Variable('')
+      expected_train_result = b'my_train_op'
+      def _train_op_fn(loss):
+        del loss
+        return t.assign(expected_train_result)
+
+      spec = head.create_estimator_spec(
+          features={'x': np.array(((42,),), dtype=np.int32)},
+          mode=model_fn.ModeKeys.TRAIN,
+          logits=np.array(((10, 0, 0), (0, 10, 0),), dtype=np.float32),
+          labels=np.array(((1,), (1,)), dtype=np.int64),
+          train_op_fn=_train_op_fn)
+
+      with self.test_session() as sess:
+        _initialize_variables(self, spec.scaffold)
+        sess.run(spec.train_op)
+        w_value, t_value = sess.run([w, t])
+        self.assertEqual(2, w_value)
+        self.assertEqual(expected_train_result, t_value)
+
   def test_train_summaries_with_head_name(self):
     n_classes = 3
     head = head_lib._multi_class_head_with_softmax_cross_entropy_loss(
@@ -2102,6 +2132,34 @@ class BinaryLogisticHeadWithSigmoidCrossEntropyLossTest(test.TestCase):
       self.assertAllClose(expected_loss, loss)
       self.assertEqual(expected_train_result, train_result)
 
+  def test_train_with_update_ops(self):
+    head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss()
+
+    with ops.Graph().as_default():
+      w = variables.Variable(1)
+      update_op = w.assign_add(1)
+      ops.add_to_collection(ops.GraphKeys.UPDATE_OPS, update_op)
+
+      t = variables.Variable('')
+      expected_train_result = b'my_train_op'
+      def _train_op_fn(loss):
+        del loss
+        return t.assign(expected_train_result)
+
+      spec = head.create_estimator_spec(
+          features={'x': np.array(((42,),), dtype=np.int32)},
+          mode=model_fn.ModeKeys.TRAIN,
+          logits=np.array(((45,), (-41,),), dtype=np.float32),
+          labels=np.array(((1,), (1,),), dtype=np.float64),
+          train_op_fn=_train_op_fn)
+
+      with self.test_session() as sess:
+        _initialize_variables(self, spec.scaffold)
+        sess.run(spec.train_op)
+        w_value, t_value = sess.run([w, t])
+        self.assertEqual(2, w_value)
+        self.assertEqual(expected_train_result, t_value)
+
   def test_train_summaries_with_head_name(self):
     head = head_lib._binary_logistic_head_with_sigmoid_cross_entropy_loss(
         name='some_binary_head')
@@ -3278,6 +3336,34 @@ class RegressionHead(test.TestCase):
       self.assertAllClose(expected_loss, loss)
       self.assertEqual(expected_train_result, train_result)
 
+  def test_train_with_update_ops(self):
+    head = head_lib._regression_head()
+
+    with ops.Graph().as_default():
+      w = variables.Variable(1)
+      update_op = w.assign_add(1)
+      ops.add_to_collection(ops.GraphKeys.UPDATE_OPS, update_op)
+
+      t = variables.Variable('')
+      expected_train_result = b'my_train_op'
+      def _train_op_fn(loss):
+        del loss
+        return t.assign(expected_train_result)
+
+      spec = head.create_estimator_spec(
+          features={'x': np.array(((42,),), dtype=np.int32)},
+          mode=model_fn.ModeKeys.TRAIN,
+          logits=np.array(((45,), (41,),), dtype=np.float32),
+          labels=np.array(((43.,), (44.,),), dtype=np.float64),
+          train_op_fn=_train_op_fn)
+
+      with self.test_session() as sess:
+        _initialize_variables(self, spec.scaffold)
+        sess.run(spec.train_op)
+        w_value, t_value = sess.run([w, t])
+        self.assertEqual(2, w_value)
+        self.assertEqual(expected_train_result, t_value)
+
   def test_train_summaries_with_head_name(self):
     head = head_lib._regression_head(name='some_regression_head')
     self.assertEqual(1, head.logits_dimension)
-- 
GitLab


From eab2e4d784036568de076317ee40b25dc19eb4a9 Mon Sep 17 00:00:00 2001
From: Suharsh Sivakumar <suharshs@google.com>
Date: Mon, 4 Jun 2018 15:30:59 -0700
Subject: [PATCH 280/610] nit: FlatBuffer -> FrozenGraph

PiperOrigin-RevId: 199205459
---
 tensorflow/contrib/lite/python/lite_test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/lite/python/lite_test.py b/tensorflow/contrib/lite/python/lite_test.py
index 5f8dfc0dc1..019a3a5f69 100644
--- a/tensorflow/contrib/lite/python/lite_test.py
+++ b/tensorflow/contrib/lite/python/lite_test.py
@@ -292,7 +292,7 @@ class FromSessionTest(test_util.TensorFlowTestCase):
     self.assertTrue(output_details[0]['quantization'][0] > 0)  # scale
 
 
-class FromFlatbufferFile(test_util.TensorFlowTestCase):
+class FromFrozenGraphFile(test_util.TensorFlowTestCase):
 
   def testFloat(self):
     in_tensor = array_ops.placeholder(
-- 
GitLab


From 69613d25c3f82652c636c5a1c1b42029dc427979 Mon Sep 17 00:00:00 2001
From: Skye Wanderman-Milne <skyewm@google.com>
Date: Mon, 4 Jun 2018 15:35:58 -0700
Subject: [PATCH 281/610] More handle_data fixing.

I'm not sure why our existing tests didn't catch this...

PiperOrigin-RevId: 199206183
---
 tensorflow/python/framework/function.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tensorflow/python/framework/function.py b/tensorflow/python/framework/function.py
index 259cab6699..79ee57355d 100644
--- a/tensorflow/python/framework/function.py
+++ b/tensorflow/python/framework/function.py
@@ -720,6 +720,8 @@ class _FuncGraph(ops.Graph):
     if ops._USE_C_SHAPES:
       if isinstance(tensor, ops.EagerTensor):
         handle_data = tensor._handle_data
+        if handle_data:
+          handle_data = handle_data.SerializeToString()
       else:
         handle_data = c_api.GetResourceHandleShapeAndType(
             tensor.graph._c_graph, tensor._as_tf_output())
-- 
GitLab


From cf01d118ef0762c0554611bef123bf4559071fbf Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 4 Jun 2018 15:51:17 -0700
Subject: [PATCH 282/610] Add support for kDomain parsing in HLO parser.

PiperOrigin-RevId: 199208527
---
 tensorflow/compiler/xla/service/BUILD         |  1 +
 .../compiler/xla/service/hlo_instruction.cc   | 10 ++--
 tensorflow/compiler/xla/service/hlo_parser.cc | 56 ++++++++++++++++++-
 .../compiler/xla/service/hlo_parser_test.cc   | 11 ++++
 4 files changed, 71 insertions(+), 7 deletions(-)

diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index c5b637419c..75961d49a5 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -2980,6 +2980,7 @@ cc_library(
     deps = [
         ":hlo",
         ":hlo_lexer",
+        ":hlo_sharding_metadata",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:statusor",
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc
index 4095b3d337..1c276b9305 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction.cc
@@ -2441,12 +2441,10 @@ std::vector<string> HloInstruction::ExtraAttributesToString(
     extra.push_back(StrCat("exponent_bits=", exponent_bits_));
     extra.push_back(StrCat("mantissa_bits=", mantissa_bits_));
   }
-  if (operand_side_metadata_ != nullptr) {
-    extra.push_back(
-        StrCat("operand_side=", operand_side_metadata_->ToString()));
-  }
-  if (user_side_metadata_ != nullptr) {
-    extra.push_back(StrCat("user_side=", user_side_metadata_->ToString()));
+  if (operand_side_metadata_ != nullptr && user_side_metadata_ != nullptr) {
+    extra.push_back(StrCat("domain={kind=\"", operand_side_metadata_->Kind(),
+                           "\", entry=", operand_side_metadata_->ToString(),
+                           ", exit=", user_side_metadata_->ToString(), "}"));
   }
   // By contract, we print the custom call target even if
   // options.print_subcomputation_mode() == kOff, because the call target is not
diff --git a/tensorflow/compiler/xla/service/hlo_parser.cc b/tensorflow/compiler/xla/service/hlo_parser.cc
index cefc6ff915..09c05c9821 100644
--- a/tensorflow/compiler/xla/service/hlo_parser.cc
+++ b/tensorflow/compiler/xla/service/hlo_parser.cc
@@ -16,7 +16,9 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_parser.h"
 
 #include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/hlo_domain_metadata.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/hlo_sharding_metadata.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
@@ -107,6 +109,12 @@ class HloParser {
     std::vector<tensorflow::int64> strides;
   };
 
+  // The data parsed for the kDomain instruction.
+  struct DomainData {
+    std::unique_ptr<DomainMetadata> entry_metadata;
+    std::unique_ptr<DomainMetadata> exit_metadata;
+  };
+
   // Types of attributes.
   enum class AttrTy {
     kInt64,
@@ -125,6 +133,7 @@ class HloParser {
     kMetadata,
     kFusionKind,
     kDistribution,
+    kDomain,
   };
 
   struct AttrConfig {
@@ -181,6 +190,9 @@ class HloParser {
   bool ParseSharding(OpSharding* sharding);
   bool ParseSingleSharding(OpSharding* sharding, bool lbrace_pre_lexed);
 
+  // Parses the metadata behind a kDomain instruction.
+  bool ParseDomain(DomainData* domain);
+
   // Parses a sub-attribute of the window attribute, e.g.,size=1x2x3.
   bool ParseDxD(const string& name, std::vector<tensorflow::int64>* result);
   // Parses window's pad sub-attriute, e.g., pad=0_0x3x3.
@@ -492,7 +504,6 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
     case HloOpcode::kClz:
     case HloOpcode::kCopy:
     case HloOpcode::kCos:
-    case HloOpcode::kDomain:
     case HloOpcode::kExp:
     case HloOpcode::kExpm1:
     case HloOpcode::kImag:
@@ -1106,6 +1117,18 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
           dim_numbers, *window_bounds));
       break;
     }
+    case HloOpcode::kDomain: {
+      DomainData domain;
+      attrs["domain"] = {/*required=*/true, AttrTy::kDomain, &domain};
+      if (!ParseOperands(&operands, /*expected_size=*/1) ||
+          !ParseAttributes(attrs)) {
+        return false;
+      }
+      instruction = builder->AddInstruction(HloInstruction::CreateDomain(
+          shape, operands[0], std::move(domain.entry_metadata),
+          std::move(domain.exit_metadata)));
+      break;
+    }
     case HloOpcode::kTrace:
       return TokenError(StrCat("parsing not yet implemented for op: ",
                                HloOpcodeString(opcode)));
@@ -1293,6 +1316,34 @@ bool HloParser::ParseSingleSharding(OpSharding* sharding,
   return true;
 }
 
+// domain ::= '{' 'kind=' domain_kind ',' 'entry=' entry_sharding ','
+//            'exit=' exit_sharding '}'
+bool HloParser::ParseDomain(DomainData* domain) {
+  std::unordered_map<string, AttrConfig> attrs;
+  optional<string> kind;
+  optional<OpSharding> entry_sharding;
+  optional<OpSharding> exit_sharding;
+  attrs["kind"] = {/*required=*/true, AttrTy::kString, &kind};
+  attrs["entry"] = {/*required=*/true, AttrTy::kSharding, &entry_sharding};
+  attrs["exit"] = {/*required=*/true, AttrTy::kSharding, &exit_sharding};
+  if (!ParseSubAttributes(attrs)) {
+    return false;
+  }
+  if (*kind == ShardingMetadata::KindName()) {
+    auto entry_sharding_ptr = MakeUnique<HloSharding>(
+        HloSharding::FromProto(*entry_sharding).ValueOrDie());
+    auto exit_sharding_ptr = MakeUnique<HloSharding>(
+        HloSharding::FromProto(*exit_sharding).ValueOrDie());
+    domain->entry_metadata =
+        MakeUnique<ShardingMetadata>(std::move(entry_sharding_ptr));
+    domain->exit_metadata =
+        MakeUnique<ShardingMetadata>(std::move(exit_sharding_ptr));
+  } else {
+    return TokenError(StrCat("unsupported domain kind: ", *kind));
+  }
+  return true;
+}
+
 // '{' name+ '}'
 bool HloParser::ParseInstructionNames(
     std::vector<HloInstruction*>* instructions) {
@@ -2043,6 +2094,9 @@ bool HloParser::ParseAttributeHelper(
             ->emplace(result);
         return true;
       }
+      case AttrTy::kDomain: {
+        return ParseDomain(static_cast<DomainData*>(attr_out_ptr));
+      }
     }
   }();
   if (!success) {
diff --git a/tensorflow/compiler/xla/service/hlo_parser_test.cc b/tensorflow/compiler/xla/service/hlo_parser_test.cc
index 9a18b4f845..84a981675f 100644
--- a/tensorflow/compiler/xla/service/hlo_parser_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_parser_test.cc
@@ -234,6 +234,17 @@ ENTRY %ShardedTupleCreate.v4 (v1: f32[], v2: f32[3], v3: f32[2,3]) -> (f32[], f3
   ROOT %tuple = (f32[], f32[3]{0}, f32[2,3]{1,0}) tuple(f32[] %v1, f32[3]{0} %v2, f32[2,3]{1,0} %v3), sharding={{replicated}, {maximal device=0}, {replicated}}
 }
 
+)"
+},
+{
+"DomainParsing",
+R"(HloModule DomainParsing_module
+
+ENTRY %DomainParsing (v1: f32[]) -> f32[] {
+  %v1 = f32[] parameter(0)
+  ROOT %dom = f32[] domain(f32[] %v1), domain={kind="sharding", entry={maximal device=0}, exit={maximal device=1}}
+}
+
 )"
 },
 // int32 result = 0;
-- 
GitLab


From 14d4d1634dd2bd70ebc1629bc27354309bce0cb4 Mon Sep 17 00:00:00 2001
From: Mark Heffernan <meheff@google.com>
Date: Mon, 4 Jun 2018 16:41:46 -0700
Subject: [PATCH 283/610] Add TOKEN primitive type. The token type will be
 threaded through side-effecting ops to order them. Subsequent cls will add
 new opcodes and change side effecting operations to support this ordering.

This CL also does some cleanup in shape_util and layout_util where we have assumed that shapes are either arrays or tuples.

PiperOrigin-RevId: 199215963
---
 tensorflow/compiler/xla/layout_util.cc      |  53 ++--
 tensorflow/compiler/xla/layout_util_test.cc |  51 ++++
 tensorflow/compiler/xla/shape_util.cc       | 263 ++++++++++++--------
 tensorflow/compiler/xla/shape_util.h        |  26 +-
 tensorflow/compiler/xla/shape_util_test.cc  |  49 +++-
 tensorflow/compiler/xla/xla_data.proto      |  11 +-
 6 files changed, 304 insertions(+), 149 deletions(-)

diff --git a/tensorflow/compiler/xla/layout_util.cc b/tensorflow/compiler/xla/layout_util.cc
index 89cafa1a7d..e8f29b8329 100644
--- a/tensorflow/compiler/xla/layout_util.cc
+++ b/tensorflow/compiler/xla/layout_util.cc
@@ -98,8 +98,13 @@ Layout CreateDefaultLayoutForRank(int64 rank) {
 }  // namespace
 
 /* static */ Layout LayoutUtil::GetDefaultLayoutForShape(const Shape& shape) {
+  if (ShapeUtil::IsOpaque(shape) || ShapeUtil::IsToken(shape)) {
+    // Opaque and token types have empty layouts.
+    return Layout();
+  }
+
   // A Layout proto corresponds to a single array, not a tuple.
-  DCHECK(!ShapeUtil::IsTuple(shape));
+  CHECK(ShapeUtil::IsArray(shape));
   return CreateDefaultLayoutForRank(shape.dimensions_size());
 }
 
@@ -126,14 +131,15 @@ Layout CreateDefaultLayoutForRank(int64 rank) {
       SetToDefaultLayout(&element_shape);
     }
     shape->clear_layout();
-  } else if (ShapeUtil::IsOpaque(*shape)) {
-    shape->clear_layout();
-  } else {
+  } else if (ShapeUtil::IsArray(*shape)) {
     shape->mutable_layout()->set_format(DENSE);
     tensorflow::protobuf::RepeatedField<tensorflow::protobuf_int64>*
         minor_to_major = shape->mutable_layout()->mutable_minor_to_major();
     minor_to_major->Resize(shape->dimensions_size(), 0);
     SetDefaultLayoutToContainer(minor_to_major);
+  } else {
+    // Opaque, token types etc. have no layout.
+    shape->clear_layout();
   }
 }
 
@@ -160,18 +166,20 @@ Layout CreateDefaultLayoutForRank(int64 rank) {
       TF_RETURN_IF_ERROR(ValidateLayoutInShape(element_shape));
     }
     return Status::OK();
-  } else if (ShapeUtil::IsOpaque(shape)) {
-    if (shape.has_layout()) {
-      return InvalidArgument("opaque should not have a layout field");
-    }
-    return Status::OK();
-  } else {
-    // Array shape.
+  } else if (ShapeUtil::IsArray(shape)) {
     if (!shape.has_layout()) {
       return InvalidArgument("shape %s does not have a layout",
                              ShapeUtil::HumanString(shape).c_str());
     }
     return ValidateLayoutForShape(shape.layout(), shape);
+  } else {
+    // Token, opaque, etc. shape.
+    if (shape.has_layout()) {
+      return InvalidArgument(
+          "shape of primitive type %s should not have a layout",
+          PrimitiveType_Name(shape.element_type()).c_str());
+    }
+    return Status::OK();
   }
 }
 
@@ -181,8 +189,10 @@ Layout CreateDefaultLayoutForRank(int64 rank) {
     return InvalidArgument("a single Layout is not valid for tuple shapes");
   }
 
-  if (ShapeUtil::IsOpaque(shape)) {
-    return Status::OK();
+  if (!ShapeUtil::IsArray(shape)) {
+    return InvalidArgument(
+        "shape of primitive type %s should not have a layout",
+        PrimitiveType_Name(shape.element_type()).c_str());
   }
 
   if (layout.format() == INVALID_FORMAT) {
@@ -273,7 +283,7 @@ Layout CreateDefaultLayoutForRank(int64 rank) {
 }
 
 /* static */ bool LayoutUtil::IsPadded(const Shape& shape) {
-  if (ShapeUtil::IsTuple(shape) || !HasLayout(shape) ||
+  if (!ShapeUtil::IsArray(shape) || !HasLayout(shape) ||
       shape.layout().padded_dimensions_size() == 0) {
     return false;
   }
@@ -323,7 +333,8 @@ Layout CreateDefaultLayoutForRank(int64 rank) {
     // Tuple shape: all subshapes must have a layout.
     return std::all_of(shape.tuple_shapes().begin(), shape.tuple_shapes().end(),
                        [](const Shape& s) { return HasLayout(s); });
-  } else if (ShapeUtil::IsOpaque(shape)) {
+  } else if (!ShapeUtil::IsArray(shape)) {
+    // Opaque, token types etc. ignore layout.
     return true;
   }
   return shape.has_layout() && shape.layout().format() != INVALID_FORMAT;
@@ -432,12 +443,9 @@ Status LayoutUtil::CopyLayoutBetweenShapes(const Shape& src, Shape* dst) {
 
 /* static */ bool LayoutUtil::LayoutsInShapesEqual(const Shape& lhs,
                                                    const Shape& rhs) {
-  if (ShapeUtil::IsTuple(lhs) != ShapeUtil::IsTuple(rhs)) {
-    return false;
-  }
   if (ShapeUtil::IsTuple(lhs)) {
-    if (ShapeUtil::TupleElementCount(lhs) !=
-        ShapeUtil::TupleElementCount(rhs)) {
+    if (!ShapeUtil::IsTuple(rhs) || ShapeUtil::TupleElementCount(lhs) !=
+                                        ShapeUtil::TupleElementCount(rhs)) {
       return false;
     }
     for (int i = 0; i < ShapeUtil::TupleElementCount(lhs); ++i) {
@@ -446,9 +454,12 @@ Status LayoutUtil::CopyLayoutBetweenShapes(const Shape& src, Shape* dst) {
       }
     }
     return true;
-  } else {
+  } else if (ShapeUtil::IsArray(lhs)) {
     return ShapeUtil::Rank(lhs) == ShapeUtil::Rank(rhs) &&
            LayoutUtil::Equal(lhs.layout(), rhs.layout());
+  } else {
+    // Layouts of non-array and non-tuple shapes are ignored.
+    return true;
   }
 }
 
diff --git a/tensorflow/compiler/xla/layout_util_test.cc b/tensorflow/compiler/xla/layout_util_test.cc
index 4fd1d818e3..e4c825450d 100644
--- a/tensorflow/compiler/xla/layout_util_test.cc
+++ b/tensorflow/compiler/xla/layout_util_test.cc
@@ -218,6 +218,47 @@ TEST_F(LayoutUtilTest, CopyLayoutBogusLayout) {
                                "elements, but shape is rank"));
 }
 
+TEST_F(LayoutUtilTest, CopyTokenLayout) {
+  Shape src = ShapeUtil::MakeTokenShape();
+  Shape dst = ShapeUtil::MakeTokenShape();
+
+  // Layouts are trivially the same for token types and copying layouts should
+  // be a nop.
+  EXPECT_TRUE(LayoutUtil::LayoutsInShapesEqual(src, dst));
+  EXPECT_IS_OK(LayoutUtil::CopyLayoutBetweenShapes(src, &dst));
+  EXPECT_TRUE(LayoutUtil::LayoutsInShapesEqual(src, dst));
+}
+
+TEST_F(LayoutUtilTest, CopyOpaqueLayout) {
+  Shape src = ShapeUtil::MakeOpaqueShape();
+  Shape dst = ShapeUtil::MakeOpaqueShape();
+
+  // Layouts are trivially the same for opaque types and copying layouts should
+  // be a nop.
+  EXPECT_TRUE(LayoutUtil::LayoutsInShapesEqual(src, dst));
+  EXPECT_IS_OK(LayoutUtil::CopyLayoutBetweenShapes(src, &dst));
+  EXPECT_TRUE(LayoutUtil::LayoutsInShapesEqual(src, dst));
+}
+
+TEST_F(LayoutUtilTest, CopyTupleLayoutWithTokenAndOpaque) {
+  Shape src = ShapeUtil::MakeTupleShape(
+      {MakeShapeWithLayout(F32, {2, 3}, {0, 1}),
+       MakeShapeWithLayout(F32, {42, 123}, {1, 0}), ShapeUtil::MakeTokenShape(),
+       ShapeUtil::MakeTupleShape(
+           {ShapeUtil::MakeOpaqueShape(), MakeShapeWithLayout(F32, {}, {}),
+            MakeShapeWithLayout(F32, {1, 2, 3}, {0, 2, 1})})});
+  Shape dst = ShapeUtil::MakeTupleShape(
+      {MakeShapeWithLayout(F32, {2, 3}, {1, 0}),
+       MakeShapeWithLayout(F32, {42, 123}, {1, 0}), ShapeUtil::MakeTokenShape(),
+       ShapeUtil::MakeTupleShape(
+           {ShapeUtil::MakeOpaqueShape(), MakeShapeWithLayout(F32, {}, {}),
+            MakeShapeWithLayout(F32, {1, 2, 3}, {1, 2, 0})})});
+
+  EXPECT_FALSE(LayoutUtil::LayoutsInShapesEqual(src, dst));
+  EXPECT_IS_OK(LayoutUtil::CopyLayoutBetweenShapes(src, &dst));
+  EXPECT_TRUE(LayoutUtil::LayoutsInShapesEqual(src, dst));
+}
+
 TEST_F(LayoutUtilTest, ClearLayoutTuple) {
   Shape shape = ShapeUtil::MakeTupleShape(
       {MakeShapeWithLayout(F32, {2, 3}, {1, 0}),
@@ -236,6 +277,16 @@ TEST_F(LayoutUtilTest, ClearLayoutTuple) {
   EXPECT_FALSE(shape.tuple_shapes(2).tuple_shapes(1).has_layout());
 }
 
+TEST_F(LayoutUtilTest, ClearLayoutOpaqueAndToken) {
+  // Opaque and token types trivially have layouts.
+  for (Shape shape :
+       {ShapeUtil::MakeOpaqueShape(), ShapeUtil::MakeTokenShape()}) {
+    EXPECT_TRUE(LayoutUtil::HasLayout(shape));
+    LayoutUtil::ClearLayout(&shape);
+    EXPECT_TRUE(LayoutUtil::HasLayout(shape));
+  }
+}
+
 TEST_F(LayoutUtilTest, SetToDefaultLayoutTuple) {
   Shape shape = ShapeUtil::MakeTupleShape(
       {MakeShapeWithLayout(F32, {2, 3, 4}, {1, 0, 2}),
diff --git a/tensorflow/compiler/xla/shape_util.cc b/tensorflow/compiler/xla/shape_util.cc
index e8a28d76e9..ce4d0079ee 100644
--- a/tensorflow/compiler/xla/shape_util.cc
+++ b/tensorflow/compiler/xla/shape_util.cc
@@ -27,7 +27,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/primitive_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/gtl/iterator_range.h"
@@ -42,17 +41,18 @@ limitations under the License.
 
 namespace xla {
 
+using ::tensorflow::strings::StrAppend;
+using ::tensorflow::strings::StrCat;
+
 string ShapeIndex::ToString() const {
-  return tensorflow::strings::StrCat(
-      "{", tensorflow::str_util::Join(indices_, ","), "}");
+  return StrCat("{", tensorflow::str_util::Join(indices_, ","), "}");
 }
 
 string ShapeIndexView::ToString() const {
-  return tensorflow::strings::StrCat(
-      "{",
-      tensorflow::str_util::Join(tensorflow::gtl::make_range(begin_, end_),
-                                 ","),
-      "}");
+  return StrCat("{",
+                tensorflow::str_util::Join(
+                    tensorflow::gtl::make_range(begin_, end_), ","),
+                "}");
 }
 
 bool ShapeIndexView::operator==(const ShapeIndexView& other) const {
@@ -84,18 +84,30 @@ std::ostream& operator<<(std::ostream& out, const ShapeIndexView& shape_index) {
 
 namespace {
 
+// Returns whether the given primitive type corresponds to an array shape.
+bool IsArrayPrimitiveType(PrimitiveType primitive_type) {
+  return primitive_type != PRIMITIVE_TYPE_INVALID && primitive_type != TUPLE &&
+         primitive_type != OPAQUE && primitive_type != TOKEN;
+}
+
 // Recursive helper for comparing the equality of two shapes. Returns true if
 // the shapes are the same. If compare_layouts is true, then layouts must also
 // match.
 bool CompareShapes(const Shape& lhs, const Shape& rhs, bool compare_layouts) {
-  if (ShapeUtil::IsTuple(lhs) || ShapeUtil::IsTuple(rhs)) {
-    return ShapeUtil::IsTuple(lhs) && ShapeUtil::IsTuple(rhs) &&
-           ContainersEqual(lhs.tuple_shapes(), rhs.tuple_shapes(),
+  if (!ShapeUtil::SameElementType(lhs, rhs)) {
+    VLOG(3) << "CompareShapes: lhs element type != rhs element type";
+    return false;
+  }
+
+  if (ShapeUtil::IsTuple(lhs)) {
+    return ContainersEqual(lhs.tuple_shapes(), rhs.tuple_shapes(),
                            [=](const Shape& l, const Shape& r) {
                              return CompareShapes(l, r, compare_layouts);
                            });
-  } else if (ShapeUtil::IsOpaque(lhs) || ShapeUtil::IsOpaque(rhs)) {
-    return ShapeUtil::IsOpaque(lhs) && ShapeUtil::IsOpaque(rhs);
+  } else if (!ShapeUtil::IsArray(lhs)) {
+    // Non-tuple, non-array types such as opaque and token types are trivially
+    // the same.
+    return true;
   }
 
   if (compare_layouts) {
@@ -125,10 +137,6 @@ bool CompareShapes(const Shape& lhs, const Shape& rhs, bool compare_layouts) {
     VLOG(3) << "CompareShapes: lhs dimensions != rhs dimensions";
     return false;
   }
-  if (!ShapeUtil::SameElementType(lhs, rhs)) {
-    VLOG(3) << "CompareShapes: lhs element type != rhs element type";
-    return false;
-  }
   return true;
 }
 
@@ -171,8 +179,8 @@ StatusOr<Shape> MakeShapeWithLayoutInternal(
 }
 
 /* static */ int64 ShapeUtil::Rank(const Shape& shape) {
-  CHECK(!ShapeUtil::IsTuple(shape))
-      << "Tuples do not have a rank, shape: " << shape;
+  CHECK(ShapeUtil::IsArray(shape))
+      << "Non-arrays do not have a rank, shape: " << shape;
   return shape.dimensions_size();
 }
 
@@ -199,8 +207,7 @@ StatusOr<Shape> MakeShapeWithLayoutInternal(
 
 /* static */ Shape ShapeUtil::MakeShape(
     PrimitiveType element_type, tensorflow::gtl::ArraySlice<int64> dimensions) {
-  DCHECK_NE(TUPLE, element_type);
-  DCHECK_NE(OPAQUE, element_type);
+  CHECK(IsArrayPrimitiveType(element_type));
   Shape result;
   PopulateShape(element_type, dimensions, &result);
   return result;
@@ -223,8 +230,7 @@ StatusOr<Shape> MakeShapeWithLayoutInternal(
 /* static */ Shape ShapeUtil::MakeShapeWithSparseLayout(
     PrimitiveType element_type, tensorflow::gtl::ArraySlice<int64> dimensions,
     int64 max_sparse_elements) {
-  DCHECK_NE(TUPLE, element_type);
-  DCHECK_NE(OPAQUE, element_type);
+  CHECK(IsArrayPrimitiveType(element_type));
   Shape shape = ShapeUtil::MakeShape(element_type, dimensions);
   *shape.mutable_layout() = LayoutUtil::MakeSparseLayout(max_sparse_elements);
   TF_DCHECK_OK(ShapeUtil::ValidateShape(shape));
@@ -271,6 +277,13 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(
   return result;
 }
 
+/* static */ Shape ShapeUtil::MakeTokenShape() {
+  Shape result;
+  result.set_element_type(TOKEN);
+  TF_DCHECK_OK(ValidateShapeWithOptionalLayout(result));
+  return result;
+}
+
 /* static */ void ShapeUtil::AppendShapeToTuple(const Shape& shape,
                                                 Shape* tuple_shape) {
   TF_DCHECK_OK(ValidateShapeWithOptionalLayout(shape));
@@ -294,7 +307,7 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(
 }
 
 /* static */ bool ShapeUtil::ElementHasBitWidth(const Shape& shape, int bits) {
-  if (shape.element_type() == TUPLE || shape.element_type() == OPAQUE) {
+  if (!IsArray(shape)) {
     return false;
   }
   return primitive_util::BitWidth(shape.element_type()) == bits;
@@ -320,6 +333,7 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(
     case C64:
     case TUPLE:
     case OPAQUE:
+    case TOKEN:
       return false;
 
     default:
@@ -335,6 +349,10 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(
   return primitive_util::IsFloatingPointType(shape.element_type());
 }
 
+/* static */ bool ShapeUtil::IsArray(const Shape& shape) {
+  return IsArrayPrimitiveType(shape.element_type());
+}
+
 /* static */ bool ShapeUtil::IsNestedTuple(const Shape& shape) {
   return IsTuple(shape) && std::any_of(shape.tuple_shapes().begin(),
                                        shape.tuple_shapes().end(), IsTuple);
@@ -388,7 +406,7 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(
 }
 
 /* static */ int64 ShapeUtil::ElementsIn(const Shape& shape) {
-  CHECK(!IsTuple(shape)) << ShapeUtil::HumanString(shape);
+  CHECK(IsArray(shape)) << ShapeUtil::HumanString(shape);
   CHECK_EQ(shape.dimensions_size(), Rank(shape));
   return std::accumulate<decltype(shape.dimensions().begin()), int64>(
       shape.dimensions().begin(), shape.dimensions().end(), 1LL,
@@ -403,23 +421,6 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(
   return shape.element_type() == F32 && Rank(shape) == 0;
 }
 
-/* static */ string ShapeUtil::HumanString(const Shape& shape) {
-  if (IsTuple(shape)) {
-    string text = "(";
-    const char* prefix = "";
-    for (const Shape& elem_shape : shape.tuple_shapes()) {
-      tensorflow::strings::StrAppend(&text, prefix, HumanString(elem_shape));
-      prefix = ", ";
-    }
-    text += ")";
-    return text;
-  } else {
-    return tensorflow::strings::StrCat(
-        tensorflow::str_util::Lowercase(
-            PrimitiveType_Name(shape.element_type())),
-        "[", tensorflow::str_util::Join(shape.dimensions(), ","), "]");
-  }
-}
 
 namespace {
 
@@ -470,48 +471,56 @@ StatusOr<PrimitiveType> StringToPrimitiveType(const string& name) {
 
 }  // namespace
 
-/* static */ string ShapeUtil::HumanStringWithLayout(const Shape& shape) {
+/* static */ string ShapeUtil::HumanString(const Shape& shape) {
   if (IsTuple(shape)) {
     string text = "(";
     const char* prefix = "";
     for (const Shape& elem_shape : shape.tuple_shapes()) {
-      tensorflow::strings::StrAppend(&text, prefix,
-                                     HumanStringWithLayout(elem_shape));
+      StrAppend(&text, prefix, HumanString(elem_shape));
       prefix = ", ";
     }
     text += ")";
     return text;
-  } else {
-    string result = tensorflow::strings::StrCat(
-        LowercasePrimitiveTypeName(shape.element_type()), "[");
-    for (int i = 0; i < shape.dimensions().size(); i++) {
-      tensorflow::strings::StrAppend(&result, (i > 0) ? "," : "",
-                                     shape.dimensions(i));
+  }
+  return StrCat(LowercasePrimitiveTypeName(shape.element_type()), "[",
+                tensorflow::str_util::Join(shape.dimensions(), ","), "]");
+}
+
+/* static */ string ShapeUtil::HumanStringWithLayout(const Shape& shape) {
+  if (IsTuple(shape)) {
+    string text = "(";
+    const char* prefix = "";
+    for (const Shape& elem_shape : shape.tuple_shapes()) {
+      StrAppend(&text, prefix, HumanStringWithLayout(elem_shape));
+      prefix = ", ";
     }
-    result += "]";
-    if (!IsScalar(shape) && !IsOpaque(shape)) {
-      if (LayoutUtil::HasLayout(shape)) {
-        tensorflow::strings::StrAppend(&result,
-                                       LayoutUtil::HumanString(shape.layout()));
-      }
+    text += ")";
+    return text;
+  }
+  string result = StrCat(LowercasePrimitiveTypeName(shape.element_type()), "[");
+  for (int i = 0; i < shape.dimensions().size(); i++) {
+    StrAppend(&result, (i > 0) ? "," : "", shape.dimensions(i));
+  }
+  result += "]";
+  if (!IsScalar(shape) && IsArray(shape)) {
+    if (LayoutUtil::HasLayout(shape)) {
+      StrAppend(&result, LayoutUtil::HumanString(shape.layout()));
     }
-    return result;
   }
+  return result;
 }
 
 /* static */ string ShapeUtil::HumanString(const ProgramShape& program_shape) {
   std::vector<string> parameters;
   for (auto& shape : program_shape.parameters()) {
     const int i = parameters.size();
-    parameters.push_back(
-        tensorflow::strings::StrCat(i < program_shape.parameter_names_size()
-                                        ? program_shape.parameter_names(i)
-                                        : "(unknown)",
-                                    ": ", HumanString(shape)));
+    parameters.push_back(StrCat(i < program_shape.parameter_names_size()
+                                    ? program_shape.parameter_names(i)
+                                    : "(unknown)",
+                                ": ", HumanString(shape)));
   }
-  return tensorflow::strings::StrCat(
-      "(", tensorflow::str_util::Join(parameters, ", "), ") -> ",
-      HumanString(program_shape.result()));
+  return StrCat("(", tensorflow::str_util::Join(parameters, ", "), ") -> ",
+                HumanString(program_shape.result()));
 }
 
 namespace {
@@ -581,14 +590,17 @@ StatusOr<Shape> ParseShapeStringInternal(tensorflow::StringPiece* s) {
     // Extract the primitive element type.
     TF_ASSIGN_OR_RETURN(const PrimitiveType primitive_type,
                         StringToPrimitiveType(element_type_string));
-    if (primitive_type == PRIMITIVE_TYPE_INVALID || primitive_type == TUPLE ||
-        primitive_type == OPAQUE) {
+    if (primitive_type == PRIMITIVE_TYPE_INVALID || primitive_type == TUPLE) {
       return InvalidArgument("Invalid element type string: \"%s\".",
                              element_type_string.c_str());
     }
 
     Shape result;
-    if (format_string.empty() && layout_string.empty()) {
+    if (primitive_type == OPAQUE) {
+      result = ShapeUtil::MakeOpaqueShape();
+    } else if (primitive_type == TOKEN) {
+      result = ShapeUtil::MakeTokenShape();
+    } else if (format_string.empty() && layout_string.empty()) {
       // Create a shape without a layout set.
       result = ShapeUtil::MakeShape(primitive_type, dimensions);
     } else if (format_string == "sparse") {
@@ -633,43 +645,44 @@ StatusOr<Shape> ParseShapeStringInternal(tensorflow::StringPiece* s) {
 }
 
 /* static */ bool ShapeUtil::Compatible(const Shape& lhs, const Shape& rhs) {
-  if (lhs.element_type() == TUPLE) {
+  if (IsArray(lhs)) {
+    return SameElementType(lhs, rhs) && SameDimensions(lhs, rhs);
+  } else if (lhs.element_type() == TUPLE) {
     return rhs.element_type() == TUPLE &&
            ContainersEqual(lhs.tuple_shapes(), rhs.tuple_shapes(), Compatible);
+  } else {
+    // Opaque, token, etc types are vacuously compatible.
+    return true;
   }
-  if (lhs.element_type() == OPAQUE) {
-    return rhs.element_type() == OPAQUE;
-  }
-  return SameElementType(lhs, rhs) && SameDimensions(lhs, rhs);
 }
 
 /* static */ bool ShapeUtil::CompatibleIgnoringElementType(const Shape& lhs,
                                                            const Shape& rhs) {
-  if (lhs.element_type() == TUPLE) {
+  if (IsArray(lhs)) {
+    return IsArray(rhs) && SameDimensions(lhs, rhs);
+  } else if (lhs.element_type() == TUPLE) {
     return rhs.element_type() == TUPLE &&
            ContainersEqual(lhs.tuple_shapes(), rhs.tuple_shapes(),
                            CompatibleIgnoringElementType);
+  } else {
+    // Opaque, token, etc types are vacuously compatible.
+    return true;
   }
-  if (lhs.element_type() == OPAQUE) {
-    return rhs.element_type() == OPAQUE;
-  }
-  return ShapeUtil::IsArray(rhs) && SameDimensions(lhs, rhs);
 }
 
 /* static */ bool ShapeUtil::CompatibleIgnoringFpPrecision(const Shape& lhs,
                                                            const Shape& rhs) {
-  if (lhs.element_type() == TUPLE) {
+  if (IsArray(lhs)) {
+    return IsArray(rhs) && SameElementTypeIgnoringFpPrecision(lhs, rhs) &&
+           CompatibleIgnoringElementType(lhs, rhs);
+  } else if (lhs.element_type() == TUPLE) {
     return rhs.element_type() == TUPLE &&
            ContainersEqual(lhs.tuple_shapes(), rhs.tuple_shapes(),
                            CompatibleIgnoringFpPrecision);
+  } else {
+    // Opaque, token, etc types are vacuously compatible.
+    return true;
   }
-  if (lhs.element_type() == OPAQUE) {
-    return rhs.element_type() == OPAQUE;
-  }
-  if (SameElementTypeIgnoringFpPrecision(lhs, rhs)) {
-    return CompatibleIgnoringElementType(lhs, rhs);
-  }
-  return false;
 }
 
 /* static */ int64 ShapeUtil::GetDimension(const Shape& shape,
@@ -691,10 +704,6 @@ StatusOr<Shape> ParseShapeStringInternal(tensorflow::StringPiece* s) {
   switch (primitive_type) {
     case PRED:
       return sizeof(int8);
-    case TUPLE:
-      LOG(FATAL) << "tuples have no definitive size";
-    case OPAQUE:
-      LOG(FATAL) << "opaque have no definitive size";
     case S8:
       return sizeof(int8);
     case S16:
@@ -721,6 +730,13 @@ StatusOr<Shape> ParseShapeStringInternal(tensorflow::StringPiece* s) {
       return sizeof(double);
     case C64:
       return sizeof(complex64);
+    case TOKEN:
+      // Tokens require no space.
+      return 0;
+    case TUPLE:
+    case OPAQUE:
+      LOG(FATAL) << PrimitiveType_Name(primitive_type)
+                 << " primitive type has no definitive size";
     default:
       LOG(FATAL) << "Unhandled primitive type " << primitive_type;
   }
@@ -729,28 +745,32 @@ StatusOr<Shape> ParseShapeStringInternal(tensorflow::StringPiece* s) {
 /* static */ int64 ShapeUtil::ByteSizeOf(const Shape& shape,
                                          int64 pointer_size) {
   TF_DCHECK_OK(ValidateShape(shape));
-  DCHECK_NE(OPAQUE, shape.element_type());
   if (shape.element_type() == TUPLE) {
     return ByteSizeOfTupleIndexTable(shape, pointer_size);
+  } else if (IsArray(shape)) {
+    int64 byte_size = ByteSizeOfElements(shape);
+    if (LayoutUtil::IsSparseArray(shape)) {
+      byte_size += ByteSizeOfSparseIndices(shape);
+    }
+    return byte_size;
+  } else if (shape.element_type() == TOKEN) {
+    return 0;
   }
-  int64 byte_size = ByteSizeOfElements(shape);
-  if (LayoutUtil::IsSparseArray(shape)) {
-    byte_size += ByteSizeOfSparseIndices(shape);
-  }
-  return byte_size;
+  LOG(FATAL) << PrimitiveType_Name(shape.element_type())
+             << " primitive type has no definitive size";
 }
 
 /* static */ int64 ShapeUtil::ByteSizeOfTupleIndexTable(const Shape& shape,
                                                         int64 pointer_size) {
   TF_DCHECK_OK(ValidateShape(shape));
-  DCHECK_EQ(TUPLE, shape.element_type());
+  CHECK_EQ(TUPLE, shape.element_type());
   CHECK_GT(pointer_size, 0);
   return pointer_size * shape.tuple_shapes_size();
 }
 
 /* static */ int64 ShapeUtil::ByteSizeOfElements(const Shape& shape) {
   TF_DCHECK_OK(ValidateShape(shape));
-  DCHECK(ShapeUtil::IsArray(shape));
+  CHECK(ShapeUtil::IsArray(shape));
   int64 allocated_element_count;
 
   if (LayoutUtil::IsSparseArray(shape)) {
@@ -775,13 +795,17 @@ StatusOr<Shape> ParseShapeStringInternal(tensorflow::StringPiece* s) {
 
 /* static */ int64 ShapeUtil::ByteSizeOfSparseIndices(const Shape& shape) {
   TF_DCHECK_OK(ValidateShape(shape));
-  DCHECK(LayoutUtil::IsSparseArray(shape));
+  CHECK(LayoutUtil::IsSparseArray(shape));
   return LayoutUtil::MaxSparseElements(shape.layout()) *
          ShapeUtil::Rank(shape) * sizeof(int64);
 }
 
 /* static */ Status ShapeUtil::ValidateShapeWithOptionalLayoutInternal(
     const Shape& shape) {
+  if (shape.element_type() == PRIMITIVE_TYPE_INVALID) {
+    return InvalidArgument("shape has invalid element type: %s",
+                           shape.ShortDebugString().c_str());
+  }
   if (shape.element_type() == TUPLE) {
     if (shape.dimensions_size() != 0) {
       return InvalidArgument("tuples must not have dimensions specified");
@@ -797,10 +821,24 @@ StatusOr<Shape> ParseShapeStringInternal(tensorflow::StringPiece* s) {
   if (shape.tuple_shapes_size() > 0) {
     return InvalidArgument("non-tuple shape has tuple_shapes field");
   }
-  if (shape.element_type() == PRIMITIVE_TYPE_INVALID) {
-    return InvalidArgument("shape has invalid element type: %s",
-                           shape.ShortDebugString().c_str());
+
+  // Tokens and opaques should not have layout or dimensions.
+  if (shape.element_type() == TOKEN || shape.element_type() == OPAQUE) {
+    if (shape.dimensions_size() != 0) {
+      return InvalidArgument(
+          "shape has %s element type, but has dimensions field: %s",
+          LowercasePrimitiveTypeName(shape.element_type()).c_str(),
+          shape.ShortDebugString().c_str());
+    }
+    if (shape.has_layout()) {
+      return InvalidArgument(
+          "shape has %s element type, but has layout field: %s",
+          LowercasePrimitiveTypeName(shape.element_type()).c_str(),
+          shape.ShortDebugString().c_str());
+    }
+    return Status::OK();
   }
+
   if (Rank(shape) != shape.dimensions_size()) {
     return InvalidArgument(
         "shape's rank is mismatched with dimension count; rank=%lld "
@@ -902,6 +940,8 @@ bool ShapeUtil::IsLeafIndex(const Shape& shape, const ShapeIndex& index) {
 }
 
 /* static */ Shape ShapeUtil::StripDegenerateDimensions(const Shape& shape) {
+  CHECK(IsArray(shape));
+
   std::vector<int64> dimension_sizes;
   std::vector<int64> degenerate_dimensions;
   for (int64 i = 0; i < shape.dimensions_size(); ++i) {
@@ -1066,6 +1106,9 @@ Status ForEachMutableSubshapeHelper(
 /* static */ std::tuple<bool, std::vector<int64>, std::vector<int64>>
 ShapeUtil::InsertedOrDeleted1SizedDimensions(const Shape& shape_pre,
                                              const Shape& shape_post) {
+  CHECK(IsArray(shape_pre));
+  CHECK(IsArray(shape_post));
+
   auto nil = std::make_tuple(false, std::vector<int64>(), std::vector<int64>());
 
   std::vector<int64> deleted_indices;
@@ -1123,6 +1166,9 @@ ShapeUtil::InsertedOrDeleted1SizedDimensions(const Shape& shape_pre,
 /* static */ std::vector<std::pair<int64, int64>>
 ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape,
                                          const Shape& output_shape) {
+  CHECK(IsArray(input_shape));
+  CHECK(IsArray(output_shape));
+
   // Unmodified dimensions are merely common factors of rank 1.
   auto common_factors = CommonFactors(AsInt64Slice(input_shape.dimensions()),
                                       AsInt64Slice(output_shape.dimensions()));
@@ -1176,8 +1222,10 @@ ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape,
 
 /* static */ bool ShapeUtil::ReshapeIsBitcast(const Shape& input_shape,
                                               const Shape& output_shape) {
-  CHECK(LayoutUtil::HasLayout(input_shape) &&
-        LayoutUtil::HasLayout(output_shape));
+  CHECK(IsArray(input_shape));
+  CHECK(IsArray(output_shape));
+  CHECK(LayoutUtil::HasLayout(input_shape));
+  CHECK(LayoutUtil::HasLayout(output_shape));
 
   if (!SameElementType(input_shape, output_shape)) {
     return false;
@@ -1339,6 +1387,9 @@ ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape,
 
 /* static */ tensorflow::gtl::optional<Shape> ShapeUtil::AlignLayouts(
     const Shape& input_shape, const Shape& output_shape) {
+  CHECK(IsArray(input_shape));
+  CHECK(IsArray(output_shape));
+
   int64 input_rank = Rank(input_shape);
   int64 output_rank = Rank(output_shape);
 
@@ -1473,6 +1524,7 @@ ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape,
 
 /* static */ Shape ShapeUtil::DeleteDimension(int64 dim_to_delete,
                                               Shape shape) {
+  CHECK(IsArray(shape));
   shape.mutable_dimensions()->erase(shape.dimensions().begin() + dim_to_delete);
   if (LayoutUtil::HasLayout(shape)) {
     Layout* layout = shape.mutable_layout();
@@ -1494,6 +1546,7 @@ ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape,
 
 /* static */ Shape ShapeUtil::FilterDimensions(
     const std::function<bool(int64)>& p, Shape shape) {
+  CHECK(IsArray(shape));
   std::vector<int64> dims_to_delete;
   for (int64 i = shape.dimensions().size() - 1; i >= 0; --i) {
     if (!p(i)) {
diff --git a/tensorflow/compiler/xla/shape_util.h b/tensorflow/compiler/xla/shape_util.h
index 9df31d5d21..3853ada6ba 100644
--- a/tensorflow/compiler/xla/shape_util.h
+++ b/tensorflow/compiler/xla/shape_util.h
@@ -169,7 +169,7 @@ class ShapeUtil {
   // may not actually be able to store this number of elements. See
   // LayoutUtil::MaxSparseElements(shape) to obtain the maximum number of
   // elements that can be stored in a sparse shape.
-  // Precondition: !IsTuple(shape)
+  // Precondition: IsArray(shape)
   static int64 ElementsIn(const Shape& shape);
 
   // Returns true if 'shape' has zero elements.
@@ -180,13 +180,11 @@ class ShapeUtil {
   // shapes. This includes only the size of the top-level buffer. For example, a
   // tuple is stored as an array of pointers to other buffers. In this case,
   // this method only returns the size of the pointer array.
-  // Precondition: (!ShapeUtil::IsTuple(shape) || pointer_size > 0) &&
-  //               !ShapeUtil::IsOpaque(shape)
   static int64 ByteSizeOf(const Shape& shape, int64 pointer_size = -1);
 
   // Returns the number of bytes used to store the primitive_type.
   //
-  // Precondition: !ShapeUtil::IsOpaque(shape) && !ShapeUtil::IsTuple(shape)
+  // Precondition: ShapeUtil::IsArray(shape)
   static int64 ByteSizeOfPrimitiveType(PrimitiveType primitive_type);
 
   // Returns the number of bytes required to store the tuple member pointers for
@@ -245,7 +243,7 @@ class ShapeUtil {
   }
 
   // Returns the higher-precision element type if a and b are both floating
-  // point types; otherwise, checks that they have the same element type
+  // point types; otherwise, checks that they have the same element type
   // and returns it.
   static PrimitiveType HigherPrecisionElementType(const Shape& a,
                                                   const Shape& b) {
@@ -293,10 +291,10 @@ class ShapeUtil {
   // Scalar-specific
 
   static bool IsScalar(const Shape& shape) {
-    return !IsTuple(shape) && !IsOpaque(shape) && Rank(shape) == 0;
+    return IsArray(shape) && Rank(shape) == 0;
   }
   static bool IsEffectiveScalar(const Shape& shape) {
-    return !IsTuple(shape) && !IsOpaque(shape) && TrueRank(shape) == 0;
+    return IsArray(shape) && TrueRank(shape) == 0;
   }
   static bool IsScalarF32(const Shape& shape);
 
@@ -325,6 +323,10 @@ class ShapeUtil {
   // into a custom operation.
   static Shape MakeOpaqueShape();
 
+  // Creates a token shape. Values of this shape are used for ordering
+  // side-effecting operations.
+  static Shape MakeTokenShape();
+
   // Appends a shape to the given tuple.
   static void AppendShapeToTuple(const Shape& shape, Shape* tuple_shape);
 
@@ -424,11 +426,15 @@ class ShapeUtil {
     return shape.element_type() == OPAQUE;
   }
 
+  // Returns whether the shape is a token value used for ordering
+  // side-effecting operations.
+  static bool IsToken(const Shape& shape) {
+    return shape.element_type() == TOKEN;
+  }
+
   // Returns whether the shape is an array.  Note that scalars are considered
   // arrays.
-  static bool IsArray(const Shape& shape) {
-    return !IsTuple(shape) && !IsOpaque(shape);
-  }
+  static bool IsArray(const Shape& shape);
 
   // Returns whether the shape is a tuple with at least one element which is
   // also a tuple.
diff --git a/tensorflow/compiler/xla/shape_util_test.cc b/tensorflow/compiler/xla/shape_util_test.cc
index f7675e97da..ecdb6532f1 100644
--- a/tensorflow/compiler/xla/shape_util_test.cc
+++ b/tensorflow/compiler/xla/shape_util_test.cc
@@ -93,12 +93,14 @@ TEST(ShapeUtilTest, ParseShapeStringTupleOfArrays) {
 }
 
 TEST(ShapeUtilTest, ParseShapeStringNestedTuple) {
-  string shape_string = "(f32[1],(f32[2]), f32[3])";
+  string shape_string = "(f32[1],(f32[2], token[]), opaque[], f32[3])";
   TF_ASSERT_OK_AND_ASSIGN(Shape actual,
                           ShapeUtil::ParseShapeString(shape_string));
   Shape expected = ShapeUtil::MakeTupleShape({
       ShapeUtil::MakeShape(F32, {1}),
-      ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape(F32, {2})}),
+      ShapeUtil::MakeTupleShape(
+          {ShapeUtil::MakeShape(F32, {2}), ShapeUtil::MakeTokenShape()}),
+      ShapeUtil::MakeOpaqueShape(),
       ShapeUtil::MakeShape(F32, {3}),
   });
   ASSERT_TRUE(ShapeUtil::Equal(expected, actual))
@@ -136,6 +138,23 @@ TEST(ShapeUtilTest, ParseShapeStringWithSparseLayout) {
       << "actual: " << ShapeUtil::HumanString(actual);
 }
 
+TEST(ShapeUtilTest, ParseOpaqueType) {
+  TF_ASSERT_OK_AND_ASSIGN(Shape actual,
+                          ShapeUtil::ParseShapeString("opaque[]"));
+  Shape expected = ShapeUtil::MakeOpaqueShape();
+  ASSERT_TRUE(ShapeUtil::Equal(expected, actual))
+      << "expected: " << ShapeUtil::HumanString(expected)
+      << "actual:   " << ShapeUtil::HumanString(actual);
+}
+
+TEST(ShapeUtilTest, ParseTokenType) {
+  TF_ASSERT_OK_AND_ASSIGN(Shape actual, ShapeUtil::ParseShapeString("token[]"));
+  Shape expected = ShapeUtil::MakeTokenShape();
+  ASSERT_TRUE(ShapeUtil::Equal(expected, actual))
+      << "expected: " << ShapeUtil::HumanString(expected)
+      << "actual:   " << ShapeUtil::HumanString(actual);
+}
+
 TEST(ShapeUtilTest, ParseInvalidShapeString) {
   string shape_strings[] = {
       "f32[123,456]foobar{0,1}", "f32[123,456]sparse{0,1}", "f32[123,456]{foo}",
@@ -295,6 +314,9 @@ TEST(ShapeUtilTest, ByteSizeOfWithoutPadding) {
   EXPECT_EQ(8, ShapeUtil::ByteSizeOfPrimitiveType(C64));
   EXPECT_EQ(8, ShapeUtil::ByteSizeOf(ShapeUtil::MakeShape(C64, {})));
   EXPECT_EQ(1600, ShapeUtil::ByteSizeOf(ShapeUtil::MakeShape(C64, {10, 20})));
+
+  EXPECT_EQ(0, ShapeUtil::ByteSizeOfPrimitiveType(TOKEN));
+  EXPECT_EQ(0, ShapeUtil::ByteSizeOf(ShapeUtil::MakeTokenShape()));
 }
 
 TEST(ShapeUtilTest, ByteSizeOfWithPadding) {
@@ -449,19 +471,21 @@ TEST(ShapeUtilTest, IsLeafIndex) {
 
 TEST(ShapeUtilTest, HumanString) {
   Shape opaque = ShapeUtil::MakeOpaqueShape();
+  Shape token = ShapeUtil::MakeTokenShape();
   Shape scalar = ShapeUtil::MakeShape(F32, {});
   Shape matrix = ShapeUtil::MakeShape(U32, {1, 2});
   Shape matrix2 = ShapeUtil::MakeShapeWithLayout(S32, {3, 4}, {0, 1});
   Shape tuple = ShapeUtil::MakeTupleShape({opaque, scalar, matrix, matrix2});
-  Shape nested_tuple = ShapeUtil::MakeTupleShape({tuple, matrix});
+  Shape nested_tuple = ShapeUtil::MakeTupleShape({tuple, matrix, token});
 
   EXPECT_EQ("opaque[]", ShapeUtil::HumanString(opaque));
+  EXPECT_EQ("token[]", ShapeUtil::HumanString(token));
   EXPECT_EQ("f32[]", ShapeUtil::HumanString(scalar));
   EXPECT_EQ("u32[1,2]", ShapeUtil::HumanString(matrix));
   EXPECT_EQ("s32[3,4]", ShapeUtil::HumanString(matrix2));
   EXPECT_EQ("(opaque[], f32[], u32[1,2], s32[3,4])",
             ShapeUtil::HumanString(tuple));
-  EXPECT_EQ("((opaque[], f32[], u32[1,2], s32[3,4]), u32[1,2])",
+  EXPECT_EQ("((opaque[], f32[], u32[1,2], s32[3,4]), u32[1,2], token[])",
             ShapeUtil::HumanString(nested_tuple));
 
   EXPECT_EQ("opaque[]", ShapeUtil::HumanStringWithLayout(opaque));
@@ -470,8 +494,10 @@ TEST(ShapeUtilTest, HumanString) {
   EXPECT_EQ("s32[3,4]{0,1}", ShapeUtil::HumanStringWithLayout(matrix2));
   EXPECT_EQ("(opaque[], f32[], u32[1,2]{1,0}, s32[3,4]{0,1})",
             ShapeUtil::HumanStringWithLayout(tuple));
-  EXPECT_EQ("((opaque[], f32[], u32[1,2]{1,0}, s32[3,4]{0,1}), u32[1,2]{1,0})",
-            ShapeUtil::HumanStringWithLayout(nested_tuple));
+  EXPECT_EQ(
+      "((opaque[], f32[], u32[1,2]{1,0}, s32[3,4]{0,1}), u32[1,2]{1,0}, "
+      "token[])",
+      ShapeUtil::HumanStringWithLayout(nested_tuple));
 
   ProgramShape prog = ShapeUtil::MakeProgramShape(
       {opaque, scalar, matrix, matrix2, tuple, nested_tuple}, nested_tuple);
@@ -481,8 +507,9 @@ TEST(ShapeUtilTest, HumanString) {
       "(unknown): u32[1,2], "
       "(unknown): s32[3,4], "
       "(unknown): (opaque[], f32[], u32[1,2], s32[3,4]), "
-      "(unknown): ((opaque[], f32[], u32[1,2], s32[3,4]), u32[1,2])) -> "
-      "((opaque[], f32[], u32[1,2], s32[3,4]), u32[1,2])",
+      "(unknown): ((opaque[], f32[], u32[1,2], s32[3,4]), u32[1,2], token[])) "
+      "-> "
+      "((opaque[], f32[], u32[1,2], s32[3,4]), u32[1,2], token[])",
       ShapeUtil::HumanString(prog));
 
   prog.add_parameter_names("arg0");
@@ -497,8 +524,10 @@ TEST(ShapeUtilTest, HumanString) {
       "matrix: u32[1,2], "
       "matrix2: s32[3,4], "
       "tuple: (opaque[], f32[], u32[1,2], s32[3,4]), "
-      "nested_tuple: ((opaque[], f32[], u32[1,2], s32[3,4]), u32[1,2])) -> "
-      "((opaque[], f32[], u32[1,2], s32[3,4]), u32[1,2])",
+      "nested_tuple: ((opaque[], f32[], u32[1,2], s32[3,4]), u32[1,2], "
+      "token[])) "
+      "-> "
+      "((opaque[], f32[], u32[1,2], s32[3,4]), u32[1,2], token[])",
       ShapeUtil::HumanString(prog));
 }
 
diff --git a/tensorflow/compiler/xla/xla_data.proto b/tensorflow/compiler/xla/xla_data.proto
index b895ac045c..6bdfb0179c 100644
--- a/tensorflow/compiler/xla/xla_data.proto
+++ b/tensorflow/compiler/xla/xla_data.proto
@@ -66,11 +66,16 @@ enum PrimitiveType {
   // in the dimensions field.
   TUPLE = 13;
 
-  // An opaque type used for passing context specific data to a custom
-  // operation.
+  // An opaque type used for passing context-specific data to a custom
+  // operation. Shapes of this primitive type will have empty dimensions and
+  // tuple_shapes fields.
   OPAQUE = 14;
 
-  // Next = 17
+  // A token type threaded between side-effecting operations. Shapes of this
+  // primitive type will have empty dimensions and tuple_shapes fields.
+  TOKEN = 17;
+
+  // Next = 18
 }
 
 // Describes the value held inside padding elements.
-- 
GitLab


From 7d195d0d4936cbf289d2d5c590f82471ee8259ad Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 4 Jun 2018 16:43:33 -0700
Subject: [PATCH 284/610] Fix a floating point inaccuracy issue in
 precision_recall_at_equal_thresholds due to accumulating the tp/fp/tn/fn
 values in float32, which can become highly inaccurate as the number of values
 increases.

In the common case, the method sums the value 1.0f to the tp/fp/tn/fn bucket for every
value in the predictions tensor.  If the tensor is large (say, it represents an image
and we have one tp/fp/tn/fn value per pixel), then we are essentially adding many 1.0f's
together, across the entire batch and also across all the batches.  By doing it in
float32 the value starts becoming inaccurate at around 16M, which is very small.

In practice, we see a deviation of 100x when the total reaches about 3e10 (the previous
code reports a number about 1e8 when the actual value should be 3e10).

We avoid all these issues by always accumulating in float64.

Also fix a bug where the method could not be called with a predictions dtype
other than float32.  Previously it would crash due to the eps code near the end.
Added tests for using float64 and float16.

PiperOrigin-RevId: 199216173
---
 .../contrib/metrics/python/ops/metric_ops.py  |  39 +++--
 .../metrics/python/ops/metric_ops_test.py     | 137 ++++++++++++++----
 2 files changed, 130 insertions(+), 46 deletions(-)

diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops.py b/tensorflow/contrib/metrics/python/ops/metric_ops.py
index 00a933e5e0..2ed99d50a4 100644
--- a/tensorflow/contrib/metrics/python/ops/metric_ops.py
+++ b/tensorflow/contrib/metrics/python/ops/metric_ops.py
@@ -1544,7 +1544,7 @@ def precision_recall_at_equal_thresholds(labels,
     result: A named tuple (See PrecisionRecallData within the implementation of
       this function) with properties that are variables of shape
       `[num_thresholds]`. The names of the properties are tp, fp, tn, fn,
-      precision, recall, thresholds.
+      precision, recall, thresholds. Types are same as that of predictions.
     update_op: An op that accumulates values.
 
   Raises:
@@ -1570,7 +1570,6 @@ def precision_recall_at_equal_thresholds(labels,
 
   check_ops.assert_type(labels, dtypes.bool)
 
-  dtype = predictions.dtype
   with variable_scope.variable_scope(name,
                                      'precision_recall_at_equal_thresholds',
                                      (labels, predictions, weights)):
@@ -1592,11 +1591,16 @@ def precision_recall_at_equal_thresholds(labels,
 
     predictions.get_shape().assert_is_compatible_with(labels.get_shape())
 
-    # We cast to float to ensure we have 0.0 or 1.0.
-    f_labels = math_ops.cast(labels, dtype)
+    # It's important we aggregate using float64 since we're accumulating a lot
+    # of 1.0's for the true/false labels, and accumulating to float32 will
+    # be quite inaccurate even with just a modest amount of values (~20M).
+    # We use float64 instead of integer primarily since the GPU scatter kernel
+    # only supports floats.
+    agg_dtype = dtypes.float64
 
-    # Get weighted true/false labels.
-    true_labels = f_labels * weights
+    f_labels = math_ops.cast(labels, agg_dtype)
+    weights = math_ops.cast(weights, agg_dtype)
+    true_labels = f_labels  * weights
     false_labels = (1.0 - f_labels) * weights
 
     # Flatten predictions and labels.
@@ -1638,9 +1642,9 @@ def precision_recall_at_equal_thresholds(labels,
 
     with ops.name_scope('variables'):
       tp_buckets_v = metrics_impl.metric_variable(
-          [num_thresholds], dtype, name='tp_buckets')
+          [num_thresholds], agg_dtype, name='tp_buckets')
       fp_buckets_v = metrics_impl.metric_variable(
-          [num_thresholds], dtype, name='fp_buckets')
+          [num_thresholds], agg_dtype, name='fp_buckets')
 
     with ops.name_scope('update_op'):
       update_tp = state_ops.scatter_add(
@@ -1660,18 +1664,21 @@ def precision_recall_at_equal_thresholds(labels,
     fn = tp[0] - tp
 
     # We use a minimum to prevent division by 0.
-    epsilon = 1e-7
+    epsilon = ops.convert_to_tensor(1e-7, dtype=agg_dtype)
     precision = tp / math_ops.maximum(epsilon, tp + fp)
     recall = tp / math_ops.maximum(epsilon, tp + fn)
 
+    # Convert all tensors back to predictions' dtype (as per function contract).
+    out_dtype = predictions.dtype
+    _convert = lambda tensor: math_ops.cast(tensor, out_dtype)
     result = PrecisionRecallData(
-        tp=tp,
-        fp=fp,
-        tn=tn,
-        fn=fn,
-        precision=precision,
-        recall=recall,
-        thresholds=math_ops.lin_space(0.0, 1.0, num_thresholds))
+        tp=_convert(tp),
+        fp=_convert(fp),
+        tn=_convert(tn),
+        fn=_convert(fn),
+        precision=_convert(precision),
+        recall=_convert(recall),
+        thresholds=_convert(math_ops.lin_space(0.0, 1.0, num_thresholds)))
     update_op = control_flow_ops.group(update_tp, update_fp)
     return result, update_op
 
diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops_test.py b/tensorflow/contrib/metrics/python/ops/metric_ops_test.py
index e6f75fcbd7..4ccba4a253 100644
--- a/tensorflow/contrib/metrics/python/ops/metric_ops_test.py
+++ b/tensorflow/contrib/metrics/python/ops/metric_ops_test.py
@@ -2333,47 +2333,24 @@ class StreamingPrecisionRecallAtEqualThresholdsTest(test.TestCase):
     np.random.seed(1)
     ops.reset_default_graph()
 
-  def _testResultsEqual(self, expected_dict, gotten_result):
+  def _testResultsEqual(self, expected_dict, gotten_result, eps=None):
     """Tests that 2 results (dicts) represent the same data.
 
     Args:
       expected_dict: A dictionary with keys that are the names of properties
         of PrecisionRecallData and whose values are lists of floats.
       gotten_result: A PrecisionRecallData object.
+      eps: Epsilon value to use for testing output values. If unspecified, use
+        default from assertAllClose.
     """
     gotten_dict = {k: t.eval() for k, t in gotten_result._asdict().items()}
     self.assertItemsEqual(list(expected_dict.keys()), list(gotten_dict.keys()))
 
     for key, expected_values in expected_dict.items():
-      self.assertAllClose(expected_values, gotten_dict[key])
-
-  def _testCase(self, predictions, labels, expected_result, weights=None):
-    """Performs a test given a certain scenario of labels, predictions, weights.
-
-    Args:
-      predictions: The predictions tensor. Of type float32.
-      labels: The labels tensor. Of type bool.
-      expected_result: The expected result (dict) that maps to tensors.
-      weights: Optional weights tensor.
-    """
-    with self.test_session() as sess:
-      predictions_tensor = constant_op.constant(
-          predictions, dtype=dtypes_lib.float32)
-      labels_tensor = constant_op.constant(labels, dtype=dtypes_lib.bool)
-      weights_tensor = None
-      if weights:
-        weights_tensor = constant_op.constant(weights, dtype=dtypes_lib.float32)
-      gotten_result, update_op = (
-          metric_ops.precision_recall_at_equal_thresholds(
-              labels=labels_tensor,
-              predictions=predictions_tensor,
-              weights=weights_tensor,
-              num_thresholds=3))
-
-      sess.run(variables.local_variables_initializer())
-      sess.run(update_op)
-
-      self._testResultsEqual(expected_result, gotten_result)
+      if eps is not None:
+        self.assertAllClose(expected_values, gotten_dict[key], atol=eps)
+      else:
+        self.assertAllClose(expected_values, gotten_dict[key])
 
   def testVars(self):
     metric_ops.precision_recall_at_equal_thresholds(
@@ -2414,6 +2391,77 @@ class StreamingPrecisionRecallAtEqualThresholdsTest(test.TestCase):
       for _ in range(3):
         self._testResultsEqual(initial_result, result)
 
+  def testLargeCase(self):
+    shape = [32, 512, 256, 1]
+    predictions = random_ops.random_uniform(
+        shape, 0.0, 1.0, dtype=dtypes_lib.float32)
+    labels = math_ops.greater(random_ops.random_uniform(shape, 0.0, 1.0), 0.5)
+
+    result, update_op = metric_ops.precision_recall_at_equal_thresholds(
+        labels=labels, predictions=predictions, num_thresholds=201)
+    # Run many updates, enough to cause highly inaccurate values if the
+    # code used float32 for accumulation.
+    num_updates = 71
+
+    with self.test_session() as sess:
+      sess.run(variables.local_variables_initializer())
+      for _ in xrange(num_updates):
+        sess.run(update_op)
+
+      prdata = sess.run(result)
+
+      # Since we use random values, we won't know the tp/fp/tn/fn values, but
+      # tp and fp at threshold 0 should be the total number of positive and
+      # negative labels, hence their sum should be total number of pixels.
+      expected_value = 1.0 * np.product(shape) * num_updates
+      got_value = prdata.tp[0] + prdata.fp[0]
+      # They should be at least within 1.
+      self.assertNear(got_value, expected_value, 1.0)
+
+  def _testCase(self,
+                predictions,
+                labels,
+                expected_result,
+                dtype=dtypes_lib.float32,
+                eps=None,
+                weights=None):
+    """Performs a test given a certain scenario of labels, predictions, weights.
+
+    Args:
+      predictions: The predictions tensor. Of type dtype.
+      labels: The labels tensor. Of type bool.
+      expected_result: The expected result (dict) that maps to tensors.
+      dtype: Data type to use for predictions and weights tensor. Default
+        is float32.
+      eps: Epsilon value to use for testing output values. If unspecified, use
+        default from assertAllClose.
+      weights: Optional weights tensor.
+    """
+    with self.test_session() as sess:
+      predictions_tensor = constant_op.constant(predictions, dtype=dtype)
+      labels_tensor = constant_op.constant(labels, dtype=dtypes_lib.bool)
+      weights_tensor = None
+      if weights:
+        weights_tensor = constant_op.constant(weights, dtype=dtype)
+      gotten_result, update_op = (
+          metric_ops.precision_recall_at_equal_thresholds(
+              labels=labels_tensor,
+              predictions=predictions_tensor,
+              weights=weights_tensor,
+              num_thresholds=3))
+      self.assertEqual(gotten_result.tp.dtype, dtype)
+      self.assertEqual(gotten_result.fp.dtype, dtype)
+      self.assertEqual(gotten_result.tn.dtype, dtype)
+      self.assertEqual(gotten_result.fn.dtype, dtype)
+      self.assertEqual(gotten_result.precision.dtype, dtype)
+      self.assertEqual(gotten_result.recall.dtype, dtype)
+      self.assertEqual(gotten_result.thresholds.dtype, dtype)
+
+      sess.run(variables.local_variables_initializer())
+      sess.run(update_op)
+
+      self._testResultsEqual(expected_result, gotten_result, eps=eps)
+
   def testAllTruePositives(self):
     self._testCase(
         [[1]], [[True]], {
@@ -2489,6 +2537,35 @@ class StreamingPrecisionRecallAtEqualThresholdsTest(test.TestCase):
         },
         weights=[[0.0, 0.5, 2.0, 0.0, 0.5, 1.0]])
 
+  def testFloat64(self):
+    self._testCase(
+        [[0.2, 0.3, 0.4, 0.6, 0.7, 0.8]],
+        [[True, False, False, True, True, True]], {
+            'tp': [4, 3, 0],
+            'fp': [2, 0, 0],
+            'tn': [0, 2, 2],
+            'fn': [0, 1, 4],
+            'precision': [2.0 / 3.0, 1.0, 0.0],
+            'recall': [1.0, 0.75, 0.0],
+            'thresholds': [0.0, 0.5, 1.0],
+        },
+        dtype=dtypes_lib.float64)
+
+  def testFloat16(self):
+    self._testCase(
+        [[0.2, 0.3, 0.4, 0.6, 0.7, 0.8]],
+        [[True, False, False, True, True, True]], {
+            'tp': [4, 3, 0],
+            'fp': [2, 0, 0],
+            'tn': [0, 2, 2],
+            'fn': [0, 1, 4],
+            'precision': [2.0 / 3.0, 1.0, 0.0],
+            'recall': [1.0, 0.75, 0.0],
+            'thresholds': [0.0, 0.5, 1.0],
+        },
+        dtype=dtypes_lib.float16,
+        eps=1e-3)
+
 
 class StreamingSpecificityAtSensitivityTest(test.TestCase):
 
-- 
GitLab


From ff5ad20576e2c2a5c2295365c396da367428c753 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 4 Jun 2018 16:46:57 -0700
Subject: [PATCH 285/610] Updated include path for internal protobuf
 implementation.

PiperOrigin-RevId: 199216721
---
 tensorflow/contrib/lite/toco/tooling_util.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/lite/toco/tooling_util.h b/tensorflow/contrib/lite/toco/tooling_util.h
index 1f596ca8e5..3b320e8013 100644
--- a/tensorflow/contrib/lite/toco/tooling_util.h
+++ b/tensorflow/contrib/lite/toco/tooling_util.h
@@ -26,7 +26,7 @@ limitations under the License.
 #include "absl/strings/string_view.h"
 #include "tensorflow/core/platform/logging.h"
 #if TOCO_SUPPORT_PORTABLE_PROTOS
-#include "third_party/protobuf/src/google/protobuf/text_format.h"
+#include "third_party/protobuf/include/google/protobuf/text_format.h"
 #endif  // TOCO_SUPPORT_PORTABLE_PROTOS
 #include "tensorflow/contrib/lite/toco/model.h"
 #include "tensorflow/contrib/lite/toco/model_flags.pb.h"
-- 
GitLab


From 640cb59e94248c55934fe4e2b59fb3e18957b297 Mon Sep 17 00:00:00 2001
From: vchigrin <vyacheslav.chigrin@yandex.ru>
Date: Tue, 5 Jun 2018 02:50:09 +0300
Subject: [PATCH 286/610] Periodic resample operation gradients and
 optimization (#16520)

* Implement gradient of periodic resample operation.

* Set fully defined output shape for periodic_resample when possible.

* Speed up periodic_resample operation.

Use incremental updates in index computation where possible.

* Allow periodic_resample to run on multiple CPU kernels.

* Small refactoring.

* Add test for periodic_resample shape inference.

* Fix issues after review.

* Add shape inference C++ test.

* Code style fix
---
 tensorflow/contrib/periodic_resample/BUILD    |  17 +-
 .../kernels/periodic_resample_op.cc           |   5 +
 .../kernels/periodic_resample_op.h            | 415 +++++++++++++-----
 .../periodic_resample/ops/array_ops.cc        |  53 ++-
 .../periodic_resample/ops/array_ops_test.cc   |  40 ++
 .../kernel_tests/periodic_resample_op_test.py |  27 +-
 .../python/ops/periodic_resample_op.py        |   8 +-
 7 files changed, 445 insertions(+), 120 deletions(-)
 create mode 100644 tensorflow/contrib/periodic_resample/ops/array_ops_test.cc

diff --git a/tensorflow/contrib/periodic_resample/BUILD b/tensorflow/contrib/periodic_resample/BUILD
index 6ca7fe8b6e..976b312e83 100644
--- a/tensorflow/contrib/periodic_resample/BUILD
+++ b/tensorflow/contrib/periodic_resample/BUILD
@@ -6,12 +6,13 @@ exports_files(["LICENSE"])
 
 load(
     "//tensorflow:tensorflow.bzl",
-    "py_test",
+    "tf_cc_test",
     "tf_gen_op_libs",
     "tf_custom_op_library",
     "tf_custom_op_py_library",
     "tf_gen_op_wrapper_py",
 )
+load("//tensorflow:tensorflow.bzl", "py_test")
 
 cc_library(
     name = "all_ops",
@@ -84,6 +85,20 @@ py_test(
         ":init_py",
         "//tensorflow/contrib/util:util_py",
         "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:gradient_checker",
+    ],
+)
+
+tf_cc_test(
+    name = "periodic_resample_op_cc_test",
+    size = "small",
+    srcs = [
+        "ops/array_ops_test.cc",
+    ],
+    deps = [
+        ":all_ops",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
     ],
 )
 
diff --git a/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.cc b/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.cc
index e18923c8aa..514689cf45 100644
--- a/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.cc
+++ b/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.cc
@@ -22,4 +22,9 @@ namespace tensorflow {
 REGISTER_KERNEL_BUILDER(Name("PeriodicResample").Device(DEVICE_CPU),
                         PeriodicResampleOp);
 
+
+REGISTER_KERNEL_BUILDER(Name("PeriodicResampleOpGrad")
+                            .Device(DEVICE_CPU),
+                        PeriodicResampleOpGrad);
+
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.h b/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.h
index 3ab588c458..42fba81a5c 100644
--- a/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.h
+++ b/tensorflow/contrib/periodic_resample/kernels/periodic_resample_op.h
@@ -25,92 +25,202 @@
 #include "tensorflow/core/framework/shape_inference.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/util/work_sharder.h"
 
 namespace {
 
-template <class IndexVecT, class IndexT>
-IndexT compute_input_index(
-    IndexVecT* target_dimensions, const IndexT& output_index,
-    const IndexVecT& original_dimensions, const int& adjustable_dimension,
-    const std::vector<tensorflow::int64>& dimension_ceiling,
-    const std::vector<tensorflow::int64>& cumulative_dimensions, IndexT* result,
-    std::vector<IndexT>* output_indices, const int& rank) {
-  *result = 0;
-  output_indices->clear();
+// Computes input tensor index for given output index during forward
+// propagation through periodic_resample operation.
+class InputIndexer {
+ public:
+  InputIndexer(const std::vector<tensorflow::int64>& output_dimensions,
+               const tensorflow::TensorShape& input_shape,
+               int adjustable_dimension)
+      : output_dimensions_(output_dimensions),
+        adjustable_dimension_(adjustable_dimension),
+        rank_(input_shape.dims()),
+        linear_output_index_(0),
+        linear_input_index_(0),
+        adjustable_dimension_carriage_sum_(0) {
+    auto input_dimensions = TensorShapeToVector(input_shape);
+    // factors by which input_dimensions increases/decreases w.r.t.
+    // output_dimensions
+    dimension_ceiling_ =
+        ComputeDimensionCeiling(output_dimensions, input_dimensions);
+    cumulative_dimensions_ = ComputeCumulativeDimensions();
+
+    output_indices_.resize(output_dimensions_.size());
+    input_indices_.resize(output_dimensions_.size());
+
+    // Compute index_factors
+    index_factors_.resize(rank_);
+    tensorflow::int64 last_index_factor = 1;
+    for (auto r = rank_ - 1; r >= 0; --r) {
+      index_factors_[r] = last_index_factor;
+      last_index_factor *= input_dimensions[r];
+    }
+  }
+
+  tensorflow::int64 linear_input_index() const { return linear_input_index_; }
+
+  void MoveToOutputIndex(tensorflow::int64 output_index);
+  void IncrementOutputIndex();
+
+ private:
+  void RecomputeInputAdjustableDimensionIndex() {
+    tensorflow::int64 index = adjustable_dimension_carriage_sum_;
+    index *= output_dimensions_[adjustable_dimension_];
+    index += output_indices_[adjustable_dimension_];
+    input_indices_[adjustable_dimension_] = index;
+  }
+
+  std::vector<tensorflow::int64> TensorShapeToVector(
+      const tensorflow::TensorShape& tensor_shape);
+
+  std::vector<tensorflow::int64> ComputeDimensionCeiling(
+      const std::vector<tensorflow::int64>& output_dimensions,
+      const std::vector<tensorflow::int64>& input_dimensions);
+
+  std::vector<tensorflow::int64> ComputeCumulativeDimensions();
+
+  const std::vector<tensorflow::int64> output_dimensions_;
+  std::vector<tensorflow::int64> dimension_ceiling_;
+  std::vector<tensorflow::int64> index_factors_;
+  std::vector<tensorflow::int64> cumulative_dimensions_;
+  std::vector<tensorflow::int64> output_indices_;
+  std::vector<tensorflow::int64> input_indices_;
+
+  const int adjustable_dimension_;
+  const int rank_;
+  tensorflow::int64 linear_output_index_;
+  tensorflow::int64 linear_input_index_;
+  tensorflow::int64 adjustable_dimension_carriage_sum_;
+};
+
+void InputIndexer::MoveToOutputIndex(tensorflow::int64 output_index) {
+  linear_output_index_ = output_index;
+  linear_input_index_ = 0;
 
   // un-rasterize the output index
   auto last_reduced_i = output_index;
-  for (auto r = rank - 1; r >= 0; --r) {
-    (*output_indices)[r] = last_reduced_i % (*target_dimensions)[r];
+  for (auto r = rank_ - 1; r >= 0; --r) {
+    output_indices_[r] = last_reduced_i % output_dimensions_[r];
     last_reduced_i =
-        (last_reduced_i - (*output_indices)[r]) / (*target_dimensions)[r];
+        (last_reduced_i - output_indices_[r]) / output_dimensions_[r];
   }
 
+  tensorflow::int64 carriage_sum = 0;
+  for (int qi = 0; qi < rank_; ++qi) {
+    if (qi == adjustable_dimension_) continue;
+    carriage_sum += cumulative_dimensions_[qi] *
+                    (output_indices_[qi] % dimension_ceiling_[qi]);
+  }
+  adjustable_dimension_carriage_sum_ = carriage_sum;
+
   // rasterize the input index
-  IndexT last_index_factor = 1;
-  for (auto r = rank - 1; r >= 0; --r) {
-    IndexT index = 0;
-    if (r != adjustable_dimension)
-      index = (*output_indices)[r] / dimension_ceiling[r];
-    else {
-      for (int qi = 0; qi < rank; ++qi) {
-        if (qi == adjustable_dimension) continue;
-        index += cumulative_dimensions[qi] *
-                 ((*output_indices)[qi] % dimension_ceiling[qi]);
-      }
-      index *= (*target_dimensions)[adjustable_dimension];
-      index += (*output_indices)[r];
+  for (auto r = rank_ - 1; r >= 0; --r) {
+    if (r != adjustable_dimension_) {
+      input_indices_[r] = output_indices_[r] / dimension_ceiling_[r];
+    } else {
+      RecomputeInputAdjustableDimensionIndex();
     }
-    *result += last_index_factor * index;
-    last_index_factor *= original_dimensions[r];
   }
+  for (auto r = rank_ - 1; r >= 0; --r) {
+    linear_input_index_ += index_factors_[r] * input_indices_[r];
+  }
+}
+
+void InputIndexer::IncrementOutputIndex() {
+  linear_output_index_++;
+  for (auto r = rank_ - 1; r >= 0; --r) {
+    auto old_carriage_sum_increment =
+        cumulative_dimensions_[r] *
+        (output_indices_[r] % dimension_ceiling_[r]);
+    output_indices_[r] = (output_indices_[r] + 1) % output_dimensions_[r];
+    if (r != adjustable_dimension_) {
+      auto new_input_index = output_indices_[r] / dimension_ceiling_[r];
+      linear_input_index_ +=
+          (new_input_index - input_indices_[r]) * index_factors_[r];
+
+      input_indices_[r] = new_input_index;
+
+      auto new_carriage_sum_increment =
+          cumulative_dimensions_[r] *
+          (output_indices_[r] % dimension_ceiling_[r]);
 
-  return *result;
+      adjustable_dimension_carriage_sum_ = adjustable_dimension_carriage_sum_ -
+                                           old_carriage_sum_increment +
+                                           new_carriage_sum_increment;
+    }
+
+    if (output_indices_[r] != 0) {
+      // No more carries to higher indices.
+      break;
+    }
+  }
+  auto old_adjustable_dimension_input_index =
+      input_indices_[adjustable_dimension_];
+  RecomputeInputAdjustableDimensionIndex();
+  linear_input_index_ += (input_indices_[adjustable_dimension_] -
+                           old_adjustable_dimension_input_index) *
+                          index_factors_[adjustable_dimension_];
 }
 
-template <class InputDataT,
-          class IndexVecT>  // both types are needed here b/c IndexVecT and
-                            // InputDataT are not related
-                            void
-                            fill_periodic_tensor(
-                                tensorflow::OpKernelContext* context,
-                                const IndexVecT& desired_shape,
-                                const tensorflow::Tensor& input_tensor) {
-  // input is a strided array (last index is fastest, C-ordered)
-  auto input = input_tensor.flat<InputDataT>();
-  const int rank = input_tensor.dims();
-  // original and target dimensions
-  std::vector<tensorflow::int64> original_dimensions(rank),
-      target_dimensions(rank);
-  tensorflow::int64 total_size(input_tensor.NumElements()), new_sliced_size(1);
-  // factors by which original_dimensions increases/decreases w.r.t.
-  // target_dimensions
-  std::vector<tensorflow::int64> dimension_ceiling(rank),
-      cumulative_dimensions(rank);
-  // index of adjustable dimension
-  int adjustable_dimension;
-  tensorflow::TensorShape output_shape;
+std::vector<tensorflow::int64> InputIndexer::TensorShapeToVector(
+    const tensorflow::TensorShape& tensor_shape) {
+  std::vector<tensorflow::int64> result(tensor_shape.dims());
+  int count = 0;
+  for (const auto dim_info : tensor_shape) {
+    result[count] = dim_info.size;
+    ++count;
+  }
+  return result;
+}
 
-  // requires that the rank of the input tensor and length of the desired shape
-  // are equal
-  OP_REQUIRES(context, rank == desired_shape.size(),
-              tensorflow::errors::InvalidArgument(
-                  "periodic_resample expects the rank of the input tensor, ",
-                  rank, ", to be the same as the length of the desired shape, ",
-                  desired_shape.size(), "."));
+std::vector<tensorflow::int64> InputIndexer::ComputeDimensionCeiling(
+    const std::vector<tensorflow::int64>& output_dimensions,
+    const std::vector<tensorflow::int64>& input_dimensions) {
+  std::vector<tensorflow::int64> dimension_ceiling(input_dimensions.size());
+  for (size_t i = 0; i < input_dimensions.size(); ++i) {
+    dimension_ceiling[i] = (output_dimensions[i] + input_dimensions[i] - 1) /
+        input_dimensions[i];
+  }
+  return dimension_ceiling;
+}
 
-  bool found = false;
-  const auto& input_tensor_shape = input_tensor.shape();
+std::vector<tensorflow::int64> InputIndexer::ComputeCumulativeDimensions() {
+  std::vector<tensorflow::int64> cumulative_dimensions(rank_);
+  int count = 0;
+  for (int i = 0; i < rank_; ++i) {
+    if (count == 0) {
+      cumulative_dimensions[count] = 1;
+    } else {
+      cumulative_dimensions[count] =
+          cumulative_dimensions[count - 1] * dimension_ceiling_[count - 1];
+    }
+    ++count;
+  }
+  return cumulative_dimensions;
+}
 
+template <typename IndexVecT>
+void process_desired_shape(tensorflow::OpKernelContext* context,
+                           const tensorflow::TensorShape& input_tensor_shape,
+                           const IndexVecT& desired_shape,
+                           int* adjustable_dimension,
+                           std::vector<tensorflow::int64>* target_dimensions,
+                           tensorflow::int64* output_size) {
+  tensorflow::int64 new_sliced_size = 1;
+  bool found = false;
+  const int rank = input_tensor_shape.dims();
   for (int i = 0; i < rank; ++i) {
-    // if (desired_shape(i) < 1) {
     if (desired_shape[i] < 1) {
       // only one index can be adjustable
       OP_REQUIRES(context, !found,
                   tensorflow::errors::InvalidArgument(
                       "periodic_resample expects only "
                       "one index to be marked as adjustable."));
-      adjustable_dimension = i;
+      *adjustable_dimension = i;
       found = true;
     } else {
       OP_REQUIRES(
@@ -122,9 +232,8 @@ template <class InputDataT,
               i, " input tensor has size ", input_tensor_shape.dim_size(i),
               ", desired shape has size ", desired_shape[i], "."));
 
-      // target_dimensions[i] = desired_shape(i);
-      target_dimensions[i] = desired_shape[i];
-      new_sliced_size *= target_dimensions[i];
+      (*target_dimensions)[i] = desired_shape[i];
+      new_sliced_size *= (*target_dimensions)[i];
     }
   }
   // at least one index needs to be adjustable
@@ -132,26 +241,50 @@ template <class InputDataT,
               tensorflow::errors::InvalidArgument(
                   "periodic_resample expects at least "
                   "one index to be marked as adjustable."));
+  (*target_dimensions)[*adjustable_dimension] =
+      input_tensor_shape.num_elements() / new_sliced_size;
 
-  int count = 0;
-  for (const auto dim_info : input_tensor.shape()) {
-    original_dimensions[count] = dim_info.size;
-    ++count;
-  }
+  *output_size = new_sliced_size * (*target_dimensions)[*adjustable_dimension];
+}
 
-  target_dimensions[adjustable_dimension] = total_size / new_sliced_size;
+// Heuristic number based on measurements on
+// Intel(R) Core(TM) i7-4930K CPU @ 3.40GHz
+const tensorflow::int64 costPerFillIndex = 35;
 
-  count = 0;
-  for (int i = 0; i < input_tensor.shape().dims(); ++i) {
-    dimension_ceiling[count] = tensorflow::int64(std::ceil(
-        float(target_dimensions[count]) / float(original_dimensions[count])));
-    if (count == 0)
-      cumulative_dimensions[count] = 1;
-    else
-      cumulative_dimensions[count] =
-          cumulative_dimensions[count - 1] * dimension_ceiling[count - 1];
-    ++count;
-  }
+enum class Mode {
+  kForward,
+  kGradient
+};
+
+// Computes either periodic_resample operation output or gradients for it,
+// depending on |mode|.
+// |original_shape| is always the shape of the input to periodic_resample.
+// |source_tensor| is either the input of periodic_resample (forward mode)
+//     or the gradients tensor (gradient mode).
+// |desired_shape| is always the user-provided shape to which the forward
+//     pass attempts to resample the input tensor.
+template <class InputDataT, Mode mode>
+void
+do_periodic_resample_op(tensorflow::OpKernelContext* context,
+                        const tensorflow::TensorShape& original_shape,
+                        const tensorflow::PartialTensorShape& desired_shape,
+                        const tensorflow::Tensor& source_tensor) {
+  const int rank = source_tensor.dims();
+
+  // requires that the rank of the input tensor and length of the desired shape
+  // are equal
+  OP_REQUIRES(context, rank == desired_shape.dims(),
+              tensorflow::errors::InvalidArgument(
+                  "periodic_resample expects the rank of the input tensor, ",
+                  rank, ", to be the same as the length of the desired shape, ",
+                  desired_shape.dims(), "."));
+
+  std::vector<tensorflow::int64> target_dimensions(rank);
+  tensorflow::int64 new_size = 0;
+  // index of adjustable dimension
+  int adjustable_dimension = 0;
+  process_desired_shape(context, original_shape, desired_shape.dim_sizes(),
+                        &adjustable_dimension, &target_dimensions, &new_size);
 
   // ensure that the new dimension is greater than zero
   OP_REQUIRES(context, target_dimensions[adjustable_dimension] > 0,
@@ -160,11 +293,14 @@ template <class InputDataT,
                   "adjustable dimension, ",
                   adjustable_dimension, ", isn't greater than zero, ",
                   target_dimensions[adjustable_dimension], "."));
-  for (int i = 0; i < rank; ++i) {
-    output_shape.AddDim(target_dimensions[i]);
+  tensorflow::TensorShape output_shape;
+  if (mode == Mode::kForward) {
+    for (int i = 0; i < rank; ++i) {
+      output_shape.AddDim(target_dimensions[i]);
+    }
+  } else {
+    output_shape = original_shape;
   }
-  const auto new_size =
-      new_sliced_size * target_dimensions[adjustable_dimension];
 
   // Create an output tensor and attach it to the current context
   tensorflow::Tensor* output_tensor = nullptr;
@@ -172,47 +308,73 @@ template <class InputDataT,
                  context->allocate_output(0, output_shape, &output_tensor));
   auto output = output_tensor->flat<InputDataT>();
 
-  // memory is allocated for these variables outside the inner loop for
-  // efficiency (although, I could create a separate class scope for
-  // this purpose instead)
-  tensorflow::int64 result = 0;
-  std::vector<tensorflow::int64> output_indices(target_dimensions.size());
+  // input is a strided array (last index is fastest, C-ordered)
+  auto input = source_tensor.flat<InputDataT>();
 
   // Fill output tensor with periodically resampled input tensor values
-  for (tensorflow::int64 output_index = 0; output_index < new_size;
-       ++output_index) {
-    output(output_index) = input(compute_input_index(
-        &target_dimensions, output_index, original_dimensions,
-        adjustable_dimension, dimension_ceiling, cumulative_dimensions, &result,
-        &output_indices, rank));
-  }
+  InputIndexer input_indexer(target_dimensions, original_shape,
+                             adjustable_dimension);
+
+  auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads());
+  auto fill_output_tensor = [&input_indexer, &output, &input](
+      tensorflow::int64 start, tensorflow::int64 limit) {
+    InputIndexer local_indexer(input_indexer);
+    local_indexer.MoveToOutputIndex(start);
+    for (tensorflow::int64 output_index = start; output_index < limit;
+         ++output_index) {
+      if (mode == Mode::kForward) {
+        output(output_index) = input(local_indexer.linear_input_index());
+      } else {
+        output(local_indexer.linear_input_index()) = input(output_index);
+      }
+      local_indexer.IncrementOutputIndex();
+    }
+  };
+  ::tensorflow::Shard(worker_threads.num_threads, worker_threads.workers,
+                      new_size, costPerFillIndex, fill_output_tensor);
 }
 
+#define DATA_TYPE_SWITCH(data_type, context, CASE)                            \
+  switch (data_type) {                                                        \
+    CASE(float)                                                               \
+    CASE(double)                                                              \
+    CASE(tensorflow::int32)                                                   \
+    CASE(tensorflow::int64)                                                   \
+    default:                                                                  \
+      context->CtxFailure(__FILE__, __LINE__,                                 \
+          tensorflow::errors::InvalidArgument(                                \
+              "Unsuppored tensor elements type"));                            \
+      break;                                                                  \
+  }
+
 void create_output_tensor(
     tensorflow::OpKernelContext* context,
     const tensorflow::Tensor& input_tensor,
     const tensorflow::DataType& input_tensor_type,
-    const tensorflow::PartialTensorShape& desired_shape_tensor) {
-  auto desired_shape = desired_shape_tensor.dim_sizes();
-
-  // obligatory type switch
-  switch (input_tensor_type) {
-    case tensorflow::DataTypeToEnum<float>::value:
-      fill_periodic_tensor<float>(context, desired_shape, input_tensor);
+    const tensorflow::PartialTensorShape& desired_shape) {
+#define CASE(type)                                                            \
+    case tensorflow::DataTypeToEnum<type>::value:                             \
+      do_periodic_resample_op<type, Mode::kForward>(                          \
+          context, input_tensor.shape(), desired_shape, input_tensor);        \
       break;
-    case tensorflow::DataTypeToEnum<double>::value:
-      fill_periodic_tensor<double>(context, desired_shape, input_tensor);
-      break;
-    case tensorflow::DataTypeToEnum<tensorflow::int32>::value:
-      fill_periodic_tensor<tensorflow::int32>(context, desired_shape,
-                                              input_tensor);
-      break;
-    case tensorflow::DataTypeToEnum<tensorflow::int64>::value:
-      fill_periodic_tensor<tensorflow::int64>(context, desired_shape,
-                                              input_tensor);
+
+  DATA_TYPE_SWITCH(input_tensor_type, context, CASE);
+#undef CASE
+}
+
+void create_grad_tensor(tensorflow::OpKernelContext* context,
+                        const tensorflow::Tensor& grad_tensor,
+                        const tensorflow::DataType& grad_tensor_type,
+                        const tensorflow::TensorShape& original_shape,
+                        const tensorflow::PartialTensorShape& desired_shape) {
+#define CASE(type)                                                            \
+    case tensorflow::DataTypeToEnum<type>::value:                             \
+      do_periodic_resample_op<type, Mode::kGradient>(                         \
+          context, original_shape, desired_shape, grad_tensor);               \
       break;
-    default:;
-  }
+
+  DATA_TYPE_SWITCH(grad_tensor_type, context, CASE);
+#undef CASE
 }
 
 }  // namespace
@@ -238,4 +400,25 @@ class PeriodicResampleOp : public tensorflow::OpKernel {
   tensorflow::PartialTensorShape desired_shape;
 };
 
+class PeriodicResampleOpGrad : public tensorflow::OpKernel {
+ public:
+  explicit PeriodicResampleOpGrad(tensorflow::OpKernelConstruction* context)
+      : tensorflow::OpKernel(context) {
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("original_shape", &original_shape));
+    OP_REQUIRES_OK(context, context->GetAttr("desired_shape", &desired_shape));
+  }
+
+  void Compute(tensorflow::OpKernelContext* context) override {
+    const tensorflow::Tensor& grad_tensor = context->input(0);
+    const tensorflow::DataType grad_tensor_type = context->input_dtype(0);
+    create_grad_tensor(context, grad_tensor, grad_tensor_type, original_shape,
+                       desired_shape);
+  }
+
+ private:
+  tensorflow::TensorShape original_shape;
+  tensorflow::PartialTensorShape desired_shape;
+};
+
 #endif  // TENSORFLOW_KERNELS_PERIODICRESAMPLE_OP_H_
diff --git a/tensorflow/contrib/periodic_resample/ops/array_ops.cc b/tensorflow/contrib/periodic_resample/ops/array_ops.cc
index 82bd796956..fd38cd09b4 100644
--- a/tensorflow/contrib/periodic_resample/ops/array_ops.cc
+++ b/tensorflow/contrib/periodic_resample/ops/array_ops.cc
@@ -26,7 +26,42 @@ REGISTER_OP("PeriodicResample")
     .Input("values: T")
     .Attr("shape: shape")
     .Output("output: T")
-    .SetShapeFn(shape_inference::ExplicitShape)
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      tensorflow::PartialTensorShape desired_shape;
+      TF_RETURN_IF_ERROR(c->GetAttr("shape", &desired_shape));
+      shape_inference::ShapeHandle input_tensor_shape = c->input(0);
+      shape_inference::DimensionHandle num_input_elements =
+          c->NumElements(input_tensor_shape);
+      shape_inference::ShapeHandle result_shape_handle;
+      if (!shape_inference::InferenceContext::ValueKnown(num_input_elements)) {
+        TF_RETURN_IF_ERROR(c->MakeShapeFromPartialTensorShape(
+            desired_shape, &result_shape_handle));
+      } else {
+        const int rank = c->Rank(input_tensor_shape);
+        std::vector<tensorflow::int64> target_dimensions(rank);
+        tensorflow::int64 new_sliced_size = 1;
+        int adjustable_dimension = 0;
+        for (int i = 0; i < rank; ++i) {
+          if (desired_shape.dim_size(i) < 1) {
+            adjustable_dimension = i;
+          } else {
+            target_dimensions[i] = desired_shape.dim_size(i);
+            new_sliced_size *= target_dimensions[i];
+          }
+        }
+        target_dimensions[adjustable_dimension] =
+            shape_inference::InferenceContext::Value(
+                num_input_elements) / new_sliced_size;
+        tensorflow::TensorShape result_shape;
+        for (int i = 0; i < rank; ++i) {
+          result_shape.AddDim(target_dimensions[i]);
+        }
+        TF_RETURN_IF_ERROR(c->MakeShapeFromTensorShape(
+            result_shape, &result_shape_handle));
+      }
+      c->set_output(0, result_shape_handle);
+      return Status::OK();
+    })
     .Doc(R"doc(
 Periodically resample elements of a tensor to conform to `shape`.
 
@@ -101,4 +136,20 @@ output: Periodically resampled tensor that has dimensions specified as in
 
 )doc");
 
+
+REGISTER_OP("PeriodicResampleOpGrad")
+    .Attr("T: numbertype")
+    .Input("grad: T")
+    .Attr("original_shape: shape")
+    .Attr("desired_shape: shape")
+    .Output("grad_values: T")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      tensorflow::TensorShape original_shape;
+      TF_RETURN_IF_ERROR(c->GetAttr("original_shape", &original_shape));
+      shape_inference::ShapeHandle s;
+      TF_RETURN_IF_ERROR(c->MakeShapeFromTensorShape(original_shape, &s));
+      c->set_output(0, s);
+      return Status::OK();
+});
+
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/periodic_resample/ops/array_ops_test.cc b/tensorflow/contrib/periodic_resample/ops/array_ops_test.cc
new file mode 100644
index 0000000000..55edf76fcd
--- /dev/null
+++ b/tensorflow/contrib/periodic_resample/ops/array_ops_test.cc
@@ -0,0 +1,40 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/shape_inference_testutil.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+
+TEST(ArrayOpsTest, PeriodicResample_ShapeFn) {
+  ShapeInferenceTestOp op("PeriodicResample");
+  // Case 1: output shape can be fully inferred.
+  PartialTensorShape shape({4, 4, -1});
+  TensorShapeProto shape_proto;
+  shape.AsProto(&shape_proto);
+
+  TF_ASSERT_OK(NodeDefBuilder("test", "PeriodicResample")
+                   .Input({"values", 0, DT_INT32})
+                   .Attr("shape", shape_proto)
+                   .Finalize(&op.node_def));
+  INFER_OK(op, "[2,2,4]", "[4,4,1]");
+  // Case 2: output shape can not be inferred - report desired shape.
+  INFER_OK(op, "[2,2,?]", "[4,4,?]");
+}
+
+}  // end namespace tensorflow
diff --git a/tensorflow/contrib/periodic_resample/python/kernel_tests/periodic_resample_op_test.py b/tensorflow/contrib/periodic_resample/python/kernel_tests/periodic_resample_op_test.py
index a25de55e18..31a6fe1d94 100644
--- a/tensorflow/contrib/periodic_resample/python/kernel_tests/periodic_resample_op_test.py
+++ b/tensorflow/contrib/periodic_resample/python/kernel_tests/periodic_resample_op_test.py
@@ -21,8 +21,11 @@ from __future__ import print_function
 import numpy
 
 from tensorflow.contrib.periodic_resample import periodic_resample
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import googletest
 
@@ -93,7 +96,6 @@ class PeriodicResampleTest(test_util.TensorFlowTestCase):
   def testPeriodicResampleErrors(self):
     input_tensor = numpy.zeros(shape=[1, 2, 2, 4])
     with self.test_session():
-      variables.global_variables_initializer().run()
       with self.assertRaisesWithPredicateMatch(
           errors_impl.InvalidArgumentError,
           'Dimension 3 input tensor has size 4, desired shape has size 1'):
@@ -103,6 +105,29 @@ class PeriodicResampleTest(test_util.TensorFlowTestCase):
           '4, to be the same as the length of the desired shape, 3'):
         periodic_resample(input_tensor, [None, 4, 4]).eval()
 
+  def testPeriodicResampleGradient(self):
+    desired_shape = numpy.array([4, 4, None])
+    result_shape = (4, 4, 1)
+    input_shape = (2, 2, 4)
+    with self.test_session() as sess:
+      x = array_ops.placeholder(dtypes.float32, shape=input_shape)
+      output = periodic_resample(x, desired_shape)
+      error = gradient_checker.compute_gradient_error(
+          x, input_shape, output, result_shape)
+      self.assertLess(error, 1e-4)
+
+  def testPeriodicResampleShapeInference(self):
+    with self.test_session() as sess:
+      # Case 1: output shape can be fully inferred.
+      x = array_ops.placeholder(dtypes.float32, shape=(2, 2, 4))
+      output = periodic_resample(x, [4, 4, None])
+      self.assertEqual(output.shape, [4, 4, 1])
+      # Case 2: output shape can not be inferred - report desired shape.
+      x = array_ops.placeholder(dtypes.float32, shape=(2, 2, None))
+      output = periodic_resample(x, [4, 4, None])
+      self.assertTrue(output.shape.is_compatible_with([4, 4, None]))
+      self.assertEqual(output.shape[2].value, None)
+
 
 if __name__ == '__main__':
   googletest.main()
diff --git a/tensorflow/contrib/periodic_resample/python/ops/periodic_resample_op.py b/tensorflow/contrib/periodic_resample/python/ops/periodic_resample_op.py
index 348623d8f8..470e300ccb 100644
--- a/tensorflow/contrib/periodic_resample/python/ops/periodic_resample_op.py
+++ b/tensorflow/contrib/periodic_resample/python/ops/periodic_resample_op.py
@@ -21,11 +21,17 @@ from __future__ import print_function
 # pylint: disable=unused-import
 from tensorflow.contrib.periodic_resample.python.ops import gen_periodic_resample_op
 
-from tensorflow.contrib.periodic_resample.python.ops.gen_periodic_resample_op import periodic_resample
+from tensorflow.contrib.periodic_resample.python.ops.gen_periodic_resample_op import periodic_resample, periodic_resample_op_grad
 
 from tensorflow.contrib.util import loader
+from tensorflow.python.framework import ops
 from tensorflow.python.platform import resource_loader
 # pylint: enable=unused-import
 
 _periodic_resample_op = loader.load_op_library(
     resource_loader.get_path_to_datafile('_periodic_resample_op.so'))
+
+@ops.RegisterGradient("PeriodicResample")
+def _periodic_resample_grad_cc(op, grad):
+  return periodic_resample_op_grad(
+      grad, op.inputs[0].shape, op.get_attr('shape'))
-- 
GitLab


From 310a51bd875bbac16cb2829e16428fca04fc3a29 Mon Sep 17 00:00:00 2001
From: Yunxing Dai <yunxing@google.com>
Date: Mon, 4 Jun 2018 17:15:05 -0700
Subject: [PATCH 287/610] HloParser: use uint16 in U16 case PiperOrigin-RevId:
 199220422

---
 tensorflow/compiler/xla/service/hlo_parser.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/compiler/xla/service/hlo_parser.cc b/tensorflow/compiler/xla/service/hlo_parser.cc
index 09c05c9821..ec20606d2f 100644
--- a/tensorflow/compiler/xla/service/hlo_parser.cc
+++ b/tensorflow/compiler/xla/service/hlo_parser.cc
@@ -1391,8 +1391,8 @@ bool HloParser::SetValueInLiteral(tensorflow::int64 value,
       return SetValueInLiteralHelper<tensorflow::uint8>(value, linear_index,
                                                         literal);
     case U16:
-      return SetValueInLiteralHelper<tensorflow::uint8>(value, linear_index,
-                                                        literal);
+      return SetValueInLiteralHelper<tensorflow::uint16>(value, linear_index,
+                                                         literal);
     case U32:
       return SetValueInLiteralHelper<tensorflow::uint32>(value, linear_index,
                                                          literal);
-- 
GitLab


From 35c8574e49aadcf16d009717e1d31fcce148db02 Mon Sep 17 00:00:00 2001
From: Justin Lebar <jlebar@google.com>
Date: Mon, 4 Jun 2018 17:23:10 -0700
Subject: [PATCH 288/610] [XLA] Don't dump subgraphs twice in hlo_graph_dumper.

Surprisingly, dumping a subgraph twice mostly worked.  But it broke the rollover
edge highlighting, and it also drew all the edges in the subgraph twice.

PiperOrigin-RevId: 199221368
---
 .../compiler/xla/service/hlo_graph_dumper.cc  | 54 ++++++++++---------
 1 file changed, 28 insertions(+), 26 deletions(-)

diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
index 05adb45713..61612bebd1 100644
--- a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
+++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
@@ -590,15 +590,26 @@ bool HloDotDumper::ShouldShowSubcomputation(const HloComputation* subcomp) {
 string HloDotDumper::DumpSubcomputation(const HloComputation* subcomp,
                                         const HloInstruction* parent_instr) {
   VLOG(2) << "Dumping subcomputation " << subcomp->name();
-  const char* computation_fmt = R"(subgraph %s {
-%s
-label = <%s>;
-labelloc = t;
-tooltip = " ";
-%s
-}  // %s
+  // Add an edge from the subcomputation to its parent node.  If subcomp
+  // belongs to a fusion node, it's drawn in place of the fusion instruction,
+  // so there's no need to link those.
+  if (parent_instr->opcode() != HloOpcode::kFusion) {
+    const HloInstruction* from = GetNodeForEdge(subcomp->root_instruction());
+    VLOG(2) << "Edge: from " << from->name() << " to " << parent_instr->name()
+            << " as " << next_edge_id_;
+    edge_ids_.insert({{from, parent_instr}, next_edge_id_++});
+    const char* edge_fmt =
+        R"(%s -> %s [ltail="%s", style="dashed" tooltip="%s -> %s"];)";
+    edges_.push_back(Printf(
+        edge_fmt, InstructionId(from), InstructionId(parent_instr),
+        SubcomputationId(subcomp), subcomp->name(), parent_instr->name()));
+  }
 
-)";
+  // Have we already dumped this subcomputation?  If so, generating the edge
+  // linking it and parent_instr is all we want to do in this function.
+  if (cluster_ids_.find(subcomp) != cluster_ids_.end()) {
+    return "";
+  }
 
   cluster_ids_[subcomp] = next_cluster_id_++;
 
@@ -645,25 +656,16 @@ tooltip = " ";
 
   string comp_body = DumpComputation(subcomp);
 
-  // Add an edge from the subcomputation to its parent node.  If subcomp
-  // belongs to a fusion node, it's drawn in place of the fusion instruction,
-  // so there's no need to link those.
-  if (parent_instr->opcode() != HloOpcode::kFusion) {
-    const HloInstruction* from = GetNodeForEdge(subcomp->root_instruction());
-    VLOG(2) << "Edge: from " << from->name() << " to " << parent_instr->name()
-            << " as " << next_edge_id_;
-    edge_ids_.insert({{from, parent_instr}, next_edge_id_++});
-    const char* edge_fmt =
-        R"(%s -> %s [ltail="%s", style="dashed" tooltip="%s -> %s"];)";
-    edges_.push_back(Printf(
-        edge_fmt, InstructionId(from), InstructionId(parent_instr),
-        SubcomputationId(subcomp), subcomp->name(), parent_instr->name()));
-  }
-
-  string computation =
-      Printf(computation_fmt, id, style, subcomp_label, comp_body, id);
+  const char* computation_fmt = R"(subgraph %s {
+%s
+label = <%s>;
+labelloc = t;
+tooltip = " ";
+%s
+}  // %s
 
-  return computation;
+)";
+  return Printf(computation_fmt, id, style, subcomp_label, comp_body, id);
 }
 
 string HloDotDumper::DumpComputation(const HloComputation* comp) {
-- 
GitLab


From 76801dda9b4766d729ab88267ee47f48d05eafb7 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 4 Jun 2018 18:57:57 -0700
Subject: [PATCH 289/610] Enable XLA fusions as a Grappler optimization.

PiperOrigin-RevId: 199230907
---
 tensorflow/compiler/jit/BUILD                 |  46 +++
 .../compiler/jit/mark_for_compilation_pass.cc | 161 ++-------
 tensorflow/compiler/jit/xla_cluster_util.cc   | 161 +++++++++
 tensorflow/compiler/jit/xla_cluster_util.h    |  46 +++
 .../compiler/jit/xla_fusion_optimizer.cc      | 321 ++++++++++++++++++
 .../compiler/jit/xla_fusion_optimizer.h       |  49 +++
 .../compiler/jit/xla_fusion_optimizer_test.cc | 183 ++++++++++
 .../custom_graph_optimizer_registry.h         |   2 +-
 .../grappler/optimizers/meta_optimizer.cc     | 100 +++---
 .../core/grappler/optimizers/meta_optimizer.h |   4 +
 10 files changed, 889 insertions(+), 184 deletions(-)
 create mode 100644 tensorflow/compiler/jit/xla_cluster_util.cc
 create mode 100644 tensorflow/compiler/jit/xla_cluster_util.h
 create mode 100644 tensorflow/compiler/jit/xla_fusion_optimizer.cc
 create mode 100644 tensorflow/compiler/jit/xla_fusion_optimizer.h
 create mode 100644 tensorflow/compiler/jit/xla_fusion_optimizer_test.cc

diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD
index 6d6c030a26..ab8cd8f4bc 100644
--- a/tensorflow/compiler/jit/BUILD
+++ b/tensorflow/compiler/jit/BUILD
@@ -25,6 +25,7 @@ load("//tensorflow:tensorflow.bzl", "tf_kernel_library")
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
 load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
 load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda_is_configured")
+load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test")
 
 # Target that bundles up the XLA CPU and GPU JIT devices.
 cc_library(
@@ -312,6 +313,7 @@ cc_library(
         ":common",
         ":shape_inference_helpers",
         ":union_find",
+        ":xla_cluster_util",
         "//tensorflow/compiler/jit/graphcycles",
         "//tensorflow/compiler/jit/kernels:parallel_check_op",
         "//tensorflow/compiler/jit/legacy_flags:encapsulate_subgraphs_pass_flags",
@@ -332,6 +334,18 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "xla_cluster_util",
+    srcs = ["xla_cluster_util.cc"],
+    hdrs = ["xla_cluster_util.h"],
+    deps = [
+        "//tensorflow/compiler/jit/graphcycles",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:graph",
+        "//tensorflow/core/kernels:bounds_check",
+    ],
+)
+
 cc_library(
     name = "union_find",
     hdrs = ["union_find.h"],
@@ -408,6 +422,38 @@ tf_cc_test(
     ],
 )
 
+cc_library(
+    name = "xla_fusion_optimizer",
+    srcs = ["xla_fusion_optimizer.cc"],
+    hdrs = ["xla_fusion_optimizer.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":common",
+        ":union_find",
+        ":xla_cluster_util",
+        "//tensorflow/compiler/jit/graphcycles",
+        "//tensorflow/core:core_cpu_base",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer",
+        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry",
+    ],
+)
+
+tf_cuda_cc_test(
+    name = "xla_fusion_optimizer_test",
+    srcs = ["xla_fusion_optimizer_test.cc"],
+    deps = [
+        ":common",
+        ":xla_cluster_util",
+        ":xla_fusion_optimizer",
+        "//tensorflow/core:graph",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core/grappler/utils:grappler_test",
+    ],
+)
+
 # This target can be used by XLA device plugins to prevent circular dependencies, and provides access to all of the required headers for building a device library.
 cc_header_only_library(
     name = "xla_jit_headers_lib",
diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.cc b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
index 07ee93d79e..74468266b9 100644
--- a/tensorflow/compiler/jit/mark_for_compilation_pass.cc
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #include "tensorflow/compiler/jit/graphcycles/graphcycles.h"
 #include "tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.h"
 #include "tensorflow/compiler/jit/union_find.h"
+#include "tensorflow/compiler/jit/xla_cluster_util.h"
 #include "tensorflow/compiler/tf2xla/dump_graph.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/core/common_runtime/function.h"
@@ -41,9 +42,6 @@ limitations under the License.
 
 namespace tensorflow {
 
-const char* const kXlaClusterAttr = "_XlaCluster";
-const char* const kXlaOutsideCompilationAttr = "_XlaOutsideCompilation";
-
 namespace {
 
 // Returns true if, when executed in TensorFlow, `node` is guaranteed to forward
@@ -191,16 +189,6 @@ bool IsCompilableCall(const NodeDef& call_def,
   return true;
 }
 
-// Returns the DeviceType corresponding to 'device'.
-Status DeviceTypeOfDevice(const string& device, DeviceType* device_type) {
-  DeviceNameUtils::ParsedName parsed;
-  if (!DeviceNameUtils::ParseFullName(device, &parsed)) {
-    return errors::Internal("Malformed assigned device '", device, "'");
-  }
-  *device_type = DeviceType(parsed.type);
-  return Status::OK();
-}
-
 // Tests whether `node` has a DT_RESOURCE typed input or output.
 bool HasResourceInputOrOutput(const Node& node) {
   return std::find(node.input_types().begin(), node.input_types().end(),
@@ -209,18 +197,11 @@ bool HasResourceInputOrOutput(const Node& node) {
                    DT_RESOURCE) != node.output_types().end();
 }
 
-struct NodeCompare {
-  bool operator()(const Node* a, const Node* b) const {
-    return a->id() < b->id();
-  }
-};
-using OrderedNodeSet = std::set<Node*, NodeCompare>;
-
 // Returns true if the op can be decomposed into XLA ops for which
 // there are fusable elemental implementations.
 //
-// TODO(hpucha): Consider a black list instead of a white list as
-// implemented below.
+// TODO(hpucha): Remove this code since this functionality is subsumed by
+// Grappler XlaFusionOptimizer.
 bool IsXlaFusable(const NodeDef& node) {
   static const std::unordered_set<std::string>* elementwise_ops =
       new std::unordered_set<std::string>(
@@ -390,7 +371,7 @@ Status FindCompilationCandidates(
   for (Node* node : graph.op_nodes()) {
     sorted_nodes.push_back(node);
   }
-  std::sort(sorted_nodes.begin(), sorted_nodes.end(), NodeCompare());
+  std::sort(sorted_nodes.begin(), sorted_nodes.end(), NodeComparatorID());
 
   for (Node* node : sorted_nodes) {
     VLOG(2) << "Fuel: " << fuel;
@@ -405,9 +386,13 @@ Status FindCompilationCandidates(
 
     DeviceType device_type("");
     TF_RETURN_IF_ERROR(
-        DeviceTypeOfDevice(node->assigned_device_name(), &device_type));
+        DeviceToDeviceType(node->assigned_device_name(), &device_type));
 
-    if (is_compilable_fn && !is_compilable_fn(node, device_type)) continue;
+    if (is_compilable_fn && !is_compilable_fn(node, device_type)) {
+      VLOG(2) << "Compilation rejected node: not compilable " << node->name()
+              << ": " << node->type_string();
+      continue;
+    }
 
     const XlaOpRegistry::DeviceRegistration* registration;
     CHECK(
@@ -456,46 +441,6 @@ struct Cluster {
   int representative = -1;
 };
 
-// Returns a string describing how an edge from src to dst would
-// create a cycle.
-string DescribeCycle(const GraphCycles& cycles, const Graph& graph, int src,
-                     int dst) {
-  int32 max_path_size = graph.num_node_ids() + 1;
-  std::vector<int32> path(max_path_size);
-  int32 path_size = cycles.FindPath(dst, src, max_path_size, path.data());
-  if (path_size == 0) {
-    return "";
-  }
-
-  auto node_name = [&cycles, &graph](int node_id) {
-    if (!FastBoundsCheck(node_id, graph.num_node_ids())) {
-      return string("(null)");
-    }
-    auto* node = graph.FindNodeId(node_id);
-    if (node == nullptr) {
-      return string("(null)");
-    }
-    return node->name();
-  };
-
-  string description;
-  strings::StrAppend(&description, "Edge from ", node_name(src), " to ",
-                     node_name(dst), " would create a cycle.\n");
-  path.resize(path_size);
-  for (int32 node_id : path) {
-    string ascii_art;
-    if (node_id == dst) {
-      ascii_art = "+-> ";
-    } else if (node_id != src) {
-      ascii_art = "|   ";
-    } else {
-      ascii_art = "+-- ";
-    }
-    strings::StrAppend(&description, ascii_art, node_name(node_id), "\n");
-  }
-  return description;
-}
-
 }  // anonymous namespace
 
 bool IsCompilable(FunctionLibraryRuntime* flr, const NodeDef& ndef) {
@@ -601,84 +546,13 @@ Status MarkForCompilationPass::RunImpl(
                                            : Env::Default(),
       is_compilable_fn, &compilation_candidates));
 
-  GraphCycles cycles;
-  for (int i = 0; i < graph->num_node_ids(); ++i) {
-    // We rely on the node IDs in the cycle detection graph being consecutive
-    // integers starting from 0.
-    CHECK_EQ(i, cycles.NewNode());
+  if (compilation_candidates.empty()) {
+    VLOG(2) << "No compilable candidates";
+    return Status::OK();
   }
 
-  // Compute the loop structure of the graph.
-  std::vector<ControlFlowInfo> control_flow_info;
-  TF_RETURN_IF_ERROR(BuildControlFlowInfo(graph, &control_flow_info));
-
-  // The clustering code must avoid adding cycles to the graph to prevent
-  // deadlock. However, the graph may contain loops, which would trigger the
-  // cycle detection code. To handle loops, we alter the structure of the cycle
-  // detection graph, disconnecting each loop from the enclosing graph.
-  // Specifically, we:
-  // * add a new "frame" node for each loop.
-  // * replace edges to "Enter" nodes, and edges from "Exit" nodes with edges
-  //   to/from the corresponding frame node. In essence, we collapse the loop
-  //   into a single node for the purpose of cycle detection in the enclosing
-  //   graph.
-  // * the body of the loop should now be disconnected from the rest of the
-  //   graph; we make it acyclic by breaking loop backedges (edges outgoing from
-  //   "NextIteration" nodes.
-
-  // Map from frame name strings to node IDs in the cycle detection graph.
-  std::unordered_map<string, int> frame_nodes;
-
-  // Get the cycle graph node ID for frame 'frame_name', or add one if none
-  // exists.
-  auto GetOrAddFrameNodeId = [&frame_nodes, &cycles](const string& frame_name) {
-    int& frame_id = frame_nodes.emplace(frame_name, -1).first->second;
-    if (frame_id < 0) {
-      // The emplace succeeded; we have not allocated a frame node yet.
-      frame_id = cycles.NewNode();
-    }
-    return frame_id;
-  };
-
-  for (Edge const* edge : graph->edges()) {
-    if (edge->dst()->IsEnter()) {
-      // Lift edges to an "Enter" node to the corresponding frame node.
-      const string& frame_name =
-          control_flow_info[edge->dst()->id()].frame_name;
-      int dst = GetOrAddFrameNodeId(frame_name);
-      if (!cycles.InsertEdge(edge->src()->id(), dst)) {
-        return errors::Internal(
-            "Cycle detected when adding enter->frame edge: ",
-            DescribeCycle(cycles, *graph, edge->src()->id(), dst));
-      }
-      continue;
-    }
-    if (edge->src()->IsExit()) {
-      // Lift edges from an "Exit" node to the corresponding frame node.
-      const string& frame_name =
-          control_flow_info[edge->src()->id()].frame_name;
-      int src = GetOrAddFrameNodeId(frame_name);
-      if (!cycles.InsertEdge(src, edge->dst()->id())) {
-        return errors::Internal(
-            "Cycle detected when adding frame->exit edge: ",
-            DescribeCycle(cycles, *graph, src, edge->dst()->id()));
-      }
-      // Drop the original edge.
-      continue;
-    }
-    if (edge->src()->IsNextIteration()) {
-      // Break loop back-edges.
-      continue;
-    }
-    if (!cycles.InsertEdge(edge->src()->id(), edge->dst()->id())) {
-      // This should never happen. All cycles in the graph should contain
-      // a control flow operator.
-      return errors::Internal(
-          "Found cycle in graph without control flow operator during XLA "
-          "compilation: ",
-          DescribeCycle(cycles, *graph, edge->src()->id(), edge->dst()->id()));
-    }
-  }
+  GraphCycles cycles;
+  TF_RETURN_IF_ERROR(CreateCycleDetectionGraph(graph, &cycles));
 
   // Each compilation candidate belongs to a cluster. The cluster's
   // representative
@@ -696,6 +570,9 @@ Status MarkForCompilationPass::RunImpl(
 
   // Repeatedly contract edges between clusters that are on the same device,
   // provided the contraction would not create a cycle.
+  //
+  // TODO(hpucha): Handle the case where kXlaClusterAttr is already set (for
+  // example, from the Grappler fusion pass).
   while (!worklist.empty()) {
     int from = worklist.front()->Get().representative;
     worklist.pop_front();
@@ -804,7 +681,7 @@ Status MarkForCompilationPass::RunImpl(
     // compilation.
     DeviceType device_type("");
     TF_RETURN_IF_ERROR(
-        DeviceTypeOfDevice(n->assigned_device_name(), &device_type));
+        DeviceToDeviceType(n->assigned_device_name(), &device_type));
     const XlaOpRegistry::DeviceRegistration* registration;
     XlaOpRegistry::GetCompilationDevice(device_type.type(), &registration);
 
diff --git a/tensorflow/compiler/jit/xla_cluster_util.cc b/tensorflow/compiler/jit/xla_cluster_util.cc
new file mode 100644
index 0000000000..70bd10336b
--- /dev/null
+++ b/tensorflow/compiler/jit/xla_cluster_util.cc
@@ -0,0 +1,161 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/jit/xla_cluster_util.h"
+
+#include <unordered_map>
+
+#include "tensorflow/core/graph/control_flow.h"
+#include "tensorflow/core/kernels/bounds_check.h"
+#include "tensorflow/core/util/device_name_utils.h"
+
+namespace tensorflow {
+
+const char* const kXlaClusterAttr = "_XlaCluster";
+const char* const kXlaOutsideCompilationAttr = "_XlaOutsideCompilation";
+
+namespace {
+// Returns a string describing how an edge from src to dst would
+// create a cycle.
+string DescribeCycle(const GraphCycles* cycles, const Graph& graph, int src,
+                     int dst) {
+  int32 max_path_size = graph.num_node_ids() + 1;
+  std::vector<int32> path(max_path_size);
+  int32 path_size = cycles->FindPath(dst, src, max_path_size, path.data());
+  if (path_size == 0) {
+    return "";
+  }
+
+  auto node_name = [cycles, &graph](int node_id) {
+    if (!FastBoundsCheck(node_id, graph.num_node_ids())) {
+      return string("(null)");
+    }
+    auto* node = graph.FindNodeId(node_id);
+    if (node == nullptr) {
+      return string("(null)");
+    }
+    return node->name();
+  };
+
+  string description;
+  strings::StrAppend(&description, "Edge from ", node_name(src), " to ",
+                     node_name(dst), " would create a cycle.\n");
+  path.resize(path_size);
+  for (int32 node_id : path) {
+    string ascii_art;
+    if (node_id == dst) {
+      ascii_art = "+-> ";
+    } else if (node_id != src) {
+      ascii_art = "|   ";
+    } else {
+      ascii_art = "+-- ";
+    }
+    strings::StrAppend(&description, ascii_art, node_name(node_id), "\n");
+  }
+  return description;
+}
+}  // namespace
+
+Status DeviceToDeviceType(const string& device, DeviceType* device_type) {
+  DeviceNameUtils::ParsedName parsed;
+  if (!DeviceNameUtils::ParseFullName(device, &parsed)) {
+    return errors::Internal("Malformed assigned device '", device, "'");
+  }
+  *device_type = DeviceType(parsed.type);
+  return Status::OK();
+}
+
+Status CreateCycleDetectionGraph(const Graph* graph, GraphCycles* cycles) {
+  for (int i = 0; i < graph->num_node_ids(); ++i) {
+    // We rely on the node IDs in the cycle detection graph being consecutive
+    // integers starting from 0.
+    CHECK_EQ(i, cycles->NewNode());
+  }
+
+  // Compute the loop structure of the graph.
+  std::vector<ControlFlowInfo> control_flow_info;
+  TF_RETURN_IF_ERROR(BuildControlFlowInfo(graph, &control_flow_info));
+
+  // The clustering code must avoid adding cycles to the graph to prevent
+  // deadlock. However, the graph may contain loops, which would trigger the
+  // cycle detection code. To handle loops, we alter the structure of the cycle
+  // detection graph, disconnecting each loop from the enclosing graph.
+  // Specifically, we:
+  // * add a new "frame" node for each loop.
+  // * replace edges to "Enter" nodes, and edges from "Exit" nodes with edges
+  //   to/from the corresponding frame node. In essence, we collapse the loop
+  //   into a single node for the purpose of cycle detection in the enclosing
+  //   graph.
+  // * the body of the loop should now be disconnected from the rest of the
+  //   graph; we make it acyclic by breaking loop backedges (edges outgoing from
+  //   "NextIteration" nodes).
+
+  // Map from frame name strings to node IDs in the cycle detection graph.
+  std::unordered_map<string, int> frame_nodes;
+
+  // Get the cycle graph node ID for frame 'frame_name', or add one if none
+  // exists.
+  auto GetOrAddFrameNodeId = [&frame_nodes, cycles](const string& frame_name) {
+    int& frame_id = frame_nodes.emplace(frame_name, -1).first->second;
+    if (frame_id < 0) {
+      // The emplace succeeded; we have not allocated a frame node yet.
+      frame_id = cycles->NewNode();
+    }
+    return frame_id;
+  };
+
+  for (Edge const* edge : graph->edges()) {
+    if (edge->dst()->IsEnter()) {
+      // Lift edges to an "Enter" node to the corresponding frame node.
+      const string& frame_name =
+          control_flow_info[edge->dst()->id()].frame_name;
+      int dst = GetOrAddFrameNodeId(frame_name);
+      if (!cycles->InsertEdge(edge->src()->id(), dst)) {
+        return errors::Internal(
+            "Cycle detected when adding enter->frame edge: ",
+            DescribeCycle(cycles, *graph, edge->src()->id(), dst));
+      }
+      continue;
+    }
+    if (edge->src()->IsExit()) {
+      // Lift edges from an "Exit" node to the corresponding frame node.
+      const string& frame_name =
+          control_flow_info[edge->src()->id()].frame_name;
+      int src = GetOrAddFrameNodeId(frame_name);
+      if (!cycles->InsertEdge(src, edge->dst()->id())) {
+        return errors::Internal(
+            "Cycle detected when adding frame->exit edge: ",
+            DescribeCycle(cycles, *graph, src, edge->dst()->id()));
+      }
+      // Drop the original edge.
+      continue;
+    }
+    if (edge->src()->IsNextIteration()) {
+      // Break loop back-edges.
+      continue;
+    }
+    if (!cycles->InsertEdge(edge->src()->id(), edge->dst()->id())) {
+      // This should never happen. All cycles in the graph should contain
+      // a control flow operator.
+      return errors::Internal(
+          "Found cycle in graph without control flow operator during XLA "
+          "compilation: ",
+          DescribeCycle(cycles, *graph, edge->src()->id(), edge->dst()->id()));
+    }
+  }
+  return Status::OK();
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/xla_cluster_util.h b/tensorflow/compiler/jit/xla_cluster_util.h
new file mode 100644
index 0000000000..5b673bdc27
--- /dev/null
+++ b/tensorflow/compiler/jit/xla_cluster_util.h
@@ -0,0 +1,46 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Contains utilities for clustering compilable graph nodes via XLA.
+
+#ifndef TENSORFLOW_COMPILER_JIT_XLA_CLUSTER_UTIL_H_
+#define TENSORFLOW_COMPILER_JIT_XLA_CLUSTER_UTIL_H_
+
+#include "tensorflow/compiler/jit/graphcycles/graphcycles.h"
+#include "tensorflow/core/graph/algorithm.h"
+
+namespace tensorflow {
+
+// The attribute that marks nodes to be grouped into functions by the
+// encapsulate subgraphs pass.
+extern const char* const kXlaClusterAttr;
+
+// The attribute that marks nodes in a cluster to be placed outside the xla
+// compilation by the encapsulate subgraphs pass.
+extern const char* const kXlaOutsideCompilationAttr;
+
+using OrderedNodeSet = std::set<Node*, NodeComparatorID>;
+
+// Returns the DeviceType corresponding to 'device'.
+Status DeviceToDeviceType(const string& device, DeviceType* device_type);
+
+// Creates a graph representation to enable cycle detection when clustering.
+// This representation handles loops in graph by disconnecting each loop from
+// the enclosing graph.
+Status CreateCycleDetectionGraph(const Graph* graph, GraphCycles* cycles);
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_COMPILER_JIT_XLA_CLUSTER_UTIL_H_
diff --git a/tensorflow/compiler/jit/xla_fusion_optimizer.cc b/tensorflow/compiler/jit/xla_fusion_optimizer.cc
new file mode 100644
index 0000000000..96016521ea
--- /dev/null
+++ b/tensorflow/compiler/jit/xla_fusion_optimizer.cc
@@ -0,0 +1,321 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/jit/xla_fusion_optimizer.h"
+
+#include <atomic>
+#include <deque>
+#include <unordered_map>
+#include <unordered_set>
+
+#include "tensorflow/compiler/jit/defs.h"
+#include "tensorflow/compiler/jit/graphcycles/graphcycles.h"
+#include "tensorflow/compiler/jit/union_find.h"
+#include "tensorflow/compiler/jit/xla_cluster_util.h"
+#include "tensorflow/core/common_runtime/shape_refiner.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/graph/graph_constructor.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h"
+
+namespace tensorflow {
+
+// Is 'node' an operator that consumes only the shape of its input, not the
+// data itself?
+static bool IsShapeConsumerOp(const Node& node) {
+  return node.type_string() == "Shape" || node.type_string() == "ShapeN" ||
+         node.type_string() == "Rank" || node.type_string() == "Size";
+}
+
+// Returns true if the op can be decomposed into XLA ops for which
+// there are fusable elemental implementations.
+bool IsXlaFusable(const NodeDef& node) {
+  static const std::unordered_set<std::string>* elementwise_ops =
+      new std::unordered_set<std::string>(
+          {// tf2xla/kernels/aggregate_ops.cc
+           "AddN",
+           // tf2xla/kernels/binary_ops.cc
+           "Add", "Sub", "Mul", "Div", "Atan2", "Complex", "FloorDiv",
+           "FloorMod", "BitwiseAnd", "BitwiseOr", "LeftShift", "RightShift",
+           "LogicalAnd", "LogicalOr", "Mod", "Maximum", "Minimum", "RealDiv",
+           "ReciprocalGrad", "RsqrtGrad", "SqrtGrad", "SquaredDifference",
+           "TruncateDiv", "TruncateMod", "Equal", "NotEqual", "Greater",
+           "GreaterEqual", "Less", "LessEqual", "SigmoidGrad", "SoftplusGrad",
+           "SoftsignGrad", "TanhGrad", "Pow", "ApproximateEqual",
+           // tf2xla/kernels/unary_ops.cc
+           "ComplexAbs", "Angle", "Conj", "Abs", "Acos", "Acosh", "Asin",
+           "Asinh", "Atan", "Atanh", "Ceil", "Cos", "Cosh", "Sin", "Exp",
+           "Expm1", "Floor", "IsFinite", "IsInf", "IsNan", "Inv", "Reciprocal",
+           "Log", "Log1p", "Invert", "LogicalNot", "Neg", "Rint", "Round",
+           "Rsqrt", "Sigmoid", "Sign", "Sinh", "Softplus", "Softsign", "Sqrt",
+           "Square", "Tan", "Tanh", "Real", "Imag",
+           // tf2xla/kernels/bcast_ops.cc
+           "BroadcastArgs", "BroadcastGradientArgs",
+           // tf2xla/kernels/bias_ops.cc
+           "BiasAdd", "BiasAddV1", "BiasAddGrad" /*(Reduce)*/,
+           // tf2xla/kernels/cast_op.cc
+           "Cast",
+           // tf2xla/kernels/concat_op.cc
+           "Concat", "ConcatV2", "ConcatOffset",
+           // tf2xla/kernels/const_op.cc
+           "Const",
+           // tf2xla/kernels/elu_op.cc
+           "Elu", "EluGrad", "Selu", "SeluGrad",
+           // tf2xla/kernels/fill_op.cc
+           "Fill",
+           // tf2xla/kernels/identity_op.cc
+           "Identity", "IdentityN", "PreventGradient",
+           "StopGradient", /*"Snapshot",*/
+           // tf2xla/kernels/index_ops.cc
+           "ArgMax", "ArgMin",
+           // tf2xla/kernels/mirror_pad_op.cc
+           "MirrorPad",
+           // tf2xla/kernels/one_hot_op.cc
+           "OneHot",
+           // tf2xla/kernels/pack_op.cc
+           "Pack",
+           // tf2xla/kernels/pad_op.cc
+           "Pad", "PadV2",
+           // tf2xla/kernels/relu_op.cc
+           "Relu", "Relu6", "ReluGrad", "Relu6Grad",
+           // tf2xla/kernels/reshape_op.cc
+           "Reshape",
+           // tf2xla/kernels/reverse_op.cc
+           "Reverse", "ReverseV2",
+           // tf2xla/kernels/reverse_sequence_op.cc
+           "ReverseSequence",
+           // tf2xla/kernels/shape_op.cc
+           "Shape", "ShapeN", "Rank", "Size", "ExpandDims", "Squeeze",
+           "ZerosLike", "OnesLike",
+           // tf2xla/kernels/slice_op.cc
+           "Slice",
+           // tf2xla/kernels/split_op.cc
+           "Split", "SplitV",
+           // tf2xla/kernels/strided_slice_op.cc
+           "StridedSlice", "StridedSliceGrad", "ResourceStridedSliceAssign",
+           // tf2xla/kernels/tile_ops.cc
+           "Tile",
+           // tf2xla/kernels/transpose_op.cc
+           "Transpose", "InvertPermutation",
+           // tf2xla/kernels/unpack_op.cc
+           "Unpack"});
+
+  return elementwise_ops->count(node.op()) > 0;
+}
+
+Status XlaFusionOptimizer::Optimize(grappler::Cluster* cluster,
+                                    const grappler::GrapplerItem& item,
+                                    GraphDef* output) {
+  VLOG(2) << "Here at fusion optimizer";
+
+  // TODO(hpucha): Implement encapsulation and replacing with XlaLaunch op.
+  // Once that happens, the expected interaction between this optimizer and when
+  // the global_jit_level is set is as follows: Fusion optimizer will replace
+  // appropriate fusion clusters with XlaLaunch nodes. The remaining graph can
+  // be further compiled where possible via mark_for_compilation_pass. Note that
+  // this might lead to inefficient clustering, and it is best to use either the
+  // fusion optimizer or the global_jit flag, and not combine the two.
+
+  // Create a Graph out of GraphDef. This is required currently because the
+  // helpers around clustering, encapsulation etc work on graphs.
+  FunctionLibraryDefinition function_library(OpRegistry::Global(),
+                                             item.graph.library());
+  Graph graph(function_library);
+  ShapeRefiner shape_refiner(graph.versions(), graph.op_registry());
+  shape_refiner.set_require_shape_inference_fns(false);
+  shape_refiner.set_disable_constant_propagation(true);
+  ImportGraphDefOptions options;
+  // Graph optimization happens at the late stage of graph execution, when
+  // colocation constraints are already validated previously and the device
+  // placement of nodes has also completed, so there is no need to validate
+  // colocation constraints again.
+  options.validate_colocation_constraints = false;
+  options.validate_shape = false;
+  TF_RETURN_IF_ERROR(
+      ImportGraphDef(options, item.graph, &graph, &shape_refiner));
+
+  // Collect nodes that can be fused via XLA, while ignoring those that
+  // explicitly ask for XLA: (*) nodes that are marked to be compiled
+  // explicitly. (*) nodes assigned to XLA device.
+  OrderedNodeSet compilation_candidates;
+  for (Node* node : graph.op_nodes()) {
+    // If there is a _XlaCompile annotation, ignore the node if it is
+    // true. Nodes are marked with this attr via experimental_jit_scope, and
+    // will be handled by the mark_for_compilation pass.
+    bool compile = false;
+    Status status = GetNodeAttr(node->attrs(), kXlaCompileAttr, &compile);
+    if (status.ok() && compile) {
+      continue;
+    }
+    // If there is already a _XlaCluster annotation, ignore the node. Nodes are
+    // marked with this attr to indicate they are already part of a cluster.
+    // The attr holds the cluster name (a string), so test for its presence
+    // rather than reading it into the bool `compile`, which fails on the type.
+    if (node->attrs().Find(kXlaClusterAttr) != nullptr) {
+      continue;
+    }
+
+    // If there is an explicit XLA device placement, ignore the node.
+    DeviceType device_type("");
+    TF_RETURN_IF_ERROR(DeviceToDeviceType(node->def().device(), &device_type));
+    if (device_type.type_string().find("XLA") != string::npos) continue;
+
+    // Assume all fusable ops are registered.
+    // TODO(hpucha): Check for registration if possible.
+    if (!IsXlaFusable(node->def())) {
+      continue;
+    }
+
+    compilation_candidates.insert(node);
+  }
+
+  if (compilation_candidates.empty()) {
+    VLOG(2) << "No compilable candidates";
+    *output = item.graph;
+    return Status::OK();
+  }
+
+  GraphCycles cycles;
+  TF_RETURN_IF_ERROR(CreateCycleDetectionGraph(&graph, &cycles));
+
+  // TODO(hpucha): Make clustering more robust. There are two known issues that
+  // we need to mitigate: (a) Non-resource variables can cause deadlocks
+  // when clustering changes order of execution. See b/77263461 for a specific
+  // example. (b) Queue operations can also cause deadlocks. See b/77261498 for
+  // example.
+
+  struct Cluster {
+    // Identifies the node that represents this cluster in the cycle detection
+    // graph.
+    int representative = -1;
+  };
+
+  // Each compilation candidate belongs to a cluster. The cluster's
+  // representative names the node in the 'cycles' graph that represents the
+  // cluster.
+  std::vector<UnionFind<Cluster>> clusters(graph.num_node_ids());
+  std::deque<UnionFind<Cluster>*> worklist;
+  for (Node* node : compilation_candidates) {
+    Cluster& cluster = clusters[node->id()].Get();
+    cluster.representative = node->id();
+    worklist.push_back(&clusters[node->id()]);
+  }
+
+  // Repeatedly contract edges between clusters that are on the same device,
+  // provided the contraction would not create a cycle. This is a simplified
+  // version of the clustering in mark_for_compilation_pass that also deals with
+  // nodes that are explicitly tagged to be compiled/clustered.
+  while (!worklist.empty()) {
+    int from = worklist.front()->Get().representative;
+    worklist.pop_front();
+
+    Node* node_from = graph.FindNodeId(from);
+    if (node_from->IsControlFlow()) {
+      // Control flow nodes aren't compilation candidates and should never
+      // appear.
+      return errors::Internal(
+          "Found control flow node in clustering worklist: ",
+          node_from->type_string());
+    }
+    for (int to : cycles.Successors(from)) {
+      if (to >= graph.num_node_ids()) {
+        // Node is a "frame" node that is present only in the cycle detection
+        // graph. No clustering is possible.
+        continue;
+      }
+      Node* node_to = graph.FindNodeId(to);
+      if (compilation_candidates.find(node_to) ==
+          compilation_candidates.cend()) {
+        continue;
+      }
+
+      // Do not cluster across devices.
+      if (node_from->def().device() != node_to->def().device()) {
+        VLOG(2) << "Devices " << node_from->def().device() << " "
+                << node_to->def().device();
+        VLOG(2) << "Device names " << node_from->assigned_device_name() << " "
+                << node_to->assigned_device_name();
+        continue;
+      }
+
+      // Ops that consume shapes cannot be the root of a cluster. This is an
+      // optimization.
+      if (clusters[from].Size() == 1 && IsShapeConsumerOp(*node_from)) {
+        continue;
+      }
+
+      // If contracting the edge would create a cycle, bail out.
+      // However, just because we can't merge the clusters now does not mean
+      // we won't be able to merge them in the future.
+      // e.g., if we have edges 1->2, 2->3 and 1->3, we cannot contract edge
+      // 1->3. But if we first contract 1->2 then we can later contract 1->3.
+      if (!cycles.ContractEdge(from, to)) continue;
+
+      // Merge the clusters. ContractEdge uses 'from' as the number of the
+      // merged node, so make sure 'from' is the chosen representative.
+      clusters[from].Merge(&clusters[to]);
+
+      worklist.push_back(&clusters[from]);
+      break;
+    }
+  }
+
+  // Count the number of non-trivial elements in each cluster.
+  std::vector<int> effective_cluster_sizes(graph.num_node_ids());
+  for (const Node* n : compilation_candidates) {
+    int cluster = clusters[n->id()].Get().representative;
+    // Identity nodes will be removed if the node gets marked for compilation.
+    // Therefore we don't want to count them towards the effective cluster size.
+    if (n->def().op() != "Identity") {
+      effective_cluster_sizes[cluster]++;
+    }
+  }
+
+  const int min_cluster_size = 2;
+  int num_clusters = 0;
+  for (auto size : effective_cluster_sizes) {
+    if (size >= min_cluster_size) {
+      VLOG(3) << "Cluster " << num_clusters << " " << size;
+      num_clusters++;
+    }
+  }
+
+  // Names for each cluster.
+  std::unordered_map<int, string> cluster_names;
+  // Sequence number generator to ensure clusters have unique names.
+  static std::atomic<int64> cluster_sequence_num;
+
+  for (Node* n : compilation_candidates) {
+    int cluster = clusters[n->id()].Get().representative;
+
+    // Compile if this is a cluster of >= min_cluster_size compilable operators.
+    if (effective_cluster_sizes[cluster] >= min_cluster_size) {
+      string& name = cluster_names[cluster];
+
+      if (name.empty()) {
+        name = strings::StrCat("cluster_", cluster_sequence_num++);
+      }
+      n->AddAttr(kXlaClusterAttr, name);
+      VLOG(3) << "Assigning node " << n->name() << " to cluster " << name;
+    }
+  }
+
+  graph.ToGraphDef(output);
+  return Status::OK();
+}
+
+REGISTER_GRAPH_OPTIMIZER_AS(XlaFusionOptimizer, "xla-fusion");
+
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/xla_fusion_optimizer.h b/tensorflow/compiler/jit/xla_fusion_optimizer.h
new file mode 100644
index 0000000000..3d2309e782
--- /dev/null
+++ b/tensorflow/compiler/jit/xla_fusion_optimizer.h
@@ -0,0 +1,49 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_JIT_XLA_FUSION_OPTIMIZER_H_
+#define TENSORFLOW_COMPILER_JIT_XLA_FUSION_OPTIMIZER_H_
+
+#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer.h"
+
+namespace tensorflow {
+
+// Optimizes graphs by fusing ops where possible, resulting in more efficient
+// execution.
+class XlaFusionOptimizer : public grappler::CustomGraphOptimizer {
+ public:
+  XlaFusionOptimizer() {}
+  ~XlaFusionOptimizer() override {}
+
+  Status Init(
+      const RewriterConfig_CustomGraphOptimizer* config = nullptr) override {
+    return Status::OK();  // `config` is not read; there is nothing to set up.
+  }
+
+  string name() const override { return "xla-fusion"; }
+
+  Status Optimize(grappler::Cluster* cluster,
+                  const grappler::GrapplerItem& item,
+                  GraphDef* output) override;
+
+  void Feedback(grappler::Cluster* cluster, const grappler::GrapplerItem& item,
+                const GraphDef& optimize_output, double result) override {
+    // Nothing to do for XlaFusionOptimizer.
+  }
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_COMPILER_JIT_XLA_FUSION_OPTIMIZER_H_
diff --git a/tensorflow/compiler/jit/xla_fusion_optimizer_test.cc b/tensorflow/compiler/jit/xla_fusion_optimizer_test.cc
new file mode 100644
index 0000000000..5736760a87
--- /dev/null
+++ b/tensorflow/compiler/jit/xla_fusion_optimizer_test.cc
@@ -0,0 +1,183 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/jit/xla_fusion_optimizer.h"
+#include "tensorflow/compiler/jit/defs.h"
+#include "tensorflow/compiler/jit/xla_cluster_util.h"
+#include "tensorflow/core/graph/graph_def_builder.h"
+#include "tensorflow/core/graph/graph_def_builder_util.h"
+#include "tensorflow/core/grappler/utils/grappler_test.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+
+namespace tensorflow {
+namespace {
+
+REGISTER_OP("UncompilableNullary").Output("o: float");
+REGISTER_OP("UncompilableUnary").Input("a: float").Output("o: float");
+
+class XlaFusionOptimizerTest : public grappler::GrapplerTest {
+ protected:  // Helper shared by the TEST_F cases in this file.
+  std::unordered_map<string, string> GetClusters(const GraphDef& graph) {
+    std::unordered_map<string, string> ids;  // node name -> cluster attr value
+    for (const NodeDef& node : graph.node()) {
+      string cluster;
+      if (GetNodeAttr(AttrSlice(node), kXlaClusterAttr, &cluster).ok()) {
+        CHECK(!cluster.empty());  // an attr that is present must be non-empty
+        ids[node.name()] = cluster;
+      }
+    }
+    return ids;  // nodes without kXlaClusterAttr are simply absent
+  }
+};
+
+TEST_F(XlaFusionOptimizerTest, Chains) {
+  GraphDef graph;
+  {  // Linear chain A -> B -> C -> D -> E -> F; A and D use the test-only ops.
+    GraphDefBuilder builder(GraphDefBuilder::kFailImmediately);
+    Node* a =
+        ops::SourceOp("UncompilableNullary", builder.opts().WithName("A"));
+    Node* b = ops::UnaryOp("Relu", a, builder.opts().WithName("B"));
+    Node* c = ops::UnaryOp("Relu", b, builder.opts().WithName("C"));
+    Node* d =
+        ops::UnaryOp("UncompilableUnary", c, builder.opts().WithName("D"));
+    Node* e = ops::UnaryOp("Relu", d, builder.opts().WithName("E"));
+    ops::UnaryOp("Relu", e, builder.opts().WithName("F"));
+    TF_ASSERT_OK(builder.ToGraphDef(&graph));
+  }
+  grappler::GrapplerItem item;
+  item.graph = graph;
+  // No grappler::Cluster is supplied: Optimize() is called with nullptr.
+  XlaFusionOptimizer optimizer;
+  GraphDef output;
+  TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
+  auto clusters = GetClusters(output);
+  EXPECT_EQ(4, clusters.size());  // B, C, E and F
+  EXPECT_EQ(clusters["B"], clusters["C"]);
+  EXPECT_EQ(clusters["E"], clusters["F"]);
+  EXPECT_NE(clusters["B"], clusters["E"]);  // D splits the chain in two
+  EXPECT_TRUE(clusters.find("A") == clusters.cend());
+  EXPECT_TRUE(clusters.find("D") == clusters.cend());
+}
+
+TEST_F(XlaFusionOptimizerTest, FusableOps) {
+  GraphDef graph;
+  {
+    GraphDefBuilder builder(GraphDefBuilder::kFailImmediately);
+    Node* a = ops::SourceOp(
+        "Placeholder",
+        builder.opts().WithName("A").WithAttr("dtype", tensorflow::DT_FLOAT));
+    Node* b = ops::SourceOp(
+        "Placeholder",
+        builder.opts().WithName("B").WithAttr("dtype", tensorflow::DT_FLOAT));
+    // C = Add(A, B) feeds both D = MatMul(A, C) and E = Abs(C).
+    Node* c = ops::BinaryOp("Add", a, b, builder.opts().WithName("C"));
+    ops::BinaryOp("MatMul", a, c, builder.opts().WithName("D"));
+    ops::UnaryOp("Abs", c, builder.opts().WithName("E"));
+
+    TF_ASSERT_OK(builder.ToGraphDef(&graph));
+  }
+  grappler::GrapplerItem item;
+  item.graph = graph;
+  // Expect two clustered nodes, C and E agreeing, and no cluster attr on D.
+  XlaFusionOptimizer optimizer;
+  GraphDef output;
+  TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
+  auto clusters = GetClusters(output);
+  EXPECT_EQ(2, clusters.size());
+  EXPECT_EQ(clusters["C"], clusters["E"]);
+  EXPECT_TRUE(clusters.find("D") == clusters.cend());
+}
+
+TEST_F(XlaFusionOptimizerTest, IgnoreExplicitXLAAttrs) {
+  GraphDef graph;
+  {
+    GraphDefBuilder builder(GraphDefBuilder::kFailImmediately);
+    Node* a = ops::SourceOp(
+        "Placeholder",
+        builder.opts().WithName("A").WithAttr("dtype", tensorflow::DT_FLOAT));
+    Node* b = ops::SourceOp(
+        "Placeholder",
+        builder.opts().WithName("B").WithAttr("dtype", tensorflow::DT_FLOAT));
+    // C is pinned to an XLA device and F carries kXlaCompileAttr = true.
+    Node* c = ops::BinaryOp(
+        "Add", a, b,
+        builder.opts().WithName("C").WithDevice("/device:XLA_CPU"));
+    ops::BinaryOp("MatMul", a, c, builder.opts().WithName("D"));
+    Node* e = ops::UnaryOp("Abs", c, builder.opts().WithName("E"));
+    ops::UnaryOp("Cos", e,
+                 builder.opts().WithName("F").WithAttr(kXlaCompileAttr, true));
+
+    TF_ASSERT_OK(builder.ToGraphDef(&graph));
+  }
+  grappler::GrapplerItem item;
+  item.graph = graph;
+  // The optimizer is expected to leave this graph without any cluster attr.
+  XlaFusionOptimizer optimizer;
+  GraphDef output;
+  TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
+  auto clusters = GetClusters(output);
+  EXPECT_TRUE(clusters.empty());
+}
+
+TEST_F(XlaFusionOptimizerTest, UncompilableCycles) {
+  GraphDef graph;
+  {  // C = Mul(A, B) where B = UncompilableUnary(A).
+    GraphDefBuilder builder(GraphDefBuilder::kFailImmediately);
+    Node* a = ops::SourceOp("Const", builder.opts()
+                                         .WithName("A")
+                                         .WithAttr("dtype", DT_FLOAT)
+                                         .WithAttr("value", Tensor()));
+    Node* b =
+        ops::UnaryOp("UncompilableUnary", a, builder.opts().WithName("B"));
+    ops::BinaryOp("Mul", a, b, builder.opts().WithName("C"));
+
+    TF_ASSERT_OK(builder.ToGraphDef(&graph));
+  }
+  grappler::GrapplerItem item;
+  item.graph = graph;
+  // Expect no clusters; presumably fusing A with C would form a cycle via B.
+  XlaFusionOptimizer optimizer;
+  GraphDef output;
+  TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
+  auto clusters = GetClusters(output);
+  EXPECT_TRUE(clusters.empty());
+}
+
+TEST_F(XlaFusionOptimizerTest, CompilableCycles) {
+  GraphDef graph;
+  {  // C = Mul(A, B) where B = Relu(A).
+    GraphDefBuilder builder(GraphDefBuilder::kFailImmediately);
+    Node* a = ops::SourceOp("Const", builder.opts()
+                                         .WithName("A")
+                                         .WithAttr("dtype", DT_FLOAT)
+                                         .WithAttr("value", Tensor()));
+    Node* b = ops::UnaryOp("Relu", a, builder.opts().WithName("B"));
+    ops::BinaryOp("Mul", a, b, builder.opts().WithName("C"));
+    TF_ASSERT_OK(builder.ToGraphDef(&graph));
+  }
+  grappler::GrapplerItem item;
+  item.graph = graph;
+  // With a Relu as B, all three nodes are expected to share one cluster.
+  XlaFusionOptimizer optimizer;
+  GraphDef output;
+  TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
+  auto clusters = GetClusters(output);
+  EXPECT_EQ(3, clusters.size());
+  EXPECT_EQ(clusters["A"], clusters["B"]);
+  EXPECT_EQ(clusters["A"], clusters["C"]);
+}
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h b/tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h
index 3148a5f809..0b8e0b692a 100644
--- a/tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h
+++ b/tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h
@@ -50,7 +50,7 @@ class CustomGraphOptimizerRegistrar {
 
 #define REGISTER_GRAPH_OPTIMIZER_AS(MyCustomGraphOptimizerClass, name) \
   namespace {                                                          \
-  static CustomGraphOptimizerRegistrar                                 \
+  static ::tensorflow::grappler::CustomGraphOptimizerRegistrar         \
       MyCustomGraphOptimizerClass##_registrar(                         \
           []() { return new MyCustomGraphOptimizerClass; }, (name));   \
   }  // namespace
diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
index e6622486eb..143d9dc1c6 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
@@ -217,23 +217,9 @@ Status MetaOptimizer::OptimizeGraph(Cluster* cluster, const GrapplerItem& item,
 
   bool is_optimized = false;
   GraphOptimizationResult optimization_result(item.id);
+  GraphOptimizer* fusion_optimizer = nullptr;
+  GraphOptimizer* sa_optimizer = nullptr;
 
-  // ScopedAllocatorOptimizer must run last, so move it to the
-  // end of optimizers and run only on the last iteration.
-  {
-    int sa_index = 0;
-    for (; sa_index < optimizers.size(); ++sa_index) {
-      if (optimizers[sa_index]->name() == "scoped_allocator_optimizer") {
-        break;
-      }
-    }
-    const int last_index = optimizers.size() - 1;
-    if (sa_index < last_index) {
-      optimizers[last_index].swap(optimizers[sa_index]);
-    }
-  }
-
-  const int last_iteration = NumIterations(cfg_) - 1;
   for (int iteration = 0; iteration < NumIterations(cfg_); ++iteration) {
     VLOG(4) << "Starting optimization iteration " << iteration + 1;
 
@@ -241,37 +227,40 @@ Status MetaOptimizer::OptimizeGraph(Cluster* cluster, const GrapplerItem& item,
       // Some optimizers can run only once.
       if (iteration > 0 && IsRunOnceOptimizer(optimizer->name())) continue;
       // Some must run only on the last iteration.
-      if (optimizer->name() == "scoped_allocator_optimizer" &&
-          iteration != last_iteration)
+      if (optimizer->name() == "scoped_allocator_optimizer") {
+        if (sa_optimizer == nullptr) sa_optimizer = optimizer.get();
+        continue;
+      }
+      if (optimizer->name() == "xla-fusion") {
+        if (fusion_optimizer == nullptr) fusion_optimizer = optimizer.get();
         continue;
-
-      uint64 start_us = Env::Default()->NowMicros();
-      // This swaps the current optimized_graph into optimized item and
-      // resets optimized_graph to an empty graph.
-      optimized_graph->Swap(&optimized_item.graph);
-      *optimized_graph = GraphDef();
-      Status status =
-          optimizer->Optimize(cluster, optimized_item, optimized_graph);
-      uint64 end_us = Env::Default()->NowMicros();
-
-      string result;
-      if (!status.ok()) {
-        optimized_graph->Swap(&optimized_item.graph);
-        result = status.ToString();
-      } else {
-        is_optimized = true;
-        float duration_ms = (end_us - start_us) / 1000.0f;
-        result = strings::StrCat(
-            PrintSizesBeforeAfter(optimized_item.graph, *optimized_graph),
-            ", time = ", duration_ms, "ms.");
       }
-      VLOG(4) << optimizer->name() << ": " << result;
 
-      OptimizerResult optimizer_result{optimizer->name(), result};
-      optimization_result.results.push_back(optimizer_result);
+      Status status = RunOptimizer(optimizer.get(), cluster, &optimized_item,
+                                   optimized_graph, &optimization_result);
+      if (status.ok()) is_optimized = true;
     }
   }
 
+  // If requested, run the fusion optimizer once, after every other optimizer
+  // except ScopedAllocatorOptimizer (below), because: 1) it doesn't need to run
+  // more than once; 2) later passes must not break the fusion clusters. We
+  // could encapsulate the clusters right away, but that would block many
+  // optimizations: we have no shape inference for functions and can't
+  // optimize across function boundaries.
+  if (fusion_optimizer != nullptr) {
+    Status status = RunOptimizer(fusion_optimizer, cluster, &optimized_item,
+                                 optimized_graph, &optimization_result);
+    if (status.ok()) is_optimized = true;
+  }
+
+  // ScopedAllocatorOptimizer must run last.
+  if (sa_optimizer != nullptr) {
+    Status status = RunOptimizer(sa_optimizer, cluster, &optimized_item,
+                                 optimized_graph, &optimization_result);
+    if (status.ok()) is_optimized = true;
+  }
+
   // Record graph optimization result.
   optimization_results_.push_back(optimization_result);
 
@@ -286,6 +275,35 @@ Status MetaOptimizer::OptimizeGraph(Cluster* cluster, const GrapplerItem& item,
   return Status::OK();
 }
 
+Status MetaOptimizer::RunOptimizer(  // Runs one pass; returns its status.
+    GraphOptimizer* optimizer, Cluster* cluster, GrapplerItem* optimized_item,
+    GraphDef* optimized_graph, GraphOptimizationResult* optimization_result) {
+  uint64 start_us = Env::Default()->NowMicros();
+  // This swaps the current optimized_graph into optimized item and
+  // resets optimized_graph to an empty graph.
+  optimized_graph->Swap(&optimized_item->graph);
+  *optimized_graph = GraphDef();
+  Status status =
+      optimizer->Optimize(cluster, *optimized_item, optimized_graph);
+  uint64 end_us = Env::Default()->NowMicros();
+  // On failure restore the pre-run graph; on success report sizes and timing.
+  string result;
+  if (!status.ok()) {
+    optimized_graph->Swap(&optimized_item->graph);
+    result = status.ToString();
+  } else {
+    float duration_ms = (end_us - start_us) / 1000.0f;
+    result = strings::StrCat(
+        PrintSizesBeforeAfter(optimized_item->graph, *optimized_graph),
+        ", time = ", duration_ms, "ms.");
+  }
+  VLOG(4) << optimizer->name() << ": " << result;
+  // Record the outcome (success or error text) under the optimizer's name.
+  OptimizerResult optimizer_result{optimizer->name(), result};
+  optimization_result->results.push_back(optimizer_result);
+  return status;  // OptimizeGraph sets is_optimized when this is ok()
+}
+
 Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
                                GraphDef* optimized_graph) {
   optimization_results_.clear();
diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.h b/tensorflow/core/grappler/optimizers/meta_optimizer.h
index e736dd174e..151a54cbdf 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer.h
@@ -72,6 +72,10 @@ class MetaOptimizer : public GraphOptimizer {
     std::vector<OptimizerResult> results;
   };
 
+  Status RunOptimizer(GraphOptimizer* optimizer, Cluster* cluster,
+                      GrapplerItem* optimized_item, GraphDef* optimized_graph,
+                      GraphOptimizationResult* optimization_result);
+
   std::vector<GraphOptimizationResult> optimization_results_;
 };
 
-- 
GitLab


From a3c642c945b4a27e5d826eb9c9cbc07132cb2bba Mon Sep 17 00:00:00 2001
From: Brennan Saeta <saeta@google.com>
Date: Fri, 1 Jun 2018 18:00:43 -0700
Subject: [PATCH 290/610] Remove use of absl::make_unique

absl is not yet ready for use by open source TensorFlow. :-(

PiperOrigin-RevId: 198952953
---
 tensorflow/contrib/cloud/kernels/gcs_config_ops.cc | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/cloud/kernels/gcs_config_ops.cc b/tensorflow/contrib/cloud/kernels/gcs_config_ops.cc
index ef4998212e..648a219fb8 100644
--- a/tensorflow/contrib/cloud/kernels/gcs_config_ops.cc
+++ b/tensorflow/contrib/cloud/kernels/gcs_config_ops.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/core/platform/cloud/curl_http_request.h"
 #include "tensorflow/core/platform/cloud/gcs_file_system.h"
 #include "tensorflow/core/platform/cloud/oauth_client.h"
+#include "tensorflow/core/util/ptr_util.h"
 
 namespace tensorflow {
 namespace {
@@ -96,7 +97,8 @@ class GcsCredentialsOpKernel : public OpKernel {
         errors::InvalidArgument("JSON format incompatible; did not find fields "
                                 "`refresh_token` or `private_key`."));
 
-    auto provider = absl::make_unique<ConstantAuthProvider>(json, ctx->env());
+    auto provider =
+        tensorflow::MakeUnique<ConstantAuthProvider>(json, ctx->env());
 
     // Test getting a token
     string dummy_token;
@@ -121,7 +123,7 @@ class GcsCredentialsOpKernel : public OpKernel {
           initial_retry_delay_usec_(initial_retry_delay_usec) {}
 
     ConstantAuthProvider(const Json::Value& json, Env* env)
-        : ConstantAuthProvider(json, absl::make_unique<OAuthClient>(), env,
+        : ConstantAuthProvider(json, tensorflow::MakeUnique<OAuthClient>(), env,
                                kInitialRetryDelayUsec) {}
 
     ~ConstantAuthProvider() override {}
-- 
GitLab


From 6eb43fc26785c4835747a79b3d6a3e094ef1c60f Mon Sep 17 00:00:00 2001
From: Akshay Modi <nareshmodi@google.com>
Date: Mon, 4 Jun 2018 12:05:14 -0700
Subject: [PATCH 291/610] Fix test user ops

PiperOrigin-RevId: 199171316
---
 tensorflow/tools/ci_build/builds/test_user_ops.sh | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tensorflow/tools/ci_build/builds/test_user_ops.sh b/tensorflow/tools/ci_build/builds/test_user_ops.sh
index c342367bac..25ecee4725 100755
--- a/tensorflow/tools/ci_build/builds/test_user_ops.sh
+++ b/tensorflow/tools/ci_build/builds/test_user_ops.sh
@@ -239,8 +239,9 @@ function run_op() {
   fi
 }
 
-run_op $("${PYTHON_BIN_PATH}" -c "import tensorflow as tf; print(tf.Session('').run(tf.load_op_library('./${USER_OP_SO}').${USER_OP}(${OP_INPUT})))")
-run_op $("${PYTHON_BIN_PATH}" -c "import tensorflow as tf; tf.enable_eager_execution(); print(tf.load_op_library('./${USER_OP_SO}').${USER_OP}(${OP_INPUT}))") " in eager mode"
+run_op "$("${PYTHON_BIN_PATH}" -c "import tensorflow as tf; print(tf.Session('').run(tf.load_op_library('./${USER_OP_SO}').${USER_OP}(${OP_INPUT})))")"
+run_op "$("${PYTHON_BIN_PATH}" -c "import tensorflow as tf; tf.enable_eager_execution(); print(tf.load_op_library('./${USER_OP_SO}').${USER_OP}(${OP_INPUT}).numpy())")" " in eager mode"
+
 
 popd
 
-- 
GitLab


From 0bb7c844dd4375d7f53c88a7eacf78b0d6552498 Mon Sep 17 00:00:00 2001
From: Nupur Garg <nupurgarg@google.com>
Date: Mon, 4 Jun 2018 12:08:15 -0700
Subject: [PATCH 292/610] Fix Python API.

PiperOrigin-RevId: 199171845
---
 tensorflow/contrib/lite/python/convert_saved_model.py    | 4 ++--
 .../contrib/lite/python/convert_saved_model_test.py      | 9 +++++++--
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/tensorflow/contrib/lite/python/convert_saved_model.py b/tensorflow/contrib/lite/python/convert_saved_model.py
index b952a72aab..5dad49f1ed 100644
--- a/tensorflow/contrib/lite/python/convert_saved_model.py
+++ b/tensorflow/contrib/lite/python/convert_saved_model.py
@@ -216,9 +216,9 @@ def set_tensor_shapes(tensors, shapes):
   """
   if shapes:
     for tensor in tensors:
-      shape = shapes.get(tensor.name)
+      shape = shapes.get(tensor_name(tensor))
       if shape is not None:
-        tensor.set_shape(shapes[tensor.name])
+        tensor.set_shape(shape)
 
 
 def freeze_saved_model(saved_model_dir, input_arrays, input_shapes,
diff --git a/tensorflow/contrib/lite/python/convert_saved_model_test.py b/tensorflow/contrib/lite/python/convert_saved_model_test.py
index 80e5dc6e46..1e570d2c89 100644
--- a/tensorflow/contrib/lite/python/convert_saved_model_test.py
+++ b/tensorflow/contrib/lite/python/convert_saved_model_test.py
@@ -73,10 +73,15 @@ class TensorFunctionsTest(test_util.TensorFlowTestCase):
     tensor = array_ops.placeholder(shape=[None, 3, 5], dtype=dtypes.float32)
     self.assertEqual([None, 3, 5], tensor.shape.as_list())
 
-    convert_saved_model.set_tensor_shapes([tensor],
-                                          {"Placeholder:0": [5, 3, 5]})
+    convert_saved_model.set_tensor_shapes([tensor], {"Placeholder": [5, 3, 5]})
     self.assertEqual([5, 3, 5], tensor.shape.as_list())
 
+  def testSetTensorShapeNoneValid(self):
+    tensor = array_ops.placeholder(dtype=dtypes.float32)
+    # The placeholder was created without a shape; the mapped one must apply.
+    convert_saved_model.set_tensor_shapes([tensor], {"Placeholder": [1, 3, 5]})
+    self.assertEqual([1, 3, 5], tensor.shape.as_list())
+
   def testSetTensorShapeInvalid(self):
     tensor = array_ops.placeholder(shape=[None, 3, 5], dtype=dtypes.float32)
     self.assertEqual([None, 3, 5], tensor.shape.as_list())
-- 
GitLab


From bedf4eeb1361ef1483d9a0a6575f8c74d2eee572 Mon Sep 17 00:00:00 2001
From: Amit Patankar <amitpatankar@google.com>
Date: Mon, 4 Jun 2018 14:26:09 -0700
Subject: [PATCH 293/610] Fixing raspberry pi file for conflict.

---
 tensorflow/tools/ci_build/pi/build_raspberry_pi.sh          | 3 ---
 .../tools/ci_build/windows/cpu/pip/build_tf_windows.sh      | 4 ++++
 tools/bazel.rc                                              | 6 ------
 3 files changed, 4 insertions(+), 9 deletions(-)

diff --git a/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh b/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh
index cbd4a93e6d..4d1a30601e 100755
--- a/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh
+++ b/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh
@@ -102,9 +102,6 @@ bazel build -c opt ${PI_COPTS} \
   --copt=-fomit-frame-pointer --cpu=armeabi \
   --crosstool_top=@local_config_arm_compiler//:toolchain \
   --verbose_failures \
-  --distinct_host_configuration=true \
-  //tensorflow:libtensorflow.so \
-  //tensorflow:libtensorflow_framework.so \
   //tensorflow/tools/benchmark:benchmark_model \
   //tensorflow/tools/pip_package:build_pip_package
 
diff --git a/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh b/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
index 73520bb2ac..f4a0b232ec 100644
--- a/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
+++ b/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
@@ -73,6 +73,10 @@ if [[ "$release_build" != 1 ]]; then
   echo "build --define=override_eigen_strong_inline=true" >> "${TMP_BAZELRC}"
 fi
 
+# The host and target platforms are the same in the Windows build, so we don't
+# need to distinguish them. This avoids building the same targets twice.
+echo "build --distinct_host_configuration=false" >> "${TMP_BAZELRC}"
+
 echo "import %workspace%/${TMP_BAZELRC}" >> .bazelrc
 
 run_configure_for_cpu_build
diff --git a/tools/bazel.rc b/tools/bazel.rc
index 03aa52da1f..1c1e6afb65 100644
--- a/tools/bazel.rc
+++ b/tools/bazel.rc
@@ -1,14 +1,8 @@
-# By default, we don't distinct target and host platfroms.
-# When doing cross compilation, use --config=cross_compile to distinct them.
-build --distinct_host_configuration=false
-build:cross_compile --distinct_host_configuration=true
-
 # Android configs. Bazel needs to have --cpu and --fat_apk_cpu both set to the
 # target CPU to build transient dependencies correctly. See
 # https://docs.bazel.build/versions/master/user-manual.html#flag--fat_apk_cpu
 build:android --crosstool_top=//external:android/crosstool
 build:android --host_crosstool_top=@bazel_tools//tools/cpp:toolchain
-build:android --config=cross_compile
 build:android_arm --config=android
 build:android_arm --cpu=armeabi-v7a
 build:android_arm --fat_apk_cpu=armeabi-v7a
-- 
GitLab


From fedfc47ca6713adbbf82e10d4803c5fe94234bbd Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 4 Jun 2018 21:37:43 -0700
Subject: [PATCH 294/610] Resolve device names when passed into
 DistributionStrategy methods.

PiperOrigin-RevId: 199241723
---
 .../contrib/distribute/python/combinations.py | 26 +++++++++----------
 .../distribute/python/mirrored_strategy.py    |  9 ++++---
 .../contrib/distribute/python/values.py       |  7 ++---
 3 files changed, 22 insertions(+), 20 deletions(-)

diff --git a/tensorflow/contrib/distribute/python/combinations.py b/tensorflow/contrib/distribute/python/combinations.py
index e400fa5be2..98e7228f24 100644
--- a/tensorflow/contrib/distribute/python/combinations.py
+++ b/tensorflow/contrib/distribute/python/combinations.py
@@ -46,9 +46,9 @@ import unittest
 from absl.testing import parameterized
 import six
 
-from tensorflow.contrib.distribute.python import mirrored_strategy
-from tensorflow.contrib.distribute.python import one_device_strategy
-from tensorflow.contrib.distribute.python import tpu_strategy
+from tensorflow.contrib.distribute.python import mirrored_strategy as mirrored_lib
+from tensorflow.contrib.distribute.python import one_device_strategy as one_device_lib
+from tensorflow.contrib.distribute.python import tpu_strategy as tpu_lib
 from tensorflow.contrib.optimizer_v2 import adam as adam_v2
 from tensorflow.contrib.optimizer_v2 import gradient_descent as gradient_descent_v2
 from tensorflow.python.eager import context
@@ -289,9 +289,9 @@ class NamedObject(object):
 class NamedDistribution(object):
   """Translates DistributionStrategy and its data into a good name."""
 
-  def __init__(self, name, distribution, required_gpus=None,
+  def __init__(self, name, distribution_fn, required_gpus=None,
                required_tpu=False):
-    self._distribution = distribution
+    self._distribution_fn = distribution_fn
     self._name = name
     self._required_gpus = required_gpus
     self._required_tpu = required_tpu
@@ -301,7 +301,7 @@ class NamedDistribution(object):
 
   @property
   def strategy(self):
-    return self._distribution
+    return self._distribution_fn()
 
   @property
   def required_gpus(self):
@@ -312,29 +312,29 @@ class NamedDistribution(object):
     return self._required_tpu
 
 
+# pylint: disable=g-long-lambda
 default_strategy = NamedDistribution(
     "Default",
-    distribute_lib._default_distribution_strategy,  # pylint: disable=protected-access
+    lambda: distribute_lib._default_distribution_strategy,  # pylint: disable=protected-access
     required_gpus=None)
 one_device_strategy = NamedDistribution(
-    "OneDeviceCPU", one_device_strategy.OneDeviceStrategy("/cpu:0"),
+    "OneDeviceCPU", lambda: one_device_lib.OneDeviceStrategy("/cpu:0"),
     required_gpus=None)
 tpu_strategy_single_iteration = NamedDistribution(
     "TPUSingleIteration",
-    tpu_strategy.TPUStrategy(iterations_per_step=1),
+    lambda: tpu_lib.TPUStrategy(iterations_per_step=1),
     required_tpu=True)
-tpu_strategy = NamedDistribution(
-    "TPU", tpu_strategy.TPUStrategy(), required_tpu=True)
+tpu_strategy = NamedDistribution("TPU", tpu_lib.TPUStrategy, required_tpu=True)
 # Note that we disable prefetching for testing since prefetching makes
 # the input non-deterministic.
 mirrored_strategy_with_gpu_and_cpu = NamedDistribution(
     "MirroredCPUAndGPU",
-    mirrored_strategy.MirroredStrategy(
+    lambda: mirrored_lib.MirroredStrategy(
         ["/gpu:0", "/cpu:0"], prefetch_on_device=False),
     required_gpus=1)
 mirrored_strategy_with_two_gpus = NamedDistribution(
     "Mirrored2GPUs",
-    mirrored_strategy.MirroredStrategy(
+    lambda: mirrored_lib.MirroredStrategy(
         ["/gpu:0", "/gpu:1"], prefetch_on_device=False),
     required_gpus=2)
 
diff --git a/tensorflow/contrib/distribute/python/mirrored_strategy.py b/tensorflow/contrib/distribute/python/mirrored_strategy.py
index 14dbbd6e27..6eadba976b 100644
--- a/tensorflow/contrib/distribute/python/mirrored_strategy.py
+++ b/tensorflow/contrib/distribute/python/mirrored_strategy.py
@@ -84,9 +84,8 @@ class MirroredStrategy(distribute_lib.DistributionStrategy):
     assert len(set(devices)) == len(devices), (
         "No duplicates allowed in `devices` argument.")
     # TODO(josh11b): Require at least 2 devices?
-    self._devices = devices
-    self._canonical_device_set = set(
-        [device_util.canonicalize(d) for d in devices])
+    self._devices = [device_util.resolve(d) for d in devices]
+    self._canonical_device_set = set(self._devices)
     self._device_index = values.PerDevice(
         dict((d, i) for i, d in enumerate(devices)))
     self._cross_tower_ops = cross_tower_ops
@@ -400,7 +399,9 @@ class MirroredStrategy(distribute_lib.DistributionStrategy):
       # pylint: disable=protected-access
       return list(colocate_with._index.keys())
     elif isinstance(colocate_with, six.string_types):
-      return [colocate_with]
+      return [device_util.resolve(colocate_with)]
+    elif isinstance(colocate_with, list):
+      return [device_util.resolve(d) for d in colocate_with]
     else:
       return colocate_with
 
diff --git a/tensorflow/contrib/distribute/python/values.py b/tensorflow/contrib/distribute/python/values.py
index 49b4e24daa..9572ade8e4 100644
--- a/tensorflow/contrib/distribute/python/values.py
+++ b/tensorflow/contrib/distribute/python/values.py
@@ -65,9 +65,10 @@ class DistributedValues(object):
     device = device_util.canonicalize(device)
     try:
       return self._index[device]
-    except KeyError:
-      raise ValueError("Device %s not found in %s (current device %s)" %
-                       (device, self._index.keys(), device_util.current()))
+    except KeyError as e:
+      six.raise_from(
+          ValueError("Device %s not found in %s (current device %s)" %
+                     (device, self._index.keys(), device_util.current())), e)
 
   def on_device(self, device):
     device = device_util.canonicalize(device)
-- 
GitLab


From d660ab0c392562be89f02400e492bd54a7f9d6b0 Mon Sep 17 00:00:00 2001
From: Dimitris Vardoulakis <dimvar@google.com>
Date: Mon, 4 Jun 2018 22:09:11 -0700
Subject: [PATCH 295/610] [TF:XLA] Add method CreateNewModule to
 HloVerifiedTestBase, and remember all created modules, to verify at TearDown.

PiperOrigin-RevId: 199244092
---
 .../xla/service/algebraic_simplifier_test.cc  | 47 +++++++++----------
 .../xla/tests/hlo_verified_test_base.cc       | 20 +++++---
 .../xla/tests/hlo_verified_test_base.h        | 16 ++++++-
 3 files changed, 51 insertions(+), 32 deletions(-)

diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
index cda157f9fa..27eb48181e 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
@@ -1714,7 +1714,7 @@ TEST_F(AlgebraicSimplifierTest, RemoveNoopPad) {
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(module).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(), param);
 }
@@ -1759,7 +1759,7 @@ TEST_F(AlgebraicSimplifierTest, NegativePadding) {
   EXPECT_THAT(computation->root_instruction(), op::Pad(param, zero));
   EXPECT_TRUE(has_negative_padding(pad));
 
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(module).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(), op::Slice(op::Pad(param, zero)));
   EXPECT_FALSE(
@@ -1781,7 +1781,7 @@ TEST_F(AlgebraicSimplifierTest, RemoveNoopReshape) {
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(module).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(), param);
 }
@@ -1804,7 +1804,7 @@ TEST_F(AlgebraicSimplifierTest, RemoveNoopSlice) {
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(module).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(), param);
 }
@@ -1932,7 +1932,8 @@ TEST_F(AlgebraicSimplifierTest, ConvertConvToMatmul) {
     b.AddInstruction(HloInstruction::CreateConvolve(out_shape, input, filter,
                                                     window, dnums));
 
-    auto module = CreateNewModule();
+    // TODO(b/80488902): verify this module.
+    auto module = HloTestBase::CreateNewModule();
     auto* computation = module->AddEntryComputation(b.Build());
 
     AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/true,
@@ -2060,7 +2061,7 @@ TEST_F(AlgebraicSimplifierTest, MaxMinToClamp) {
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(module).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Clamp(max_value, param0, min_value));
@@ -2090,7 +2091,7 @@ TEST_F(AlgebraicSimplifierTest, MinMaxToClamp) {
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(module).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Clamp(max_value, param0, min_value));
@@ -2121,7 +2122,7 @@ TEST_F(AlgebraicSimplifierTest, MinMaxWithBroadcastToClamp) {
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(module).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Clamp(max_value, param0, min_value));
@@ -2151,7 +2152,7 @@ TEST_F(AlgebraicSimplifierTest, MinMaxNotToClamp) {
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  EXPECT_FALSE(simplifier.Run(module.get()).ValueOrDie());
+  EXPECT_FALSE(simplifier.Run(module).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Minimum(op::Maximum(param0, max_value), min_value));
@@ -2184,7 +2185,7 @@ TEST_F(AlgebraicSimplifierTest, MinEquationWithMaxNotToClamp) {
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  EXPECT_FALSE(simplifier.Run(module.get()).ValueOrDie());
+  EXPECT_FALSE(simplifier.Run(module).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Minimum(op::Add(op::Maximum(param0, max_value), max_value),
@@ -2200,10 +2201,8 @@ TEST_F(AlgebraicSimplifierTest, ScalarBroadcastToSlice) {
       HloInstruction::CreateParameter(0, r0f32, "scalar_param"));
 
   Shape broadcast_shape = ShapeUtil::MakeShape(F32, {4, 5, 6, 7});
-  HloInstruction* broadcast =
-      builder.AddInstruction(HloInstruction::CreateBroadcast(
-          broadcast_shape, scalar_param,
-          AsInt64Slice(broadcast_shape.dimensions())));
+  HloInstruction* broadcast = builder.AddInstruction(
+      HloInstruction::CreateBroadcast(broadcast_shape, scalar_param, {}));
 
   Shape slice_shape = ShapeUtil::MakeShape(F32, {2, 2, 3, 3});
   HloInstruction* slice = builder.AddInstruction(HloInstruction::CreateSlice(
@@ -2219,10 +2218,10 @@ TEST_F(AlgebraicSimplifierTest, ScalarBroadcastToSlice) {
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
 
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(module).ValueOrDie());
 
   // Running simplification again should not result in any further changes.
-  ASSERT_FALSE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_FALSE(simplifier.Run(module).ValueOrDie());
 
   root = computation->root_instruction();
   EXPECT_THAT(root, op::Broadcast(scalar_param));
@@ -2237,10 +2236,8 @@ TEST_F(AlgebraicSimplifierTest, ScalarBroadcastToTransposeReshape) {
       HloInstruction::CreateConstant(Literal::CreateR0<float>(42.0f)));
 
   Shape broadcast_shape = ShapeUtil::MakeShape(F32, {4, 5, 6});
-  HloInstruction* broadcast =
-      builder.AddInstruction(HloInstruction::CreateBroadcast(
-          broadcast_shape, forty_two,
-          AsInt64Slice(broadcast_shape.dimensions())));
+  HloInstruction* broadcast = builder.AddInstruction(
+      HloInstruction::CreateBroadcast(broadcast_shape, forty_two, {}));
 
   HloInstruction* transpose =
       builder.AddInstruction(HloInstruction::CreateTranspose(
@@ -2259,7 +2256,7 @@ TEST_F(AlgebraicSimplifierTest, ScalarBroadcastToTransposeReshape) {
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(module).ValueOrDie());
 
   root = computation->root_instruction();
   EXPECT_THAT(root, op::Broadcast(forty_two));
@@ -2268,7 +2265,8 @@ TEST_F(AlgebraicSimplifierTest, ScalarBroadcastToTransposeReshape) {
 
 // Test that ReduceWindow(Pad(op, x), y) can simplify to ReduceWindow(op, x).
 TEST_F(AlgebraicSimplifierTest, FoldPadIntoReduceWindow) {
-  auto module = CreateNewModule();
+  // TODO(b/80488902): verify this module.
+  auto module = HloTestBase::CreateNewModule();
   HloComputation::Builder builder(TestName());
 
   // Create operand to the pad.
@@ -2349,7 +2347,8 @@ TEST_F(AlgebraicSimplifierTest, FoldPadIntoReduceWindow) {
 // Test that ReduceWindow(Convert(Pad(op, x)), y) can simplify to
 // ReduceWindow(Convert(op), x).
 TEST_F(AlgebraicSimplifierTest, FoldConvertedPadIntoReduceWindow) {
-  auto module = CreateNewModule();
+  // TODO(b/80488902): verify this module.
+  auto module = HloTestBase::CreateNewModule();
   HloComputation::Builder builder(TestName());
 
   // Create operand to the pad.
@@ -2444,7 +2443,7 @@ TEST_F(AlgebraicSimplifierTest, ReversalOfTrivialDimensionsToBitcast) {
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(module).ValueOrDie());
 
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(a, root);
diff --git a/tensorflow/compiler/xla/tests/hlo_verified_test_base.cc b/tensorflow/compiler/xla/tests/hlo_verified_test_base.cc
index c8a05c2e9e..22c664d142 100644
--- a/tensorflow/compiler/xla/tests/hlo_verified_test_base.cc
+++ b/tensorflow/compiler/xla/tests/hlo_verified_test_base.cc
@@ -41,14 +41,17 @@ void HloVerifiedTestBase::TearDown() {
       << "TearDown called more than once; it should be called exactly once.";
   tear_down_called_ = true;
   if (module_) {
-    VerifyModule();
+    VerifyModule(module_.get());
+  }
+  for (int i = 0; i < modules_.size(); ++i) {
+    VerifyModule(modules_.at(i).get());
   }
   HloTestBase::TearDown();
 }
 
-void HloVerifiedTestBase::VerifyModule() {
-  HloVerifier verifier;
-  xla::StatusOr<bool> mutated = verifier.Run(module_.get());
+void HloVerifiedTestBase::VerifyModule(HloModule* module) {
+  HloVerifier verifier(/*allow_mixed_precision=*/true);
+  xla::StatusOr<bool> mutated = verifier.Run(module);
   if (!mutated.ok()) {
     ADD_FAILURE() << "HloVerifier failed: " << mutated.status();
   } else {
@@ -59,15 +62,20 @@ void HloVerifiedTestBase::VerifyModule() {
 
 HloModule& HloVerifiedTestBase::module() {
   if (!module_) {
-    module_ = CreateNewModule();
+    module_ = HloTestBase::CreateNewModule();
   }
   return *module_;
 }
 
+HloModule* HloVerifiedTestBase::CreateNewModule(const string& name) {
+  modules_.emplace_back(HloTestBase::CreateNewModule());
+  return modules_.back().get();
+}
+
 void HloVerifiedTestBase::ParseAndVerifyModule(
     tensorflow::StringPiece hlo_text) {
   CHECK(!module_) << "Called ParseModule when test already has a module.";
   TF_ASSERT_OK_AND_ASSIGN(module_, ParseHloString(hlo_text));
-  VerifyModule();
+  VerifyModule(module_.get());
 }
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/hlo_verified_test_base.h b/tensorflow/compiler/xla/tests/hlo_verified_test_base.h
index e5bb14a883..5b59cc77f6 100644
--- a/tensorflow/compiler/xla/tests/hlo_verified_test_base.h
+++ b/tensorflow/compiler/xla/tests/hlo_verified_test_base.h
@@ -52,11 +52,23 @@ class HloVerifiedTestBase : public HloTestBase {
     shape_verifier_ = std::move(shape_verifier);
   }
 
+  // Creates a new module for a test, and stores it in modules_ so it can be
+  // verified. Intentionally hides HloTestBase::CreateNewModule, to prevent
+  // creation of unverified modules.
+  HloModule* CreateNewModule(const string& name = TestName());
+
+  // It is confusing to store modules created by module() and CreateNewModule()
+  // in different fields, but it allows us to migrate tests to
+  // HloVerifiedTestBase more easily, so it's a win because we can verify more
+  // modules. See b/80488902.
  private:
-  std::unique_ptr<HloModule> module_;  // Lazily populated. Access via module().
+  // Lazily populated. Access via module().
+  std::unique_ptr<HloModule> module_;
+  // Populated by calls to CreateNewModule.
+  std::vector<std::unique_ptr<HloModule>> modules_;
   std::unique_ptr<ShapeVerifier> shape_verifier_;
   bool tear_down_called_ = false;
-  void VerifyModule();
+  static void VerifyModule(HloModule* module);
 };
 
 }  // namespace xla
-- 
GitLab


From bf8d058ccaf30bc05bce5d4b13133d14aca42dfe Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 5 Jun 2018 01:00:50 -0700
Subject: [PATCH 296/610] Windows: Refactor bazel_test_lib.sh and common_env.sh

- Removed the workaround for https://github.com/bazelbuild/bazel/issues/2182, since that issue is fixed.
- Stopped setting CUDA-related environment variables. They are assumed to be already set; if not,
  configure.py will set default values for them.
- Removed obsolete variables for cc_test targets.

PiperOrigin-RevId: 199256482
---
 .../ci_build/windows/bazel/bazel_test_lib.sh  | 116 +-----------------
 .../ci_build/windows/bazel/common_env.sh      |   5 -
 2 files changed, 3 insertions(+), 118 deletions(-)

diff --git a/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh b/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh
index 582188fc00..a3e07737a4 100644
--- a/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh
+++ b/tensorflow/tools/ci_build/windows/bazel/bazel_test_lib.sh
@@ -14,130 +14,20 @@
 # limitations under the License.
 # ==============================================================================
 #
-# C++ tests
-failing_cpu_cc_tests="\
-    //tensorflow/core/kernels:control_flow_ops_test + \
-    //tensorflow/core:example_example_parser_configuration_test + \
-    //tensorflow/core:lib_core_status_test + \
-    //tensorflow/core:lib_monitoring_collection_registry_test + \
-    //tensorflow/core:lib_strings_numbers_test + \
-    //tensorflow/core/platform/hadoop:hadoop_file_system_test + \
-    //tensorflow/core:platform_file_system_test + \
-    //tensorflow/core:platform_logging_test + \
-    //tensorflow/core:util_sparse_sparse_tensor_test + \
-    //tensorflow/cc:framework_gradient_checker_test + \
-    //tensorflow/cc:framework_gradients_test + \
-    //tensorflow/cc:gradients_array_grad_test + \
-    //tensorflow/cc:gradients_math_grad_test + \
-    //tensorflow/cc:gradients_nn_grad_test + \
-    //tensorflow/cc/saved_model:loader_test \
-"
-
-broken_cpu_cc_tests="\
-    //tensorflow/cc:framework_cc_ops_test + \
-    //tensorflow/core/platform/cloud:time_util_test + \
-    //tensorflow/core/platform/cloud:oauth_client_test + \
-    //tensorflow/core/platform/cloud:http_request_test + \
-    //tensorflow/core/platform/cloud:google_auth_provider_test + \
-    //tensorflow/core/platform/cloud:gcs_file_system_test + \
-    //tensorflow/core/kernels/cloud:bigquery_table_accessor_test + \
-    //tensorflow/core/kernels/hexagon:graph_transferer_test + \
-    //tensorflow/core/kernels:remote_fused_graph_execute_utils_test + \
-    //tensorflow/core/kernels:requantize_op_test + \
-    //tensorflow/core/kernels:requantization_range_op_test + \
-    //tensorflow/core/kernels:quantized_reshape_op_test + \
-    //tensorflow/core/kernels:quantized_pooling_ops_test + \
-    //tensorflow/core/kernels:quantized_matmul_op_test + \
-    //tensorflow/core/kernels:quantized_conv_ops_test + \
-    //tensorflow/core/kernels:quantized_concat_op_test + \
-    //tensorflow/core/kernels:quantized_bias_add_op_test + \
-    //tensorflow/core/kernels:quantized_batch_norm_op_test + \
-    //tensorflow/core/kernels:quantized_activation_ops_test + \
-    //tensorflow/core/kernels:quantize_op_test + \
-    //tensorflow/core/kernels:quantize_down_and_shrink_range_op_test + \
-    //tensorflow/core/kernels:quantize_and_dequantize_op_test_gpu + \
-    //tensorflow/core/kernels:quantize_and_dequantize_op_test + \
-    //tensorflow/core/kernels:quantization_utils_test + \
-    //tensorflow/core/kernels:debug_ops_test + \
-    //tensorflow/core/distributed_runtime/rpc:rpc_rendezvous_mgr_test_gpu + \
-    //tensorflow/core/distributed_runtime/rpc:rpc_rendezvous_mgr_test + \
-    //tensorflow/core/distributed_runtime/rpc:grpc_tensor_coding_test + \
-    //tensorflow/core/distributed_runtime/rpc:grpc_session_test_gpu + \
-    //tensorflow/core/distributed_runtime/rpc:grpc_session_test + \
-    //tensorflow/core/distributed_runtime/rpc:grpc_channel_test_gpu + \
-    //tensorflow/core/distributed_runtime/rpc:grpc_channel_test + \
-    //tensorflow/core/distributed_runtime:remote_device_test_gpu + \
-    //tensorflow/core/distributed_runtime:remote_device_test + \
-    //tensorflow/core/distributed_runtime:executor_test_gpu + \
-    //tensorflow/core/distributed_runtime:executor_test + \
-    //tensorflow/core/debug:debug_gateway_test + \
-    //tensorflow/core/debug:debug_grpc_io_utils_test + \
-    //tensorflow/core:util_reporter_test + \
-    //tensorflow/core:util_memmapped_file_system_test + \
-    //tensorflow/core:platform_subprocess_test + \
-    //tensorflow/core:platform_profile_utils_cpu_utils_test + \
-    //tensorflow/core:lib_jpeg_jpeg_mem_unittest + \
-    //tensorflow/core/debug:debug_io_utils_test \
-"
-
-# lib_core_threadpool_test is timeout, but it passes when running alone
-extra_failing_gpu_cc_tests="\
-    //tensorflow/core:lib_core_threadpool_test + \
-    //tensorflow/core:cuda_libdevice_path_test + \
-    //tensorflow/core:common_runtime_direct_session_test + \
-    //tensorflow/core:common_runtime_direct_session_with_tracking_alloc_test + \
-    //tensorflow/core:device_tracer_test + \
-    //tensorflow/core:ops_math_grad_test \
-"
-
-exclude_cpu_cc_tests="${failing_cpu_cc_tests} + ${broken_cpu_cc_tests}"
-
-exclude_gpu_cc_tests="${extra_failing_gpu_cc_tests} + ${exclude_cpu_cc_tests}"
 
 function run_configure_for_cpu_build {
-  # Due to a bug in Bazel: https://github.com/bazelbuild/bazel/issues/2182
-  # yes "" | ./configure doesn't work on Windows, so we set all the
-  # environment variables in advance to avoid interact with the script.
-  export TF_NEED_CUDA=0
-  if [ -z "$TF_ENABLE_XLA" ]; then
-    export TF_ENABLE_XLA=0
-  fi
-  if [ -z "$TF_NEED_MKL" ]; then
-    export TF_NEED_MKL=0
-  fi
-  export TF_NEED_VERBS=0
-  export TF_NEED_GCP=1
-  export TF_NEED_HDFS=0
-  export TF_NEED_OPENCL_SYCL=0
-  echo "" | ./configure
+  yes "" | ./configure
 }
 
 function run_configure_for_gpu_build {
-  # Due to a bug in Bazel: https://github.com/bazelbuild/bazel/issues/2182
-  # yes "" | ./configure doesn't work on Windows, so we set all the
-  # environment variables in advance to avoid interact with the script.
+  # Enable CUDA support
   export TF_NEED_CUDA=1
-  export TF_CUDA_VERSION=9.0
-  export CUDA_TOOLKIT_PATH="C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v9.0"
-  export TF_CUDNN_VERSION=7.0
-  if [ -z "$CUDNN_INSTALL_PATH" ]; then
-    export CUDNN_INSTALL_PATH="C:/tools/cuda"
-  fi
-  export TF_CUDA_COMPUTE_CAPABILITIES="3.7"
-  if [ -z "$TF_ENABLE_XLA" ]; then
-    export TF_ENABLE_XLA=0
-  fi
-  export TF_NEED_VERBS=0
-  export TF_NEED_MKL=0
-  export TF_NEED_GCP=0
-  export TF_NEED_HDFS=0
-  export TF_NEED_OPENCL_SYCL=0
 
   # TODO(pcloudy): Remove this after TensorFlow uses its own CRSOOTOOL
   # for GPU build on Windows
   export USE_MSVC_WRAPPER=1
 
-  echo "" | ./configure
+  yes "" | ./configure
 }
 
 function set_gcs_remote_cache_options {
diff --git a/tensorflow/tools/ci_build/windows/bazel/common_env.sh b/tensorflow/tools/ci_build/windows/bazel/common_env.sh
index 0e6c0227b7..eefa8ee2d5 100644
--- a/tensorflow/tools/ci_build/windows/bazel/common_env.sh
+++ b/tensorflow/tools/ci_build/windows/bazel/common_env.sh
@@ -49,8 +49,3 @@ export PATH="/c/Program Files/Git/cmd:$PATH"
 
 # Make sure we have pip in PATH
 export PATH="/c/${PYTHON_BASE_PATH}/Scripts:$PATH"
-
-# Add Cuda and Cudnn dll directories into PATH
-export PATH="/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v9.0/bin:$PATH"
-export PATH="/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v9.0/extras/CUPTI/libx64:$PATH"
-export PATH="/c/tools/cuda/bin:$PATH"
-- 
GitLab


From 540333664e90cd64afd99df24bda374368682a60 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 5 Jun 2018 01:57:19 -0700
Subject: [PATCH 297/610] Added missing backtick in tf.ones_like documentation

PiperOrigin-RevId: 199262414
---
 tensorflow/python/ops/array_ops.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py
index 3c4946ae5f..8129334703 100644
--- a/tensorflow/python/ops/array_ops.py
+++ b/tensorflow/python/ops/array_ops.py
@@ -1623,7 +1623,7 @@ def ones_like(tensor, dtype=None, name=None, optimize=True):
   Args:
     tensor: A `Tensor`.
     dtype: A type for the returned `Tensor`. Must be `float32`, `float64`,
-      `int8`, `uint8`, `int16`, `uint16`, int32`, `int64`,
+      `int8`, `uint8`, `int16`, `uint16`, `int32`, `int64`,
       `complex64`, `complex128` or `bool`.
     name: A name for the operation (optional).
     optimize: if true, attempt to statically determine the shape of 'tensor'
-- 
GitLab


From 92789d7a76cfd599c597d4639135241ff9988ef0 Mon Sep 17 00:00:00 2001
From: Tom Hennigan <tomhennigan@google.com>
Date: Tue, 5 Jun 2018 03:56:47 -0700
Subject: [PATCH 298/610] Handle scalar input to assert_equal in eager.

PiperOrigin-RevId: 199274329
---
 tensorflow/python/kernel_tests/check_ops_test.py | 7 +++++++
 tensorflow/python/ops/check_ops.py               | 4 ++--
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/kernel_tests/check_ops_test.py b/tensorflow/python/kernel_tests/check_ops_test.py
index 5a83ec8d30..7ef841c96b 100644
--- a/tensorflow/python/kernel_tests/check_ops_test.py
+++ b/tensorflow/python/kernel_tests/check_ops_test.py
@@ -88,6 +88,13 @@ class AssertEqualTest(test.TestCase):
       out = array_ops.identity(small)
     self.evaluate(out)
 
+  @test_util.run_in_graph_and_eager_modes()
+  def test_scalar_comparison(self):
+    const_true = constant_op.constant(True, name="true")
+    const_false = constant_op.constant(False, name="false")
+    with self.assertRaisesRegexp(errors.InvalidArgumentError, "fail"):
+      check_ops.assert_equal(const_true, const_false, message="fail")
+
   def test_returns_none_with_eager(self):
     with context.eager_mode():
       small = constant_op.constant([1, 2], name="small")
diff --git a/tensorflow/python/ops/check_ops.py b/tensorflow/python/ops/check_ops.py
index cabc1e724c..375a5ec2c3 100644
--- a/tensorflow/python/ops/check_ops.py
+++ b/tensorflow/python/ops/check_ops.py
@@ -341,8 +341,8 @@ def assert_equal(x, y, data=None, summarize=None, message=None, name=None):
                           y_sum, y_np[:y_sum]))
 
         index_and_values_str = ''
-        if x.shape == y.shape:
-          # If the shapes of x and y are the same,
+        if x.shape == y.shape and x.shape.as_list():
+          # If the shapes of x and y are the same (and not scalars),
           # Get the values that actually differed and their indices.
           # If shapes are different this information is more confusing
           # than useful.
-- 
GitLab


From 22a8c240d59a173ff3f17ffda05b521aa3f222de Mon Sep 17 00:00:00 2001
From: Dan Moldovan <mdan@google.com>
Date: Tue, 5 Jun 2018 07:27:58 -0700
Subject: [PATCH 299/610] Remove test dependencies that are no longer needed.

PiperOrigin-RevId: 199293694
---
 .../contrib/autograph/converters/control_flow_test.py     | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tensorflow/contrib/autograph/converters/control_flow_test.py b/tensorflow/contrib/autograph/converters/control_flow_test.py
index 1a863590f9..9d23d9b5b7 100644
--- a/tensorflow/contrib/autograph/converters/control_flow_test.py
+++ b/tensorflow/contrib/autograph/converters/control_flow_test.py
@@ -42,7 +42,7 @@ class ControlFlowTest(converter_test_base.TestCase):
     node = self.parse_and_analyze(test_fn, {})
     node = control_flow.transform(node, self.ctx)
 
-    with self.compiled(node, control_flow_ops.while_loop) as result:
+    with self.compiled(node) as result:
       with self.test_session() as sess:
         self.assertEqual((10, 5, 5),
                          sess.run(result.test_fn(constant_op.constant(5))))
@@ -57,7 +57,7 @@ class ControlFlowTest(converter_test_base.TestCase):
     node = self.parse_and_analyze(test_fn, {})
     node = control_flow.transform(node, self.ctx)
 
-    with self.compiled(node, control_flow_ops.while_loop) as result:
+    with self.compiled(node) as result:
       with self.test_session() as sess:
         self.assertEqual(0, sess.run(result.test_fn(constant_op.constant(5))))
 
@@ -75,7 +75,7 @@ class ControlFlowTest(converter_test_base.TestCase):
     node = self.parse_and_analyze(test_fn, {})
     node = control_flow.transform(node, self.ctx)
 
-    with self.compiled(node, control_flow_ops.cond) as result:
+    with self.compiled(node) as result:
       with self.test_session() as sess:
         self.assertEqual((-1, 0),
                          sess.run(result.test_fn(constant_op.constant(1))))
@@ -92,7 +92,7 @@ class ControlFlowTest(converter_test_base.TestCase):
     node = self.parse_and_analyze(test_fn, {})
     node = control_flow.transform(node, self.ctx)
 
-    with self.compiled(node, control_flow_ops.cond) as result:
+    with self.compiled(node) as result:
       with self.test_session() as sess:
         self.assertEqual(-1, sess.run(result.test_fn(constant_op.constant(1))))
 
-- 
GitLab


From c0dc76a3994c743151404b1401599fefb9f37dd4 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 5 Jun 2018 07:54:24 -0700
Subject: [PATCH 300/610] Fix generated_zip_test failure caused by regex
 matching failures.

PiperOrigin-RevId: 199296333
---
 .../testing/generated_examples_zip_test.cc    | 28 +++++++++----------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc b/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc
index 2f069ff8e7..e85020448a 100644
--- a/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc
+++ b/tensorflow/contrib/lite/testing/generated_examples_zip_test.cc
@@ -48,7 +48,7 @@ tensorflow::Env* env = tensorflow::Env::Default();
 // TODO(ahentz): make sure we clean this list up frequently.
 std::map<string, string> kBrokenTests = {
     // Add only supports float32. (and "constant" tests use Add)
-    {R"(^\/adda.*int32)", "68808744"},
+    {R"(^\/add_a.*int32)", "68808744"},
     {R"(^\/constant.*int32)", "68808744"},
     {R"(^\/mul.*int32)", "68808744"},
     {R"(^\/div.*int32)", "68808744"},
@@ -61,25 +61,25 @@ std::map<string, string> kBrokenTests = {
      "70527055"},
 
     // L2Norm only supports tensors with 4D or fewer.
-    {R"(^\/l2normdim=.*,epsilon=.*,input_shape=\[.,.,.,.,.*\])", "67963684"},
+    {R"(^\/l2norm_dim=.*,epsilon=.*,input_shape=\[.,.,.,.,.*\])", "67963684"},
 
     // SpaceToBatchND only supports 4D tensors.
     {R"(^\/space_to_batch_nd.*input_shape=\[1,4,4,4,1,1\])", "70848787"},
 
     // L2Norm only works for dim=-1.
-    {R"(^\/l2normdim=-2,epsilon=.*,input_shape=\[.,.\])", "67963812"},
-    {R"(^\/l2normdim=0,epsilon=.*,input_shape=\[.,.\])", "67963812"},
-    {R"(^\/l2normdim=-2,epsilon=.*,input_shape=\[3,15,14,3\])", "67963812"},
-    {R"(^\/l2normdim=-2,epsilon=.*,input_shape=\[1,3,4,3\])", "67963812"},
-    {R"(^\/l2normdim=2,epsilon=.*,input_shape=\[3,15,14,3\])", "67963812"},
-    {R"(^\/l2normdim=2,epsilon=.*,input_shape=\[1,3,4,3\])", "67963812"},
-    {R"(^\/l2normdim=0,epsilon=.*,input_shape=\[3,15,14,3\])", "67963812"},
-    {R"(^\/l2normdim=0,epsilon=.*,input_shape=\[1,3,4,3\])", "67963812"},
-    {R"(^\/l2normdim=1,epsilon=.*,input_shape=\[3,15,14,3\])", "67963812"},
-    {R"(^\/l2normdim=1,epsilon=.*,input_shape=\[1,3,4,3\])", "67963812"},
-    {R"(^\/l2normdim=\[2,3\],epsilon=.*,input_shape=\[3,15,14,3\])",
+    {R"(^\/l2norm_dim=-2,epsilon=.*,input_shape=\[.,.\])", "67963812"},
+    {R"(^\/l2norm_dim=0,epsilon=.*,input_shape=\[.,.\])", "67963812"},
+    {R"(^\/l2norm_dim=-2,epsilon=.*,input_shape=\[3,15,14,3\])", "67963812"},
+    {R"(^\/l2norm_dim=-2,epsilon=.*,input_shape=\[1,3,4,3\])", "67963812"},
+    {R"(^\/l2norm_dim=2,epsilon=.*,input_shape=\[3,15,14,3\])", "67963812"},
+    {R"(^\/l2norm_dim=2,epsilon=.*,input_shape=\[1,3,4,3\])", "67963812"},
+    {R"(^\/l2norm_dim=0,epsilon=.*,input_shape=\[3,15,14,3\])", "67963812"},
+    {R"(^\/l2norm_dim=0,epsilon=.*,input_shape=\[1,3,4,3\])", "67963812"},
+    {R"(^\/l2norm_dim=1,epsilon=.*,input_shape=\[3,15,14,3\])", "67963812"},
+    {R"(^\/l2norm_dim=1,epsilon=.*,input_shape=\[1,3,4,3\])", "67963812"},
+    {R"(^\/l2norm_dim=\[2,3\],epsilon=.*,input_shape=\[3,15,14,3\])",
      "67963812"},
-    {R"(^\/l2normdim=\[2,3\],epsilon=.*,input_shape=\[1,3,4,3\])", "67963812"},
+    {R"(^\/l2norm_dim=\[2,3\],epsilon=.*,input_shape=\[1,3,4,3\])", "67963812"},
 
     // ResizeBilinear looks completely incompatible with Tensorflow
     {R"(^\/resize_bilinear.*dtype=tf.int32)", "72401107"},
-- 
GitLab


From 274f9510f68f237589df5c6a414e4b8e5ebcdba1 Mon Sep 17 00:00:00 2001
From: Skye Wanderman-Milne <skyewm@google.com>
Date: Tue, 5 Jun 2018 08:13:07 -0700
Subject: [PATCH 301/610] Remove _USE_C_API staging from ops.py.

PiperOrigin-RevId: 199298594
---
 .../copy_graph/python/util/copy_elements.py   |   1 -
 tensorflow/contrib/graph_editor/transform.py  |   5 +-
 tensorflow/python/framework/ops.py            | 544 +++++-------------
 tensorflow/python/framework/ops_test.py       |   3 -
 4 files changed, 160 insertions(+), 393 deletions(-)

diff --git a/tensorflow/contrib/copy_graph/python/util/copy_elements.py b/tensorflow/contrib/copy_graph/python/util/copy_elements.py
index 102bc460fd..a0dd3881a8 100644
--- a/tensorflow/contrib/copy_graph/python/util/copy_elements.py
+++ b/tensorflow/contrib/copy_graph/python/util/copy_elements.py
@@ -218,7 +218,6 @@ def copy_op_to_graph(org_instance, to_graph, variables, scope=''):
                            new_control_inputs, input_types, new_original_op,
                            op_def)
     #Use Graph's hidden methods to add the op
-    to_graph._add_op(new_op)  # pylint: disable=protected-access
     to_graph._record_op_seen_by_control_dependencies(new_op)
     for device_function in reversed(to_graph._device_function_stack):
       new_op._set_device(device_function(new_op))
diff --git a/tensorflow/contrib/graph_editor/transform.py b/tensorflow/contrib/graph_editor/transform.py
index 592d37b432..026a3d1200 100644
--- a/tensorflow/contrib/graph_editor/transform.py
+++ b/tensorflow/contrib/graph_editor/transform.py
@@ -189,9 +189,6 @@ def copy_op_handler(info, op, new_inputs, copy_shape=True, nodedef_fn=None):
   if op._original_op:
     op_._original_op = op._original_op
 
-  # Add op to the graph
-  info.graph_._add_op(op_)
-
   return op_, op_.outputs
 
 
@@ -492,7 +489,7 @@ class Transformer(object):
       t_ = info.transformed_ts[t]
       consumer_op_ = info.transformed_ops[consumer_op]
       t_index_ = list(consumer_op_.inputs).index(tmp_t_)
-      consumer_op_._update_input(t_index_, t_, update_dtype=False)  # pylint: disable=protected-access
+      consumer_op_._update_input(t_index_, t_)  # pylint: disable=protected-access
 
   def _connect_control_inputs(self, info):
     """Connect the previously copied ops."""
diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py
index eceea5276a..b2fd98f431 100644
--- a/tensorflow/python/framework/ops.py
+++ b/tensorflow/python/framework/ops.py
@@ -56,6 +56,7 @@ from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import compat
 from tensorflow.python.util import decorator_utils
 from tensorflow.python.util import tf_contextlib
+from tensorflow.python.util.deprecation import deprecated_args
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -288,15 +289,8 @@ class Tensor(_TensorLike):
     self._value_index = value_index
     self._dtype = dtypes.as_dtype(dtype)
 
-    if _USE_C_API:
-      # This will be set by set_shape_and_handle_data_for_outputs.
-      self._shape_val = None
-    else:
-      # The Python code requires all tensors start with a shape to support shape
-      # inference on imported while loops. This isn't necessary with the C API
-      # enabled because the C API provides the shapes for imported nodes.
-      # TODO(skyewm): remove when _USE_C_API is removed.
-      self._shape_val = tensor_shape.unknown_shape()
+    # This will be set by self.shape().
+    self._shape_val = None
 
     # List of operations that use this Tensor as input.  We maintain this list
     # to easily navigate a computation graph.
@@ -384,7 +378,6 @@ class Tensor(_TensorLike):
       if _USE_C_SHAPES:
         self._shape_val = self._c_api_shape()
       else:
-        assert _USE_C_API
         # Call set_shape_and_handle_data_for_outputs in topological order on all
         # ops that are needed to compute self.op's shape. We do this instead of
         # having set_shape_and_handle_data_for_outputs recursively call
@@ -508,8 +501,6 @@ class Tensor(_TensorLike):
     else:
       self._shape_val = self.shape.merge_with(shape)
 
-    if not self._op._graph._c_graph: return
-
     # Update C shape even if _USE_C_SHAPES = False, since we still want
     # set_shape to be reflected in the C API graph for when we run it.
     if not isinstance(shape, tensor_shape.TensorShape):
@@ -545,33 +536,14 @@ class Tensor(_TensorLike):
     Returns:
       A list of `Operation`s.
     """
-    if self._op._c_op:  # pylint: disable=protected-access
-      consumer_names = c_api.TF_OperationOutputConsumers_wrapper(
-          self._as_tf_output())
-      # pylint: disable=protected-access
-      return [
-          self.graph._get_operation_by_name_unsafe(name)
-          for name in consumer_names
-      ]
-      # pylint: enable=protected-access
-    else:
-      return self._consumers
-
-  def _add_consumer(self, consumer):
-    """Add a consumer to this tensor.
-
-    Args:
-      consumer: an Operation.
-
-    Raises:
-      TypeError: if the consumer is not an Operation.
-    """
+    consumer_names = c_api.TF_OperationOutputConsumers_wrapper(
+        self._as_tf_output())
     # pylint: disable=protected-access
-    assert not self._op._c_op, "Tensor._add_consumer doesn't work with C API"
+    return [
+        self.graph._get_operation_by_name_unsafe(name)
+        for name in consumer_names
+    ]
     # pylint: enable=protected-access
-    if not isinstance(consumer, Operation):
-      raise TypeError("Consumer must be an Operation: %s" % consumer)
-    self._consumers.append(consumer)
 
   def _as_node_def_input(self):
     """Return a value to use for the NodeDef "input" attribute.
@@ -594,7 +566,6 @@ class Tensor(_TensorLike):
 
   def _as_tf_output(self):
     # pylint: disable=protected-access
-    assert self.op._c_op
     return c_api_util.tf_output(self.op._c_op, self.value_index)
     # pylint: enable=protected-access
 
@@ -1722,18 +1693,8 @@ class Operation(object):
                           "a Tensor, or IndexedSlices: %s" % c)
         control_input_ops.append(control_op)
 
-    # Don't set private fields with C API enabled to catch users who need to
-    # switch to public API.
-    # TODO(skyewm): delete these fields once we remove _USE_C_API
-    if not self._graph._c_graph:
-      self._inputs_val = list(inputs)  # Defensive copy.
-      self._input_types_val = input_types
-      self._control_inputs_val = control_input_ops
-      self._node_def_val = copy.deepcopy(node_def)
-      self._op_def_val = op_def
-    else:
-      # This will be set by self.inputs.
-      self._inputs_val = None
+    # This will be set by self.inputs.
+    self._inputs_val = None
 
     self._id_value = self._graph._next_id()  # pylint: disable=protected-access
     self._original_op = original_op
@@ -1742,10 +1703,8 @@ class Operation(object):
 
     # Initialize self._c_op.
     if c_op:
-      # TODO(skyewm): remove this assert when we remove USE_C_API
-      assert self._graph._c_graph  # pylint: disable=protected-access
       self._c_op = c_op
-    elif self._graph._c_graph:  # pylint: disable=protected-access
+    else:
       if op_def is None:
         op_def = self._graph._get_op_def(node_def.op)
       # TODO(skyewm): op_def_library.apply_op() flattens the incoming inputs.
@@ -1754,30 +1713,19 @@ class Operation(object):
           op_def, inputs, node_def.attr)
       self._c_op = _create_c_op(self._graph, node_def, grouped_inputs,
                                 control_input_ops)
-    else:
-      self._c_op = None
-
-    # Mark that we consume the inputs. This is unnecessary and unsupported with
-    # the C API enabled, since the C API tracks the tensor consumers instead.
-    if not self._c_op:
-      for input_tensor in self._inputs_val:
-        input_tensor._add_consumer(self)  # pylint: disable=protected-access
 
     # Initialize self._outputs.
-    if self._c_op:
-      num_outputs = c_api.TF_OperationNumOutputs(self._c_op)
-      output_types = [
-          c_api.TF_OperationOutputType(c_api_util.tf_output(self._c_op, i))
-          for i in range(num_outputs)]
-      assert output_types is not None
-    elif output_types is None:
-      output_types = []
-    self._output_types_val = output_types
+    num_outputs = c_api.TF_OperationNumOutputs(self._c_op)
+    output_types = [
+        c_api.TF_OperationOutputType(c_api_util.tf_output(self._c_op, i))
+        for i in range(num_outputs)]
     self._outputs = [
         Tensor(self, i, output_type)
         for i, output_type in enumerate(output_types)
     ]
 
+    self._graph._add_op(self)  # pylint: disable=protected-access
+
     if not c_op:
       self._control_flow_post_processing()
 
@@ -1791,7 +1739,6 @@ class Operation(object):
       control_flow_util.CheckInputFromValidContext(self, input_tensor.op)
     if self._control_flow_context is not None:
       self._control_flow_context.AddOp(self)
-    self._recompute_node_def()
 
   def _reconstruct_sequence_inputs(self, op_def, inputs, attrs):
     """Regroups a flat list of input tensors into scalar and sequence inputs.
@@ -1872,10 +1819,7 @@ class Operation(object):
   @property
   def name(self):
     """The full name of this operation."""
-    if self._c_op:
-      return c_api.TF_OperationName(self._c_op)
-    else:
-      return self._node_def_val.name
+    return c_api.TF_OperationName(self._c_op)
 
   @property
   def _id(self):
@@ -1891,10 +1835,7 @@ class Operation(object):
       assigned, or an empty string if it has not been assigned to a
       device.
     """
-    if self._c_op:
-      return c_api.TF_OperationDevice(self._c_op)
-    else:
-      return self._node_def_val.device
+    return c_api.TF_OperationDevice(self._c_op)
 
   @property
   def _output_types(self):
@@ -1907,28 +1848,21 @@ class Operation(object):
       The length of this list indicates the number of output endpoints
       of the operation.
     """
-    if self._c_op:
-      num_outputs = c_api.TF_OperationNumOutputs(self._c_op)
-      output_types = [
-          c_api.TF_OperationOutputType(self._tf_output(i))
-          for i in xrange(num_outputs)
-      ]
-      # TODO(iga): Remove this assert after converting to C API by default.
-      # Just being a bit paranoid here.
-      assert self._output_types_val == output_types
-      # In all the tests we have output_types that are passed into
-      # Operation.__init__ are a list of ints (which is illegal according
-      # to the docstring), but input_types are instances of DType.
-      # This extra assert is to catch if we ever use DType for output_types.
-      if output_types:
-        assert isinstance(output_types[0], int)
-      return output_types
-    else:
-      return self._output_types_val
+    num_outputs = c_api.TF_OperationNumOutputs(self._c_op)
+    output_types = [
+        c_api.TF_OperationOutputType(self._tf_output(i))
+        for i in xrange(num_outputs)
+    ]
+    # In all the tests we have output_types that are passed into
+    # Operation.__init__ are a list of ints (which is illegal according
+    # to the docstring), but input_types are instances of DType.
+    # This extra assert is to catch if we ever use DType for output_types.
+    if output_types:
+      assert isinstance(output_types[0], int)
+    return output_types
 
   def _tf_output(self, output_idx):
     """Create and return a new TF_Output for output_idx'th output of this op."""
-    assert self._c_op
     tf_output = c_api.TF_Output()
     tf_output.oper = self._c_op
     tf_output.index = output_idx
@@ -1936,7 +1870,6 @@ class Operation(object):
 
   def _tf_input(self, input_idx):
     """Create and return a new TF_Input for input_idx'th input of this op."""
-    assert self._c_op
     tf_input = c_api.TF_Input()
     tf_input.oper = self._c_op
     tf_input.index = input_idx
@@ -1948,47 +1881,12 @@ class Operation(object):
     Args:
       device: string or device..  The device to set.
     """
-    if self._c_op:
-      c_api.SetRequestedDevice(
-          self._graph._c_graph,  # pylint: disable=protected-access
-          self._c_op,  # pylint: disable=protected-access
-          compat.as_str(_device_string(device)))
-    else:
-      self._node_def_val.device = _device_string(device)
-
-  def _add_input(self, tensor, dtype=None):
-    """Add a new input to this operation.
-
-    Args:
-      tensor: the Tensor to add as an input.
-      dtype: tf.DType: type of the input; defaults to
-        the tensor's dtype.
+    c_api.SetRequestedDevice(
+        self._graph._c_graph,  # pylint: disable=protected-access
+        self._c_op,  # pylint: disable=protected-access
+        compat.as_str(_device_string(device)))
 
-    Raises:
-      TypeError: if tensor is not a Tensor,
-        or if input tensor type is not convertible to dtype.
-      ValueError: if the Tensor is from a different graph.
-    """
-    assert not self._c_op, (
-        "Operation._add_input doesn't work with C API")
-    if not isinstance(tensor, Tensor):
-      raise TypeError("tensor must be a Tensor: %s" % tensor)
-    _assert_same_graph(self, tensor)
-    if dtype is None:
-      dtype = tensor.dtype
-    else:
-      dtype = dtypes.as_dtype(dtype)
-      if not dtype.is_compatible_with(tensor.dtype):
-        raise TypeError(
-            "Cannot convert a tensor of type %s to an input of type %s" %
-            (tensor.dtype.name, dtype.name))
-    self._inputs_val.append(tensor)
-    self._input_types_val.append(dtype)
-    tensor._add_consumer(self)  # pylint: disable=protected-access
-    self._recompute_node_def()
-
-  # TODO(skyewm): Remove `update_dtype` when we enable the C API.
-  def _update_input(self, index, tensor, update_dtype=True):
+  def _update_input(self, index, tensor):
     """Update the input to this operation at the given index.
 
     NOTE: This is for TF internal use only. Please don't use it.
@@ -1996,7 +1894,6 @@ class Operation(object):
     Args:
       index: the index of the input to update.
       tensor: the Tensor to be used as the input at the given index.
-      update_dtype: If `False`, the type for this input is not updated.
 
     Raises:
       TypeError: if tensor is not a Tensor,
@@ -2013,20 +1910,12 @@ class Operation(object):
     if not _USE_C_SHAPES:
       set_shape_and_handle_data_for_outputs(self)
 
-    if self._c_op:
-      # Reset cached inputs.
-      self._inputs_val = None
-      c_api.UpdateEdge(
-          self._graph._c_graph,  # pylint: disable=protected-access
-          tensor._as_tf_output(),  # pylint: disable=protected-access
-          self._tf_input(index))
-    else:
-      self._inputs_val[index].consumers().remove(self)
-      self._inputs_val[index] = tensor
-      if update_dtype:
-        self._input_types_val[index] = tensor.dtype
-      tensor._add_consumer(self)  # pylint: disable=protected-access
-      self._recompute_node_def()
+    # Reset cached inputs.
+    self._inputs_val = None
+    c_api.UpdateEdge(
+        self._graph._c_graph,  # pylint: disable=protected-access
+        tensor._as_tf_output(),  # pylint: disable=protected-access
+        self._tf_input(index))
 
   def _add_control_inputs(self, ops):
     """Add a list of new control inputs to this operation.
@@ -2038,19 +1927,10 @@ class Operation(object):
       TypeError: if ops is not a list of Operations.
       ValueError: if any op in ops is from a different graph.
     """
-    if self._c_op:
-      for op in ops:
-        if not isinstance(op, Operation):
-          raise TypeError("op must be an Operation: %s" % op)
-        c_api.AddControlInput(self._graph._c_graph, self._c_op, op._c_op)  # pylint: disable=protected-access
-    else:
-      if ops:
-        for op in ops:
-          if not isinstance(op, Operation):
-            raise TypeError("op must be an Operation: %s" % op)
-          _assert_same_graph(self, op)
-          self._control_inputs_val.append(op)
-        self._recompute_node_def()
+    for op in ops:
+      if not isinstance(op, Operation):
+        raise TypeError("op must be an Operation: %s" % op)
+      c_api.AddControlInput(self._graph._c_graph, self._c_op, op._c_op)  # pylint: disable=protected-access
 
   def _add_control_input(self, op):
     """Add a new control input to this operation.
@@ -2062,33 +1942,13 @@ class Operation(object):
       TypeError: if op is not an Operation.
       ValueError: if op is from a different graph.
     """
-    if self._c_op:
-      if not isinstance(op, Operation):
-        raise TypeError("op must be an Operation: %s" % op)
-      c_api.AddControlInput(self._graph._c_graph, self._c_op, op._c_op)  # pylint: disable=protected-access
-    else:
-      self._add_control_inputs([op])
+    if not isinstance(op, Operation):
+      raise TypeError("op must be an Operation: %s" % op)
+    c_api.AddControlInput(self._graph._c_graph, self._c_op, op._c_op)  # pylint: disable=protected-access
 
   def _remove_all_control_inputs(self):
     """Removes any control inputs to this operation."""
-    if self._c_op:
-      c_api.RemoveAllControlInputs(self._graph._c_graph, self._c_op)  # pylint: disable=protected-access
-    else:
-      del self.control_inputs[:]
-
-  # Methods below are used when building the NodeDef and Graph proto.
-  def _recompute_node_def(self):
-    # TODO(skyewm): remove this function when we switch to C API
-    if self._c_op: return
-
-    del self._node_def_val.input[:]
-    # pylint: disable=protected-access
-    self._node_def_val.input.extend(
-        [t._as_node_def_input() for t in self._inputs_val])
-    # pylint: enable=protected-access
-    if self._control_inputs_val:
-      self._node_def_val.input.extend(
-          ["^%s" % op.name for op in self._control_inputs_val])
+    c_api.RemoveAllControlInputs(self._graph._c_graph, self._c_op)  # pylint: disable=protected-access
 
   def __str__(self):
     return str(self.node_def)
@@ -2129,19 +1989,16 @@ class Operation(object):
   @property
   def inputs(self):
     """The list of `Tensor` objects representing the data inputs of this op."""
-    if self._c_op:
-      if self._inputs_val is None:
-        tf_outputs = c_api.GetOperationInputs(self._c_op)
-        # pylint: disable=protected-access
-        retval = [
-            self.graph._get_tensor_by_tf_output(tf_output)
-            for tf_output in tf_outputs
-        ]
-        # pylint: enable=protected-access
-        self._inputs_val = Operation._InputList(retval)
-      return self._inputs_val
-    else:
-      return Operation._InputList(self._inputs_val)
+    if self._inputs_val is None:
+      tf_outputs = c_api.GetOperationInputs(self._c_op)
+      # pylint: disable=protected-access
+      retval = [
+          self.graph._get_tensor_by_tf_output(tf_output)
+          for tf_output in tf_outputs
+      ]
+      # pylint: enable=protected-access
+      self._inputs_val = Operation._InputList(retval)
+    return self._inputs_val
 
   @property
   def _inputs(self):
@@ -2155,15 +2012,12 @@ class Operation(object):
 
   @property
   def _input_types(self):
-    if self._c_op:
-      num_inputs = c_api.TF_OperationNumInputs(self._c_op)
-      input_types = [
-          dtypes.as_dtype(c_api.TF_OperationInputType(self._tf_input(i)))
-          for i in xrange(num_inputs)
-      ]
-      return input_types
-    else:
-      return self._input_types_val
+    num_inputs = c_api.TF_OperationNumInputs(self._c_op)
+    input_types = [
+        dtypes.as_dtype(c_api.TF_OperationInputType(self._tf_input(i)))
+        for i in xrange(num_inputs)
+    ]
+    return input_types
 
   @_input_types.setter
   def _input_types(self, value):
@@ -2183,16 +2037,13 @@ class Operation(object):
       A list of `Operation` objects.
 
     """
-    if self._c_op:
-      control_c_ops = c_api.TF_OperationGetControlInputs_wrapper(self._c_op)
-      # pylint: disable=protected-access
-      return [
-          self.graph._get_operation_by_name_unsafe(
-              c_api.TF_OperationName(c_op)) for c_op in control_c_ops
-      ]
-      # pylint: enable=protected-access
-    else:
-      return self._control_inputs_val
+    control_c_ops = c_api.TF_OperationGetControlInputs_wrapper(self._c_op)
+    # pylint: disable=protected-access
+    return [
+        self.graph._get_operation_by_name_unsafe(
+            c_api.TF_OperationName(c_op)) for c_op in control_c_ops
+    ]
+    # pylint: enable=protected-access
 
   @property
   def _control_outputs(self):
@@ -2205,18 +2056,13 @@ class Operation(object):
       A list of `Operation` objects.
 
     """
-    if self._c_op:
-      control_c_ops = c_api.TF_OperationGetControlOutputs_wrapper(self._c_op)
-      # pylint: disable=protected-access
-      return [
-          self.graph._get_operation_by_name_unsafe(
-              c_api.TF_OperationName(c_op)) for c_op in control_c_ops
-      ]
-      # pylint: enable=protected-access
-    else:
-      # TODO(apassos) this should be less inefficient.
-      return [o for o in self._graph.get_operations()
-              if self in o.control_inputs]
+    control_c_ops = c_api.TF_OperationGetControlOutputs_wrapper(self._c_op)
+    # pylint: disable=protected-access
+    return [
+        self.graph._get_operation_by_name_unsafe(
+            c_api.TF_OperationName(c_op)) for c_op in control_c_ops
+    ]
+    # pylint: enable=protected-access
 
   @property
   def _control_inputs(self):
@@ -2240,11 +2086,7 @@ class Operation(object):
   @property
   def type(self):
     """The type of the op (e.g. `"MatMul"`)."""
-    if self._c_op:
-      op_type = c_api.TF_OperationOpType(self._c_op)
-      return op_type
-    else:
-      return self._node_def_val.op
+    return c_api.TF_OperationOpType(self._c_op)
 
   @property
   def graph(self):
@@ -2262,15 +2104,12 @@ class Operation(object):
       protocol buffer.
     """
     # pylint: enable=line-too-long
-    if self._c_op:
-      with c_api_util.tf_buffer() as buf:
-        c_api.TF_OperationToNodeDef(self._c_op, buf)
-        data = c_api.TF_GetBuffer(buf)
-      node_def = node_def_pb2.NodeDef()
-      node_def.ParseFromString(compat.as_bytes(data))
-      return node_def
-    else:
-      return self._node_def_val
+    with c_api_util.tf_buffer() as buf:
+      c_api.TF_OperationToNodeDef(self._c_op, buf)
+      data = c_api.TF_GetBuffer(buf)
+    node_def = node_def_pb2.NodeDef()
+    node_def.ParseFromString(compat.as_bytes(data))
+    return node_def
 
   @property
   def _node_def(self):
@@ -2289,10 +2128,7 @@ class Operation(object):
       protocol buffer.
     """
     # pylint: enable=line-too-long
-    if self._c_op:
-      return self._graph._get_op_def(self.type)
-    else:
-      return self._op_def_val
+    return self._graph._get_op_def(self.type)
 
   @property
   def _op_def(self):
@@ -2318,17 +2154,14 @@ class Operation(object):
 
   def _set_attr(self, attr_name, attr_value):
     """Private method used to set an attribute in the node_def."""
-    if self._c_op:
-      buf = c_api.TF_NewBufferFromString(
-          compat.as_bytes(attr_value.SerializeToString()))
-      try:
-        # pylint: disable=protected-access
-        c_api.SetAttr(self._graph._c_graph, self._c_op, attr_name, buf)
-        # pylint: enable=protected-access
-      finally:
-        c_api.TF_DeleteBuffer(buf)
-    else:
-      self._node_def_val.attr[attr_name].CopyFrom(attr_value)
+    buf = c_api.TF_NewBufferFromString(
+        compat.as_bytes(attr_value.SerializeToString()))
+    try:
+      # pylint: disable=protected-access
+      c_api.SetAttr(self._graph._c_graph, self._c_op, attr_name, buf)
+      # pylint: enable=protected-access
+    finally:
+      c_api.TF_DeleteBuffer(buf)
 
   def get_attr(self, name):
     """Returns the value of the attr of this op with the given `name`.
@@ -2343,21 +2176,15 @@ class Operation(object):
       ValueError: If this op does not have an attr with the given `name`.
     """
     fields = ["s", "i", "f", "b", "type", "shape", "tensor", "func"]
-    if self._c_op:
-      try:
-        with c_api_util.tf_buffer() as buf:
-          c_api.TF_OperationGetAttrValueProto(self._c_op, name, buf)
-          data = c_api.TF_GetBuffer(buf)
-      except errors.InvalidArgumentError as e:
-        # Convert to ValueError for backwards compatibility.
-        raise ValueError(str(e))
-      x = attr_value_pb2.AttrValue()
-      x.ParseFromString(data)
-    else:
-      if name not in self._node_def_val.attr:
-        raise ValueError(
-            "No attr named '" + name + "' in " + str(self._node_def_val))
-      x = self._node_def_val.attr[name]
+    try:
+      with c_api_util.tf_buffer() as buf:
+        c_api.TF_OperationGetAttrValueProto(self._c_op, name, buf)
+        data = c_api.TF_GetBuffer(buf)
+    except errors.InvalidArgumentError as e:
+      # Convert to ValueError for backwards compatibility.
+      raise ValueError(str(e))
+    x = attr_value_pb2.AttrValue()
+    x.ParseFromString(data)
 
     # Treat an empty oneof value as an empty list.
     if not x.WhichOneof("value"):
@@ -2577,9 +2404,9 @@ def _set_shape_and_handle_data_for_outputs_c_api(op):
 def set_shape_and_handle_data_for_outputs(op):
   """Set the shapes and resource handle data for op's outputs.
 
-  When _USE_C_API = True, this is lazily called when a tensor's shape is first
-  requested. Usually this should work automatically, but some edge cases may
-  require manually calling this first to make sure Tensor._shape_val and
+  When _USE_C_SHAPES = False, this is lazily called when a tensor's shape is
+  first requested. Usually this should work automatically, but some edge cases
+  may require manually calling this first to make sure Tensor._shape_val and
   Tensor._handle_data are set (e.g. manually overriding _handle_data, copying a
   Tensor).
   """
@@ -3083,15 +2910,12 @@ class Graph(object):
       A `VersionDef`.
     """
     # pylint: enable=line-too-long
-    if self._c_graph:
-      with c_api_util.tf_buffer() as buf:
-        c_api.TF_GraphVersions(self._c_graph, buf)
-        data = c_api.TF_GetBuffer(buf)
-      version_def = versions_pb2.VersionDef()
-      version_def.ParseFromString(compat.as_bytes(data))
-      return version_def
-    else:
-      return self._graph_def_versions
+    with c_api_util.tf_buffer() as buf:
+      c_api.TF_GraphVersions(self._c_graph, buf)
+      data = c_api.TF_GetBuffer(buf)
+    version_def = versions_pb2.VersionDef()
+    version_def.ParseFromString(compat.as_bytes(data))
+    return version_def
 
   @property
   def seed(self):
@@ -3185,40 +3009,22 @@ class Graph(object):
 
     """
     # pylint: enable=line-too-long
-    if self._c_graph:
-      with self._lock:
-        with c_api_util.tf_buffer() as buf:
-          c_api.TF_GraphToGraphDef(self._c_graph, buf)
-          data = c_api.TF_GetBuffer(buf)
-        graph = graph_pb2.GraphDef()
-        graph.ParseFromString(compat.as_bytes(data))
-        # Strip the experimental library field iff it's empty.
-        if not graph.library.function:
-          graph.ClearField("library")
-
-        if add_shapes:
-          for node in graph.node:
-            op = self._nodes_by_name[node.name]
-            if op.outputs:
-              node.attr["_output_shapes"].list.shape.extend(
-                  [output.get_shape().as_proto() for output in op.outputs])
-    else:
-      with self._lock:
-        graph = graph_pb2.GraphDef()
-        graph.versions.CopyFrom(self._graph_def_versions)
-        bytesize = 0
-        for op_id in sorted(self._nodes_by_id):
-          op = self._nodes_by_id[op_id]
-          if from_version is None or op_id > from_version:
-            graph.node.extend([op.node_def])
-            if op.outputs and add_shapes:
-              assert "_output_shapes" not in graph.node[-1].attr
-              graph.node[-1].attr["_output_shapes"].list.shape.extend(
-                  [output.get_shape().as_proto() for output in op.outputs])
-            bytesize += op.node_def.ByteSize()
-            if bytesize >= (1 << 31) or bytesize < 0:
-              raise ValueError("GraphDef cannot be larger than 2GB.")
-        self._copy_functions_to_graph_def(graph, bytesize)
+    with self._lock:
+      with c_api_util.tf_buffer() as buf:
+        c_api.TF_GraphToGraphDef(self._c_graph, buf)
+        data = c_api.TF_GetBuffer(buf)
+      graph = graph_pb2.GraphDef()
+      graph.ParseFromString(compat.as_bytes(data))
+      # Strip the experimental library field iff it's empty.
+      if not graph.library.function:
+        graph.ClearField("library")
+
+      if add_shapes:
+        for node in graph.node:
+          op = self._nodes_by_name[node.name]
+          if op.outputs:
+            node.attr["_output_shapes"].list.shape.extend(
+                [output.get_shape().as_proto() for output in op.outputs])
     return graph, self._version
 
   def as_graph_def(self, from_version=None, add_shapes=False):
@@ -3292,34 +3098,16 @@ class Graph(object):
 
     # Add function to graph
     # pylint: disable=protected-access
-    if self._c_graph:
-      # Handle functions created without using the C API. TODO(apassos,skyewm)
-      # remove this when all functions are generated using the C API by default
-      # as this will be unnecessary.
-      if not function._c_func:
-        serialized = function.definition.SerializeToString()
-        c_func = c_api.TF_FunctionImportFunctionDef(serialized)
-        function._c_func = c_api_util.ScopedTFFunction(c_func)
-      gradient = (function._grad_func._c_func.func if function._grad_func
-                  else None)
-      c_api.TF_GraphCopyFunction(self._c_graph, function._c_func.func, gradient)
-    else:
-      # If there is already a function with the same name, raise an error
-      # if bodies are different. Else, do nothing. The C API version above
-      # has the same behavior.
-      previous = self._functions.get(name, None)
-      if previous:
-        # This check is not ideal as we can have a hash collision with only
-        # 32 bits in the hash, but the non C API mode is being deprecated.
-        # Don't bother changing it now.
-        if previous._hash_str == function._hash_str:
-          return
-        else:
-          raise ValueError("Cannot add function (%s, hash %s) to graph (%s). "
-                           "Another function (%s, hash %s) is already defined "
-                           "with that name (%s)" % (
-                               function, function._hash_str, self,
-                               previous, previous._hash_str, name))
+    # Handle functions created without using the C API. TODO(apassos,skyewm)
+    # remove this when all functions are generated using the C API by default
+    # as this will be unnecessary.
+    if not function._c_func:
+      serialized = function.definition.SerializeToString()
+      c_func = c_api.TF_FunctionImportFunctionDef(serialized)
+      function._c_func = c_api_util.ScopedTFFunction(c_func)
+    gradient = (function._grad_func._c_func.func if function._grad_func
+                else None)
+    c_api.TF_GraphCopyFunction(self._c_graph, function._c_func.func, gradient)
     # pylint: enable=protected-access
 
     self._functions[name] = function
@@ -3334,6 +3122,9 @@ class Graph(object):
     return self._building_function
 
   # Helper functions to create operations.
+  @deprecated_args(None,
+                   "Shapes are always computed; don't use the compute_shapes "
+                   "as it has no effect.", "compute_shapes")
   def create_op(
       self,
       op_type,
@@ -3370,8 +3161,8 @@ class Graph(object):
         proto).
       op_def: (Optional.) The `OpDef` proto that describes the `op_type` that
         the operation will have.
-      compute_shapes: (Optional.) If True, shape inference will be performed
-        to compute the shapes of the outputs.
+      compute_shapes: (Optional.) Deprecated. Has no effect (shapes are always
+        computed).
       compute_device: (Optional.) If True, device functions will be executed
         to compute the device property of the Operation.
 
@@ -3381,8 +3172,9 @@ class Graph(object):
 
     Returns:
       An `Operation` object.
-
     """
+    del compute_shapes
+
     self._check_not_finalized()
     for idx, a in enumerate(inputs):
       if not isinstance(a, Tensor):
@@ -3412,18 +3204,7 @@ class Graph(object):
           input_types=input_types,
           original_op=self._default_original_op,
           op_def=op_def)
-
-      # Note: shapes are lazily computed with the C API enabled.
-      #
-      # TODO(skyewm): unlike in the original Python implementation, the C API
-      # always computes shape information (even for function calls, which the
-      # original Python shape inference code doesn't handle). Deprecate the
-      # compute_shapes argument.
-      if not _USE_C_API and compute_shapes:
-        set_shape_and_handle_data_for_outputs(ret)
-
-      self._create_op_helper(ret, compute_shapes=compute_shapes,
-                             compute_device=compute_device)
+      self._create_op_helper(ret, compute_device=compute_device)
     return ret
 
   def _create_op_from_tf_operation(self, c_op, compute_device=True):
@@ -3457,11 +3238,8 @@ class Graph(object):
     self._create_op_helper(ret, compute_device=compute_device)
     return ret
 
-  def _create_op_helper(self, op, compute_shapes=True, compute_device=True):
+  def _create_op_helper(self, op, compute_device=True):
     """Common logic for creating an op in this graph."""
-    # TODO(b/XXXX): move to Operation.__init__ once _USE_C_API flag is removed.
-    self._add_op(op)
-
     # Apply any additional attributes requested. Do not overwrite any existing
     # attributes.
     for key, value in self._attr_scope_map.items():
@@ -3528,8 +3306,7 @@ class Graph(object):
     # (2) "is_stateful" is set in OpDef
     # (3) "container" attribute is in OpDef
     # (4) "container" attribute is None
-    # TODO(skyewm): remove op.op_def check when _USE_C_API is removed.
-    if self._container and op.op_def and op.op_def.is_stateful:
+    if self._container and op.op_def.is_stateful:
       try:
         container_attr = op.get_attr("container")
       except ValueError:
@@ -3816,17 +3593,14 @@ class Graph(object):
 
   def _get_op_def(self, type):  # pylint: disable=redefined-builtin
     """Returns the `OpDef` proto for `type`. `type` is a string."""
-    if self._c_graph:
-      with c_api_util.tf_buffer() as buf:
-        # pylint: disable=protected-access
-        c_api.TF_GraphGetOpDef(self._c_graph, compat.as_bytes(type), buf)
-        # pylint: enable=protected-access
-        data = c_api.TF_GetBuffer(buf)
-      op_def = op_def_pb2.OpDef()
-      op_def.ParseFromString(compat.as_bytes(data))
-      return op_def
-    else:
-      return self._registered_ops[type]
+    with c_api_util.tf_buffer() as buf:
+      # pylint: disable=protected-access
+      c_api.TF_GraphGetOpDef(self._c_graph, compat.as_bytes(type), buf)
+      # pylint: enable=protected-access
+      data = c_api.TF_GetBuffer(buf)
+    op_def = op_def_pb2.OpDef()
+    op_def.ParseFromString(compat.as_bytes(data))
+    return op_def
 
   def as_default(self):
     """Returns a context manager that makes this `Graph` the default graph.
diff --git a/tensorflow/python/framework/ops_test.py b/tensorflow/python/framework/ops_test.py
index e7732632f2..81355a279c 100644
--- a/tensorflow/python/framework/ops_test.py
+++ b/tensorflow/python/framework/ops_test.py
@@ -270,7 +270,6 @@ class OperationTest(test_util.TensorFlowTestCase):
     op1 = ops.Operation(
         ops._NodeDef("RefOutputFloatOutput", "op1"), g, [],
         [dtypes.float32_ref, dtypes.float32])
-    g._add_op(op1)
     self.assertProtoEquals("op:'RefOutputFloatOutput' name:'op1'", op1.node_def)
     self.assertEquals([], list(op1.inputs))
     ref_t, nonref_t = op1.values()
@@ -279,14 +278,12 @@ class OperationTest(test_util.TensorFlowTestCase):
         ops._NodeDef("RefInputFloatInput", "op2"),
         g, [ref_t, nonref_t], [],
         input_types=[dtypes.float32_ref, dtypes.float32])
-    g._add_op(op2)
     self.assertProtoEquals(
         "op:'RefInputFloatInput' name:'op2' input:'op1' input:'op1:1'",
         op2.node_def)
     self.assertEquals([ref_t, nonref_t], list(op2.inputs))
     op3 = ops.Operation(
         ops._NodeDef("TwoFloatInputs", "op3"), g, [ref_t, nonref_t], [])
-    g._add_op(op3)
     self.assertProtoEquals(
         "op:'TwoFloatInputs' name:'op3' input:'op1' input:'op1:1'",
         op3.node_def)
-- 
GitLab


From 3653e80488f490ad744410a92ac287acf7035bda Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Tue, 5 Jun 2018 08:20:41 -0700
Subject: [PATCH 302/610] Address compiler warnings in
 tensorflow/core/distributed_runtime.

PiperOrigin-RevId: 199299538
---
 tensorflow/core/distributed_runtime/local_master.h        | 2 +-
 tensorflow/core/distributed_runtime/master.cc             | 8 ++++----
 tensorflow/core/distributed_runtime/master_session.cc     | 7 +++----
 .../core/distributed_runtime/rpc/grpc_worker_service.cc   | 4 ++--
 4 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/tensorflow/core/distributed_runtime/local_master.h b/tensorflow/core/distributed_runtime/local_master.h
index cad6babad8..b9c76d0f1d 100644
--- a/tensorflow/core/distributed_runtime/local_master.h
+++ b/tensorflow/core/distributed_runtime/local_master.h
@@ -79,7 +79,7 @@ class LocalMaster : public MasterInterface {
                      RunCallableResponse* response) override;
   Status ReleaseCallable(CallOptions* call_options,
                          const ReleaseCallableRequest* request,
-                         ReleaseCallableResponse* response);
+                         ReleaseCallableResponse* response) override;
 
   // Registers the mapping from the given `target` to the given `master`.
   //
diff --git a/tensorflow/core/distributed_runtime/master.cc b/tensorflow/core/distributed_runtime/master.cc
index 4f9d84d158..a48f734d3e 100644
--- a/tensorflow/core/distributed_runtime/master.cc
+++ b/tensorflow/core/distributed_runtime/master.cc
@@ -473,7 +473,7 @@ void Master::PartialRunSetup(const PartialRunSetupRequest* req,
     return;
   }
 
-  SchedClosure([this, session, req, resp, done]() {
+  SchedClosure([session, req, resp, done]() {
     Status s = session->PartialRunSetup(req, resp);
     session->Unref();
     done(s);
@@ -628,7 +628,7 @@ void Master::MakeCallable(const MakeCallableRequest* req,
   }
 
   SchedClosure(std::bind(
-      [this, session, req, resp](MyClosure done) {
+      [session, req, resp](MyClosure done) {
         Status s = session->MakeCallable(*req, resp);
         session->Unref();
         done(s);
@@ -645,7 +645,7 @@ void Master::RunCallable(CallOptions* opts, const RunCallableRequest* req,
   }
 
   SchedClosure(std::bind(
-      [this, session, opts, req, resp](MyClosure done) {
+      [session, opts, req, resp](MyClosure done) {
         Status s = session->RunCallable(opts, *req, resp);
         session->Unref();
         done(s);
@@ -662,7 +662,7 @@ void Master::ReleaseCallable(const ReleaseCallableRequest* req,
   }
 
   SchedClosure(std::bind(
-      [this, session, req, resp](MyClosure done) {
+      [session, req, resp](MyClosure done) {
         Status s = session->ReleaseCallable(*req, resp);
         session->Unref();
         done(s);
diff --git a/tensorflow/core/distributed_runtime/master_session.cc b/tensorflow/core/distributed_runtime/master_session.cc
index bd70eca3f6..e29bb76ddf 100644
--- a/tensorflow/core/distributed_runtime/master_session.cc
+++ b/tensorflow/core/distributed_runtime/master_session.cc
@@ -156,8 +156,7 @@ class MasterSession::ReffedClientGraph : public core::RefCounted {
         LoggingResponse* resp = new LoggingResponse;
         p.worker->LoggingAsync(
             &req, resp,
-            [step_id, ss, resp, &scoped_mu, &waiting_for,
-             &all_done](const Status& s) {
+            [step_id, ss, resp, &scoped_mu, &all_done](const Status& s) {
               {
                 mutex_lock l(scoped_mu);
                 if (s.ok()) {
@@ -1207,7 +1206,7 @@ Status MasterSession::CreateWorkerSessions(
   std::vector<WorkerGroup> workers(worker_names.size());
 
   // Release the workers.
-  auto cleanup = gtl::MakeCleanup([this, &workers, worker_cache] {
+  auto cleanup = gtl::MakeCleanup([&workers, worker_cache] {
     for (auto&& worker_group : workers) {
       if (worker_group.worker != nullptr) {
         worker_cache->ReleaseWorker(*worker_group.name, worker_group.worker);
@@ -1289,7 +1288,7 @@ Status MasterSession::DeleteWorkerSessions() {
   std::vector<WorkerGroup> workers(worker_names.size());
 
   // Release the workers.
-  auto cleanup = gtl::MakeCleanup([this, &workers, worker_cache] {
+  auto cleanup = gtl::MakeCleanup([&workers, worker_cache] {
     for (auto&& worker_group : workers) {
       if (worker_group.worker != nullptr) {
         worker_cache->ReleaseWorker(*worker_group.name, worker_group.worker);
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc
index 2e7b111963..aa9304a033 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc
@@ -513,8 +513,8 @@ void GrpcWorker::RecvBufAsync(CallOptions* opts, const RecvBufRequest* request,
   CollectiveRemoteAccess* rma = ce_handle.get()->remote_access();
   rma->buf_rendezvous()->ConsumeBuf(
       request->buf_rendezvous_key(),
-      [this, opts, request, response, done](const Status& status,
-                                            BufRendezvous::Hook* hook) {
+      [this, request, response, done](const Status& status,
+                                      BufRendezvous::Hook* hook) {
         Status s = status;
         if (s.ok()) {
           if (!DMAHelper::CanUseDMA(hook->prod_value)) {
-- 
GitLab


From e1f31d40b9d12e687100a689bc5439d78702124c Mon Sep 17 00:00:00 2001
From: Tom Hennigan <tomhennigan@google.com>
Date: Tue, 5 Jun 2018 08:42:28 -0700
Subject: [PATCH 303/610] Expose `@tfe.run_all_tests_in_graph_and_eager_modes`.

PiperOrigin-RevId: 199302255
---
 tensorflow/contrib/eager/python/tfe.py   | 1 +
 tensorflow/python/framework/test_util.py | 1 +
 2 files changed, 2 insertions(+)

diff --git a/tensorflow/contrib/eager/python/tfe.py b/tensorflow/contrib/eager/python/tfe.py
index 5826700c73..fee9db46fa 100644
--- a/tensorflow/contrib/eager/python/tfe.py
+++ b/tensorflow/contrib/eager/python/tfe.py
@@ -115,6 +115,7 @@ from tensorflow.python.eager.execution_callbacks import seterr
 from tensorflow.python.framework.ops import enable_eager_execution
 from tensorflow.python.framework.ops import eager_run as run
 from tensorflow.python.framework.test_util import run_in_graph_and_eager_modes as run_test_in_graph_and_eager_modes
+from tensorflow.python.framework.test_util import run_all_in_graph_and_eager_modes as run_all_tests_in_graph_and_eager_modes
 from tensorflow.python.ops.custom_gradient import custom_gradient
 from tensorflow.python.ops.resource_variable_ops import ResourceVariable as Variable
 from tensorflow.python.ops.variable_scope import EagerVariableStore
diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py
index b56483f373..0c06d9aa41 100644
--- a/tensorflow/python/framework/test_util.py
+++ b/tensorflow/python/framework/test_util.py
@@ -644,6 +644,7 @@ def assert_no_garbage_created(f):
 
 
 def run_all_in_graph_and_eager_modes(cls):
+  """Execute all test methods in the given class with and without eager."""
   base_decorator = run_in_graph_and_eager_modes()
   for name, value in cls.__dict__.copy().items():
     if callable(value) and name.startswith("test"):
-- 
GitLab


From 51445a754dd3d6f3a7b2e89b8d02d0f467c36b63 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 5 Jun 2018 09:16:39 -0700
Subject: [PATCH 304/610] Add computed receptive field parameters from popular
 convnets.

PiperOrigin-RevId: 199306977
---
 tensorflow/contrib/receptive_field/README.md  |  32 +-
 .../receptive_field/RECEPTIVE_FIELD_TABLE.md  | 629 ++++++++++++++++++
 .../util/examples/csv_to_markdown_table.py    |  82 +++
 3 files changed, 740 insertions(+), 3 deletions(-)
 create mode 100644 tensorflow/contrib/receptive_field/RECEPTIVE_FIELD_TABLE.md
 create mode 100644 tensorflow/contrib/receptive_field/python/util/examples/csv_to_markdown_table.py

diff --git a/tensorflow/contrib/receptive_field/README.md b/tensorflow/contrib/receptive_field/README.md
index 3ff85faf61..79b015a916 100644
--- a/tensorflow/contrib/receptive_field/README.md
+++ b/tensorflow/contrib/receptive_field/README.md
@@ -6,6 +6,32 @@ region your output features depend on. Better yet, using the parameters computed
 by the library, you can easily find the exact image region which is used to
 compute each convnet feature.
 
+This library can be used to compute receptive field parameters of popular
+convnets:
+
+<center>
+
+convnet model       | receptive field | effective stride | effective padding
+:-----------------: | :-------------: | :--------------: | :---------------:
+alexnet_v2          | 195             | 32               | 64
+vgg_16              | 212             | 32               | 90
+inception_v2        | 699             | 32               | 318
+inception_v3        | 1311            | 32               | 618
+inception_v4        | 2071            | 32               | 998
+inception_resnet_v2 | 3039            | 32               | 1482
+mobilenet_v1        | 315             | 32               | 126
+mobilenet_v1_075    | 315             | 32               | 126
+resnet_v1_50        | 483             | 32               | 241
+resnet_v1_101       | 1027            | 32               | 513
+resnet_v1_152       | 1507            | 32               | 753
+resnet_v1_200       | 1763            | 32               | 881
+
+</center>
+
+A comprehensive table with pre-computed receptive field parameters for different
+end-points, input resolutions, and other variants of these networks can be found
+[here](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/receptive_field/RECEPTIVE_FIELD_TABLE.md).
+
 ## Basic usage
 
 The main function to be called is `compute_receptive_field_from_graph_def`,
@@ -96,9 +122,9 @@ The script will write to stdout the receptive field parameters for many variants
 of several popular convnets: AlexNet, VGG, ResNet, Inception, Mobilenet. They
 are also written to the file `/tmp/rf_benchmark_results.csv`.
 
-TODO: include here a plot for receptive field sizes of different convnets.
-
-TODO: include table/link to pre-computed RF parameters.
+A comprehensive table with pre-computed receptive field parameters for different
+networks can be found
+[here](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/receptive_field/RECEPTIVE_FIELD_TABLE.md).
 
 ## Compute RF parameters from a graph pbtxt
 
diff --git a/tensorflow/contrib/receptive_field/RECEPTIVE_FIELD_TABLE.md b/tensorflow/contrib/receptive_field/RECEPTIVE_FIELD_TABLE.md
new file mode 100644
index 0000000000..736fbef6e7
--- /dev/null
+++ b/tensorflow/contrib/receptive_field/RECEPTIVE_FIELD_TABLE.md
@@ -0,0 +1,629 @@
+# Pre-computed receptive field parameters
+
+## Table with results
+
+The table below presents the receptive field parameters for several popular
+convolutional neural networks. They were computed by running the
+[rf_benchmark
+script](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/receptive_field/python/util/examples/rf_benchmark.py)
+on the models from the [TF-Slim
+repository](https://github.com/tensorflow/models/tree/master/research/slim).
+
+Questions? See the [FAQ](#faq).
+
+CNN                            | resolution | end-point            | RF   | effective stride | effective padding
+:----------------------------: | :--------: | :------------------: | :--: | :--------------: | :---------------:
+alexnet_v2                     | None       | alexnet_v2/conv1     | 11   | 4                | 0
+alexnet_v2                     | None       | alexnet_v2/pool1     | 19   | 8                | 0
+alexnet_v2                     | None       | alexnet_v2/conv2     | 51   | 8                | 16
+alexnet_v2                     | None       | alexnet_v2/conv3     | 99   | 16               | 32
+alexnet_v2                     | None       | alexnet_v2/conv4     | 131  | 16               | 48
+alexnet_v2                     | None       | alexnet_v2/conv5     | 163  | 16               | 64
+alexnet_v2                     | None       | alexnet_v2/pool5     | 195  | 32               | 64
+alexnet_v2                     | 224        | alexnet_v2/conv1     | 11   | 4                | 0
+alexnet_v2                     | 224        | alexnet_v2/pool1     | 19   | 8                | 0
+alexnet_v2                     | 224        | alexnet_v2/conv2     | 51   | 8                | 16
+alexnet_v2                     | 224        | alexnet_v2/conv3     | 99   | 16               | 32
+alexnet_v2                     | 224        | alexnet_v2/conv4     | 131  | 16               | 48
+alexnet_v2                     | 224        | alexnet_v2/conv5     | 163  | 16               | 64
+alexnet_v2                     | 224        | alexnet_v2/pool5     | 195  | 32               | 64
+alexnet_v2                     | 321        | alexnet_v2/conv1     | 11   | 4                | 0
+alexnet_v2                     | 321        | alexnet_v2/pool1     | 19   | 8                | 0
+alexnet_v2                     | 321        | alexnet_v2/conv2     | 51   | 8                | 16
+alexnet_v2                     | 321        | alexnet_v2/conv3     | 99   | 16               | 32
+alexnet_v2                     | 321        | alexnet_v2/conv4     | 131  | 16               | 48
+alexnet_v2                     | 321        | alexnet_v2/conv5     | 163  | 16               | 64
+alexnet_v2                     | 321        | alexnet_v2/pool5     | 195  | 32               | 64
+vgg_a                          | None       | vgg_a/conv1/conv1_1  | 3    | 1                | 1
+vgg_a                          | None       | vgg_a/pool1          | 4    | 2                | 1
+vgg_a                          | None       | vgg_a/conv2/conv2_1  | 8    | 2                | 3
+vgg_a                          | None       | vgg_a/pool2          | 10   | 4                | 3
+vgg_a                          | None       | vgg_a/conv3/conv3_1  | 18   | 4                | 7
+vgg_a                          | None       | vgg_a/conv3/conv3_2  | 26   | 4                | 11
+vgg_a                          | None       | vgg_a/pool3          | 30   | 8                | 11
+vgg_a                          | None       | vgg_a/conv4/conv4_1  | 46   | 8                | 19
+vgg_a                          | None       | vgg_a/conv4/conv4_2  | 62   | 8                | 27
+vgg_a                          | None       | vgg_a/pool4          | 70   | 16               | 27
+vgg_a                          | None       | vgg_a/conv5/conv5_1  | 102  | 16               | 43
+vgg_a                          | None       | vgg_a/conv5/conv5_2  | 134  | 16               | 59
+vgg_a                          | None       | vgg_a/pool5          | 150  | 32               | 59
+vgg_a                          | 224        | vgg_a/conv1/conv1_1  | 3    | 1                | 1
+vgg_a                          | 224        | vgg_a/pool1          | 4    | 2                | 1
+vgg_a                          | 224        | vgg_a/conv2/conv2_1  | 8    | 2                | 3
+vgg_a                          | 224        | vgg_a/pool2          | 10   | 4                | 3
+vgg_a                          | 224        | vgg_a/conv3/conv3_1  | 18   | 4                | 7
+vgg_a                          | 224        | vgg_a/conv3/conv3_2  | 26   | 4                | 11
+vgg_a                          | 224        | vgg_a/pool3          | 30   | 8                | 11
+vgg_a                          | 224        | vgg_a/conv4/conv4_1  | 46   | 8                | 19
+vgg_a                          | 224        | vgg_a/conv4/conv4_2  | 62   | 8                | 27
+vgg_a                          | 224        | vgg_a/pool4          | 70   | 16               | 27
+vgg_a                          | 224        | vgg_a/conv5/conv5_1  | 102  | 16               | 43
+vgg_a                          | 224        | vgg_a/conv5/conv5_2  | 134  | 16               | 59
+vgg_a                          | 224        | vgg_a/pool5          | 150  | 32               | 59
+vgg_a                          | 321        | vgg_a/conv1/conv1_1  | 3    | 1                | 1
+vgg_a                          | 321        | vgg_a/pool1          | 4    | 2                | 1
+vgg_a                          | 321        | vgg_a/conv2/conv2_1  | 8    | 2                | 3
+vgg_a                          | 321        | vgg_a/pool2          | 10   | 4                | 3
+vgg_a                          | 321        | vgg_a/conv3/conv3_1  | 18   | 4                | 7
+vgg_a                          | 321        | vgg_a/conv3/conv3_2  | 26   | 4                | 11
+vgg_a                          | 321        | vgg_a/pool3          | 30   | 8                | 11
+vgg_a                          | 321        | vgg_a/conv4/conv4_1  | 46   | 8                | 19
+vgg_a                          | 321        | vgg_a/conv4/conv4_2  | 62   | 8                | 27
+vgg_a                          | 321        | vgg_a/pool4          | 70   | 16               | 27
+vgg_a                          | 321        | vgg_a/conv5/conv5_1  | 102  | 16               | 43
+vgg_a                          | 321        | vgg_a/conv5/conv5_2  | 134  | 16               | 59
+vgg_a                          | 321        | vgg_a/pool5          | 150  | 32               | 59
+vgg_16                         | None       | vgg_16/conv1/conv1_1 | 3    | 1                | 1
+vgg_16                         | None       | vgg_16/pool1         | 6    | 2                | 2
+vgg_16                         | None       | vgg_16/conv2/conv2_1 | 10   | 2                | 4
+vgg_16                         | None       | vgg_16/pool2         | 16   | 4                | 6
+vgg_16                         | None       | vgg_16/conv3/conv3_1 | 24   | 4                | 10
+vgg_16                         | None       | vgg_16/conv3/conv3_2 | 32   | 4                | 14
+vgg_16                         | None       | vgg_16/pool3         | 44   | 8                | 18
+vgg_16                         | None       | vgg_16/conv4/conv4_1 | 60   | 8                | 26
+vgg_16                         | None       | vgg_16/conv4/conv4_2 | 76   | 8                | 34
+vgg_16                         | None       | vgg_16/pool4         | 100  | 16               | 42
+vgg_16                         | None       | vgg_16/conv5/conv5_1 | 132  | 16               | 58
+vgg_16                         | None       | vgg_16/conv5/conv5_2 | 164  | 16               | 74
+vgg_16                         | None       | vgg_16/pool5         | 212  | 32               | 90
+vgg_16                         | 224        | vgg_16/conv1/conv1_1 | 3    | 1                | 1
+vgg_16                         | 224        | vgg_16/pool1         | 6    | 2                | 2
+vgg_16                         | 224        | vgg_16/conv2/conv2_1 | 10   | 2                | 4
+vgg_16                         | 224        | vgg_16/pool2         | 16   | 4                | 6
+vgg_16                         | 224        | vgg_16/conv3/conv3_1 | 24   | 4                | 10
+vgg_16                         | 224        | vgg_16/conv3/conv3_2 | 32   | 4                | 14
+vgg_16                         | 224        | vgg_16/pool3         | 44   | 8                | 18
+vgg_16                         | 224        | vgg_16/conv4/conv4_1 | 60   | 8                | 26
+vgg_16                         | 224        | vgg_16/conv4/conv4_2 | 76   | 8                | 34
+vgg_16                         | 224        | vgg_16/pool4         | 100  | 16               | 42
+vgg_16                         | 224        | vgg_16/conv5/conv5_1 | 132  | 16               | 58
+vgg_16                         | 224        | vgg_16/conv5/conv5_2 | 164  | 16               | 74
+vgg_16                         | 224        | vgg_16/pool5         | 212  | 32               | 90
+vgg_16                         | 321        | vgg_16/conv1/conv1_1 | 3    | 1                | 1
+vgg_16                         | 321        | vgg_16/pool1         | 6    | 2                | 2
+vgg_16                         | 321        | vgg_16/conv2/conv2_1 | 10   | 2                | 4
+vgg_16                         | 321        | vgg_16/pool2         | 16   | 4                | 6
+vgg_16                         | 321        | vgg_16/conv3/conv3_1 | 24   | 4                | 10
+vgg_16                         | 321        | vgg_16/conv3/conv3_2 | 32   | 4                | 14
+vgg_16                         | 321        | vgg_16/pool3         | 44   | 8                | 18
+vgg_16                         | 321        | vgg_16/conv4/conv4_1 | 60   | 8                | 26
+vgg_16                         | 321        | vgg_16/conv4/conv4_2 | 76   | 8                | 34
+vgg_16                         | 321        | vgg_16/pool4         | 100  | 16               | 42
+vgg_16                         | 321        | vgg_16/conv5/conv5_1 | 132  | 16               | 58
+vgg_16                         | 321        | vgg_16/conv5/conv5_2 | 164  | 16               | 74
+vgg_16                         | 321        | vgg_16/pool5         | 212  | 32               | 90
+inception_v2                   | None       | Conv2d_1a_7x7        | 7    | 2                | None
+inception_v2                   | None       | MaxPool_2a_3x3       | 11   | 4                | None
+inception_v2                   | None       | Conv2d_2b_1x1        | 11   | 4                | None
+inception_v2                   | None       | Conv2d_2c_3x3        | 19   | 4                | None
+inception_v2                   | None       | MaxPool_3a_3x3       | 27   | 8                | None
+inception_v2                   | None       | Mixed_3b             | 59   | 8                | None
+inception_v2                   | None       | Mixed_3c             | 91   | 8                | None
+inception_v2                   | None       | Mixed_4a             | 123  | 16               | None
+inception_v2                   | None       | Mixed_4b             | 187  | 16               | None
+inception_v2                   | None       | Mixed_4c             | 251  | 16               | None
+inception_v2                   | None       | Mixed_4d             | 315  | 16               | None
+inception_v2                   | None       | Mixed_4e             | 379  | 16               | None
+inception_v2                   | None       | Mixed_5a             | 443  | 32               | None
+inception_v2                   | None       | Mixed_5b             | 571  | 32               | None
+inception_v2                   | None       | Mixed_5c             | 699  | 32               | None
+inception_v2                   | 224        | Conv2d_1a_7x7        | 7    | 2                | 2
+inception_v2                   | 224        | MaxPool_2a_3x3       | 11   | 4                | 2
+inception_v2                   | 224        | Conv2d_2b_1x1        | 11   | 4                | 2
+inception_v2                   | 224        | Conv2d_2c_3x3        | 19   | 4                | 6
+inception_v2                   | 224        | MaxPool_3a_3x3       | 27   | 8                | 6
+inception_v2                   | 224        | Mixed_3b             | 59   | 8                | 22
+inception_v2                   | 224        | Mixed_3c             | 91   | 8                | 38
+inception_v2                   | 224        | Mixed_4a             | 123  | 16               | 46
+inception_v2                   | 224        | Mixed_4b             | 187  | 16               | 78
+inception_v2                   | 224        | Mixed_4c             | 251  | 16               | 110
+inception_v2                   | 224        | Mixed_4d             | 315  | 16               | 142
+inception_v2                   | 224        | Mixed_4e             | 379  | 16               | 174
+inception_v2                   | 224        | Mixed_5a             | 443  | 32               | 190
+inception_v2                   | 224        | Mixed_5b             | 571  | 32               | 254
+inception_v2                   | 224        | Mixed_5c             | 699  | 32               | 318
+inception_v2                   | 321        | Conv2d_1a_7x7        | 7    | 2                | 3
+inception_v2                   | 321        | MaxPool_2a_3x3       | 11   | 4                | 5
+inception_v2                   | 321        | Conv2d_2b_1x1        | 11   | 4                | 5
+inception_v2                   | 321        | Conv2d_2c_3x3        | 19   | 4                | 9
+inception_v2                   | 321        | MaxPool_3a_3x3       | 27   | 8                | 13
+inception_v2                   | 321        | Mixed_3b             | 59   | 8                | 29
+inception_v2                   | 321        | Mixed_3c             | 91   | 8                | 45
+inception_v2                   | 321        | Mixed_4a             | 123  | 16               | 61
+inception_v2                   | 321        | Mixed_4b             | 187  | 16               | 93
+inception_v2                   | 321        | Mixed_4c             | 251  | 16               | 125
+inception_v2                   | 321        | Mixed_4d             | 315  | 16               | 157
+inception_v2                   | 321        | Mixed_4e             | 379  | 16               | 189
+inception_v2                   | 321        | Mixed_5a             | 443  | 32               | 221
+inception_v2                   | 321        | Mixed_5b             | 571  | 32               | 285
+inception_v2                   | 321        | Mixed_5c             | 699  | 32               | 349
+inception_v2-no-separable-conv | None       | Conv2d_1a_7x7        | 7    | 2                | None
+inception_v2-no-separable-conv | None       | MaxPool_2a_3x3       | 11   | 4                | None
+inception_v2-no-separable-conv | None       | Conv2d_2b_1x1        | 11   | 4                | None
+inception_v2-no-separable-conv | None       | Conv2d_2c_3x3        | 19   | 4                | None
+inception_v2-no-separable-conv | None       | MaxPool_3a_3x3       | 27   | 8                | None
+inception_v2-no-separable-conv | None       | Mixed_3b             | 59   | 8                | None
+inception_v2-no-separable-conv | None       | Mixed_3c             | 91   | 8                | None
+inception_v2-no-separable-conv | None       | Mixed_4a             | 123  | 16               | None
+inception_v2-no-separable-conv | None       | Mixed_4b             | 187  | 16               | None
+inception_v2-no-separable-conv | None       | Mixed_4c             | 251  | 16               | None
+inception_v2-no-separable-conv | None       | Mixed_4d             | 315  | 16               | None
+inception_v2-no-separable-conv | None       | Mixed_4e             | 379  | 16               | None
+inception_v2-no-separable-conv | None       | Mixed_5a             | 443  | 32               | None
+inception_v2-no-separable-conv | None       | Mixed_5b             | 571  | 32               | None
+inception_v2-no-separable-conv | None       | Mixed_5c             | 699  | 32               | None
+inception_v2-no-separable-conv | 224        | Conv2d_1a_7x7        | 7    | 2                | 2
+inception_v2-no-separable-conv | 224        | MaxPool_2a_3x3       | 11   | 4                | 2
+inception_v2-no-separable-conv | 224        | Conv2d_2b_1x1        | 11   | 4                | 2
+inception_v2-no-separable-conv | 224        | Conv2d_2c_3x3        | 19   | 4                | 6
+inception_v2-no-separable-conv | 224        | MaxPool_3a_3x3       | 27   | 8                | 6
+inception_v2-no-separable-conv | 224        | Mixed_3b             | 59   | 8                | 22
+inception_v2-no-separable-conv | 224        | Mixed_3c             | 91   | 8                | 38
+inception_v2-no-separable-conv | 224        | Mixed_4a             | 123  | 16               | 46
+inception_v2-no-separable-conv | 224        | Mixed_4b             | 187  | 16               | 78
+inception_v2-no-separable-conv | 224        | Mixed_4c             | 251  | 16               | 110
+inception_v2-no-separable-conv | 224        | Mixed_4d             | 315  | 16               | 142
+inception_v2-no-separable-conv | 224        | Mixed_4e             | 379  | 16               | 174
+inception_v2-no-separable-conv | 224        | Mixed_5a             | 443  | 32               | 190
+inception_v2-no-separable-conv | 224        | Mixed_5b             | 571  | 32               | 254
+inception_v2-no-separable-conv | 224        | Mixed_5c             | 699  | 32               | 318
+inception_v2-no-separable-conv | 321        | Conv2d_1a_7x7        | 7    | 2                | 3
+inception_v2-no-separable-conv | 321        | MaxPool_2a_3x3       | 11   | 4                | 5
+inception_v2-no-separable-conv | 321        | Conv2d_2b_1x1        | 11   | 4                | 5
+inception_v2-no-separable-conv | 321        | Conv2d_2c_3x3        | 19   | 4                | 9
+inception_v2-no-separable-conv | 321        | MaxPool_3a_3x3       | 27   | 8                | 13
+inception_v2-no-separable-conv | 321        | Mixed_3b             | 59   | 8                | 29
+inception_v2-no-separable-conv | 321        | Mixed_3c             | 91   | 8                | 45
+inception_v2-no-separable-conv | 321        | Mixed_4a             | 123  | 16               | 61
+inception_v2-no-separable-conv | 321        | Mixed_4b             | 187  | 16               | 93
+inception_v2-no-separable-conv | 321        | Mixed_4c             | 251  | 16               | 125
+inception_v2-no-separable-conv | 321        | Mixed_4d             | 315  | 16               | 157
+inception_v2-no-separable-conv | 321        | Mixed_4e             | 379  | 16               | 189
+inception_v2-no-separable-conv | 321        | Mixed_5a             | 443  | 32               | 221
+inception_v2-no-separable-conv | 321        | Mixed_5b             | 571  | 32               | 285
+inception_v2-no-separable-conv | 321        | Mixed_5c             | 699  | 32               | 349
+inception_v3                   | None       | Conv2d_1a_3x3        | 3    | 2                | 0
+inception_v3                   | None       | Conv2d_2a_3x3        | 7    | 2                | 0
+inception_v3                   | None       | Conv2d_2b_3x3        | 11   | 2                | 2
+inception_v3                   | None       | MaxPool_3a_3x3       | 15   | 4                | 2
+inception_v3                   | None       | Conv2d_3b_1x1        | 15   | 4                | 2
+inception_v3                   | None       | Conv2d_4a_3x3        | 23   | 4                | 2
+inception_v3                   | None       | MaxPool_5a_3x3       | 31   | 8                | 2
+inception_v3                   | None       | Mixed_5b             | 63   | 8                | 18
+inception_v3                   | None       | Mixed_5c             | 95   | 8                | 34
+inception_v3                   | None       | Mixed_5d             | 127  | 8                | 50
+inception_v3                   | None       | Mixed_6a             | 159  | 16               | 58
+inception_v3                   | None       | Mixed_6b             | 351  | 16               | 154
+inception_v3                   | None       | Mixed_6c             | 543  | 16               | 250
+inception_v3                   | None       | Mixed_6d             | 735  | 16               | 346
+inception_v3                   | None       | Mixed_6e             | 927  | 16               | 442
+inception_v3                   | None       | Mixed_7a             | 1055 | 32               | 490
+inception_v3                   | None       | Mixed_7b             | 1183 | 32               | 554
+inception_v3                   | None       | Mixed_7c             | 1311 | 32               | 618
+inception_v3                   | 224        | Conv2d_1a_3x3        | 3    | 2                | 0
+inception_v3                   | 224        | Conv2d_2a_3x3        | 7    | 2                | 0
+inception_v3                   | 224        | Conv2d_2b_3x3        | 11   | 2                | 2
+inception_v3                   | 224        | MaxPool_3a_3x3       | 15   | 4                | 2
+inception_v3                   | 224        | Conv2d_3b_1x1        | 15   | 4                | 2
+inception_v3                   | 224        | Conv2d_4a_3x3        | 23   | 4                | 2
+inception_v3                   | 224        | MaxPool_5a_3x3       | 31   | 8                | 2
+inception_v3                   | 224        | Mixed_5b             | 63   | 8                | 18
+inception_v3                   | 224        | Mixed_5c             | 95   | 8                | 34
+inception_v3                   | 224        | Mixed_5d             | 127  | 8                | 50
+inception_v3                   | 224        | Mixed_6a             | 159  | 16               | 58
+inception_v3                   | 224        | Mixed_6b             | 351  | 16               | 154
+inception_v3                   | 224        | Mixed_6c             | 543  | 16               | 250
+inception_v3                   | 224        | Mixed_6d             | 735  | 16               | 346
+inception_v3                   | 224        | Mixed_6e             | 927  | 16               | 442
+inception_v3                   | 224        | Mixed_7a             | 1055 | 32               | 490
+inception_v3                   | 224        | Mixed_7b             | 1183 | 32               | 554
+inception_v3                   | 224        | Mixed_7c             | 1311 | 32               | 618
+inception_v3                   | 321        | Conv2d_1a_3x3        | 3    | 2                | 0
+inception_v3                   | 321        | Conv2d_2a_3x3        | 7    | 2                | 0
+inception_v3                   | 321        | Conv2d_2b_3x3        | 11   | 2                | 2
+inception_v3                   | 321        | MaxPool_3a_3x3       | 15   | 4                | 2
+inception_v3                   | 321        | Conv2d_3b_1x1        | 15   | 4                | 2
+inception_v3                   | 321        | Conv2d_4a_3x3        | 23   | 4                | 2
+inception_v3                   | 321        | MaxPool_5a_3x3       | 31   | 8                | 2
+inception_v3                   | 321        | Mixed_5b             | 63   | 8                | 18
+inception_v3                   | 321        | Mixed_5c             | 95   | 8                | 34
+inception_v3                   | 321        | Mixed_5d             | 127  | 8                | 50
+inception_v3                   | 321        | Mixed_6a             | 159  | 16               | 58
+inception_v3                   | 321        | Mixed_6b             | 351  | 16               | 154
+inception_v3                   | 321        | Mixed_6c             | 543  | 16               | 250
+inception_v3                   | 321        | Mixed_6d             | 735  | 16               | 346
+inception_v3                   | 321        | Mixed_6e             | 927  | 16               | 442
+inception_v3                   | 321        | Mixed_7a             | 1055 | 32               | 490
+inception_v3                   | 321        | Mixed_7b             | 1183 | 32               | 554
+inception_v3                   | 321        | Mixed_7c             | 1311 | 32               | 618
+inception_v4                   | None       | Conv2d_1a_3x3        | 3    | 2                | 0
+inception_v4                   | None       | Conv2d_2a_3x3        | 7    | 2                | 0
+inception_v4                   | None       | Conv2d_2b_3x3        | 11   | 2                | 2
+inception_v4                   | None       | Mixed_3a             | 15   | 4                | 2
+inception_v4                   | None       | Mixed_4a             | 47   | 4                | 14
+inception_v4                   | None       | Mixed_5a             | 55   | 8                | 14
+inception_v4                   | None       | Mixed_5b             | 87   | 8                | 30
+inception_v4                   | None       | Mixed_5c             | 119  | 8                | 46
+inception_v4                   | None       | Mixed_5d             | 151  | 8                | 62
+inception_v4                   | None       | Mixed_5e             | 183  | 8                | 78
+inception_v4                   | None       | Mixed_6a             | 215  | 16               | 86
+inception_v4                   | None       | Mixed_6b             | 407  | 16               | 182
+inception_v4                   | None       | Mixed_6c             | 599  | 16               | 278
+inception_v4                   | None       | Mixed_6d             | 791  | 16               | 374
+inception_v4                   | None       | Mixed_6e             | 983  | 16               | 470
+inception_v4                   | None       | Mixed_6f             | 1175 | 16               | 566
+inception_v4                   | None       | Mixed_6g             | 1367 | 16               | 662
+inception_v4                   | None       | Mixed_6h             | 1559 | 16               | 758
+inception_v4                   | None       | Mixed_7a             | 1687 | 32               | 806
+inception_v4                   | None       | Mixed_7b             | 1815 | 32               | 870
+inception_v4                   | None       | Mixed_7c             | 1943 | 32               | 934
+inception_v4                   | None       | Mixed_7d             | 2071 | 32               | 998
+inception_v4                   | 224        | Conv2d_1a_3x3        | 3    | 2                | 0
+inception_v4                   | 224        | Conv2d_2a_3x3        | 7    | 2                | 0
+inception_v4                   | 224        | Conv2d_2b_3x3        | 11   | 2                | 2
+inception_v4                   | 224        | Mixed_3a             | 15   | 4                | 2
+inception_v4                   | 224        | Mixed_4a             | 47   | 4                | 14
+inception_v4                   | 224        | Mixed_5a             | 55   | 8                | 14
+inception_v4                   | 224        | Mixed_5b             | 87   | 8                | 30
+inception_v4                   | 224        | Mixed_5c             | 119  | 8                | 46
+inception_v4                   | 224        | Mixed_5d             | 151  | 8                | 62
+inception_v4                   | 224        | Mixed_5e             | 183  | 8                | 78
+inception_v4                   | 224        | Mixed_6a             | 215  | 16               | 86
+inception_v4                   | 224        | Mixed_6b             | 407  | 16               | 182
+inception_v4                   | 224        | Mixed_6c             | 599  | 16               | 278
+inception_v4                   | 224        | Mixed_6d             | 791  | 16               | 374
+inception_v4                   | 224        | Mixed_6e             | 983  | 16               | 470
+inception_v4                   | 224        | Mixed_6f             | 1175 | 16               | 566
+inception_v4                   | 224        | Mixed_6g             | 1367 | 16               | 662
+inception_v4                   | 224        | Mixed_6h             | 1559 | 16               | 758
+inception_v4                   | 224        | Mixed_7a             | 1687 | 32               | 806
+inception_v4                   | 224        | Mixed_7b             | 1815 | 32               | 870
+inception_v4                   | 224        | Mixed_7c             | 1943 | 32               | 934
+inception_v4                   | 224        | Mixed_7d             | 2071 | 32               | 998
+inception_v4                   | 321        | Conv2d_1a_3x3        | 3    | 2                | 0
+inception_v4                   | 321        | Conv2d_2a_3x3        | 7    | 2                | 0
+inception_v4                   | 321        | Conv2d_2b_3x3        | 11   | 2                | 2
+inception_v4                   | 321        | Mixed_3a             | 15   | 4                | 2
+inception_v4                   | 321        | Mixed_4a             | 47   | 4                | 14
+inception_v4                   | 321        | Mixed_5a             | 55   | 8                | 14
+inception_v4                   | 321        | Mixed_5b             | 87   | 8                | 30
+inception_v4                   | 321        | Mixed_5c             | 119  | 8                | 46
+inception_v4                   | 321        | Mixed_5d             | 151  | 8                | 62
+inception_v4                   | 321        | Mixed_5e             | 183  | 8                | 78
+inception_v4                   | 321        | Mixed_6a             | 215  | 16               | 86
+inception_v4                   | 321        | Mixed_6b             | 407  | 16               | 182
+inception_v4                   | 321        | Mixed_6c             | 599  | 16               | 278
+inception_v4                   | 321        | Mixed_6d             | 791  | 16               | 374
+inception_v4                   | 321        | Mixed_6e             | 983  | 16               | 470
+inception_v4                   | 321        | Mixed_6f             | 1175 | 16               | 566
+inception_v4                   | 321        | Mixed_6g             | 1367 | 16               | 662
+inception_v4                   | 321        | Mixed_6h             | 1559 | 16               | 758
+inception_v4                   | 321        | Mixed_7a             | 1687 | 32               | 806
+inception_v4                   | 321        | Mixed_7b             | 1815 | 32               | 870
+inception_v4                   | 321        | Mixed_7c             | 1943 | 32               | 934
+inception_v4                   | 321        | Mixed_7d             | 2071 | 32               | 998
+inception_resnet_v2            | None       | Conv2d_1a_3x3        | 3    | 2                | 0
+inception_resnet_v2            | None       | Conv2d_2a_3x3        | 7    | 2                | 0
+inception_resnet_v2            | None       | Conv2d_2b_3x3        | 11   | 2                | 2
+inception_resnet_v2            | None       | MaxPool_3a_3x3       | 15   | 4                | 2
+inception_resnet_v2            | None       | Conv2d_3b_1x1        | 15   | 4                | 2
+inception_resnet_v2            | None       | Conv2d_4a_3x3        | 23   | 4                | 2
+inception_resnet_v2            | None       | MaxPool_5a_3x3       | 31   | 8                | 2
+inception_resnet_v2            | None       | Mixed_5b             | 63   | 8                | 18
+inception_resnet_v2            | None       | Mixed_6a             | 415  | 16               | 186
+inception_resnet_v2            | None       | PreAuxLogits         | 2335 | 16               | 1146
+inception_resnet_v2            | None       | Mixed_7a             | 2399 | 32               | 1162
+inception_resnet_v2            | None       | Conv2d_7b_1x1        | 3039 | 32               | 1482
+inception_resnet_v2            | 224        | Conv2d_1a_3x3        | 3    | 2                | 0
+inception_resnet_v2            | 224        | Conv2d_2a_3x3        | 7    | 2                | 0
+inception_resnet_v2            | 224        | Conv2d_2b_3x3        | 11   | 2                | 2
+inception_resnet_v2            | 224        | MaxPool_3a_3x3       | 15   | 4                | 2
+inception_resnet_v2            | 224        | Conv2d_3b_1x1        | 15   | 4                | 2
+inception_resnet_v2            | 224        | Conv2d_4a_3x3        | 23   | 4                | 2
+inception_resnet_v2            | 224        | MaxPool_5a_3x3       | 31   | 8                | 2
+inception_resnet_v2            | 224        | Mixed_5b             | 63   | 8                | 18
+inception_resnet_v2            | 224        | Mixed_6a             | 415  | 16               | 186
+inception_resnet_v2            | 224        | PreAuxLogits         | 2335 | 16               | 1146
+inception_resnet_v2            | 224        | Mixed_7a             | 2399 | 32               | 1162
+inception_resnet_v2            | 224        | Conv2d_7b_1x1        | 3039 | 32               | 1482
+inception_resnet_v2            | 321        | Conv2d_1a_3x3        | 3    | 2                | 0
+inception_resnet_v2            | 321        | Conv2d_2a_3x3        | 7    | 2                | 0
+inception_resnet_v2            | 321        | Conv2d_2b_3x3        | 11   | 2                | 2
+inception_resnet_v2            | 321        | MaxPool_3a_3x3       | 15   | 4                | 2
+inception_resnet_v2            | 321        | Conv2d_3b_1x1        | 15   | 4                | 2
+inception_resnet_v2            | 321        | Conv2d_4a_3x3        | 23   | 4                | 2
+inception_resnet_v2            | 321        | MaxPool_5a_3x3       | 31   | 8                | 2
+inception_resnet_v2            | 321        | Mixed_5b             | 63   | 8                | 18
+inception_resnet_v2            | 321        | Mixed_6a             | 415  | 16               | 186
+inception_resnet_v2            | 321        | PreAuxLogits         | 2335 | 16               | 1146
+inception_resnet_v2            | 321        | Mixed_7a             | 2399 | 32               | 1162
+inception_resnet_v2            | 321        | Conv2d_7b_1x1        | 3039 | 32               | 1482
+inception_resnet_v2-same       | None       | Conv2d_1a_3x3        | 3    | 2                | None
+inception_resnet_v2-same       | None       | Conv2d_2a_3x3        | 7    | 2                | None
+inception_resnet_v2-same       | None       | Conv2d_2b_3x3        | 11   | 2                | None
+inception_resnet_v2-same       | None       | MaxPool_3a_3x3       | 15   | 4                | None
+inception_resnet_v2-same       | None       | Conv2d_3b_1x1        | 15   | 4                | None
+inception_resnet_v2-same       | None       | Conv2d_4a_3x3        | 23   | 4                | None
+inception_resnet_v2-same       | None       | MaxPool_5a_3x3       | 31   | 8                | None
+inception_resnet_v2-same       | None       | Mixed_5b             | 63   | 8                | None
+inception_resnet_v2-same       | None       | Mixed_6a             | 415  | 16               | None
+inception_resnet_v2-same       | None       | PreAuxLogits         | 2335 | 16               | None
+inception_resnet_v2-same       | None       | Mixed_7a             | 2399 | 32               | None
+inception_resnet_v2-same       | None       | Conv2d_7b_1x1        | 3039 | 32               | None
+inception_resnet_v2-same       | 224        | Conv2d_1a_3x3        | 3    | 2                | 0
+inception_resnet_v2-same       | 224        | Conv2d_2a_3x3        | 7    | 2                | 2
+inception_resnet_v2-same       | 224        | Conv2d_2b_3x3        | 11   | 2                | 4
+inception_resnet_v2-same       | 224        | MaxPool_3a_3x3       | 15   | 4                | 4
+inception_resnet_v2-same       | 224        | Conv2d_3b_1x1        | 15   | 4                | 4
+inception_resnet_v2-same       | 224        | Conv2d_4a_3x3        | 23   | 4                | 8
+inception_resnet_v2-same       | 224        | MaxPool_5a_3x3       | 31   | 8                | 8
+inception_resnet_v2-same       | 224        | Mixed_5b             | 63   | 8                | 24
+inception_resnet_v2-same       | 224        | Mixed_6a             | 415  | 16               | 192
+inception_resnet_v2-same       | 224        | PreAuxLogits         | 2335 | 16               | 1152
+inception_resnet_v2-same       | 224        | Mixed_7a             | 2399 | 32               | 1168
+inception_resnet_v2-same       | 224        | Conv2d_7b_1x1        | 3039 | 32               | 1488
+inception_resnet_v2-same       | 321        | Conv2d_1a_3x3        | 3    | 2                | 1
+inception_resnet_v2-same       | 321        | Conv2d_2a_3x3        | 7    | 2                | 3
+inception_resnet_v2-same       | 321        | Conv2d_2b_3x3        | 11   | 2                | 5
+inception_resnet_v2-same       | 321        | MaxPool_3a_3x3       | 15   | 4                | 7
+inception_resnet_v2-same       | 321        | Conv2d_3b_1x1        | 15   | 4                | 7
+inception_resnet_v2-same       | 321        | Conv2d_4a_3x3        | 23   | 4                | 11
+inception_resnet_v2-same       | 321        | MaxPool_5a_3x3       | 31   | 8                | 15
+inception_resnet_v2-same       | 321        | Mixed_5b             | 63   | 8                | 31
+inception_resnet_v2-same       | 321        | Mixed_6a             | 415  | 16               | 207
+inception_resnet_v2-same       | 321        | PreAuxLogits         | 2335 | 16               | 1167
+inception_resnet_v2-same       | 321        | Mixed_7a             | 2399 | 32               | 1199
+inception_resnet_v2-same       | 321        | Conv2d_7b_1x1        | 3039 | 32               | 1519
+mobilenet_v1                   | None       | Conv2d_0             | 3    | 2                | None
+mobilenet_v1                   | None       | Conv2d_1_pointwise   | 7    | 2                | None
+mobilenet_v1                   | None       | Conv2d_2_pointwise   | 11   | 4                | None
+mobilenet_v1                   | None       | Conv2d_3_pointwise   | 19   | 4                | None
+mobilenet_v1                   | None       | Conv2d_4_pointwise   | 27   | 8                | None
+mobilenet_v1                   | None       | Conv2d_5_pointwise   | 43   | 8                | None
+mobilenet_v1                   | None       | Conv2d_6_pointwise   | 59   | 16               | None
+mobilenet_v1                   | None       | Conv2d_7_pointwise   | 91   | 16               | None
+mobilenet_v1                   | None       | Conv2d_8_pointwise   | 123  | 16               | None
+mobilenet_v1                   | None       | Conv2d_9_pointwise   | 155  | 16               | None
+mobilenet_v1                   | None       | Conv2d_10_pointwise  | 187  | 16               | None
+mobilenet_v1                   | None       | Conv2d_11_pointwise  | 219  | 16               | None
+mobilenet_v1                   | None       | Conv2d_12_pointwise  | 251  | 32               | None
+mobilenet_v1                   | None       | Conv2d_13_pointwise  | 315  | 32               | None
+mobilenet_v1                   | 224        | Conv2d_0             | 3    | 2                | 0
+mobilenet_v1                   | 224        | Conv2d_1_pointwise   | 7    | 2                | 2
+mobilenet_v1                   | 224        | Conv2d_2_pointwise   | 11   | 4                | 2
+mobilenet_v1                   | 224        | Conv2d_3_pointwise   | 19   | 4                | 6
+mobilenet_v1                   | 224        | Conv2d_4_pointwise   | 27   | 8                | 6
+mobilenet_v1                   | 224        | Conv2d_5_pointwise   | 43   | 8                | 14
+mobilenet_v1                   | 224        | Conv2d_6_pointwise   | 59   | 16               | 14
+mobilenet_v1                   | 224        | Conv2d_7_pointwise   | 91   | 16               | 30
+mobilenet_v1                   | 224        | Conv2d_8_pointwise   | 123  | 16               | 46
+mobilenet_v1                   | 224        | Conv2d_9_pointwise   | 155  | 16               | 62
+mobilenet_v1                   | 224        | Conv2d_10_pointwise  | 187  | 16               | 78
+mobilenet_v1                   | 224        | Conv2d_11_pointwise  | 219  | 16               | 94
+mobilenet_v1                   | 224        | Conv2d_12_pointwise  | 251  | 32               | 94
+mobilenet_v1                   | 224        | Conv2d_13_pointwise  | 315  | 32               | 126
+mobilenet_v1                   | 321        | Conv2d_0             | 3    | 2                | 1
+mobilenet_v1                   | 321        | Conv2d_1_pointwise   | 7    | 2                | 3
+mobilenet_v1                   | 321        | Conv2d_2_pointwise   | 11   | 4                | 5
+mobilenet_v1                   | 321        | Conv2d_3_pointwise   | 19   | 4                | 9
+mobilenet_v1                   | 321        | Conv2d_4_pointwise   | 27   | 8                | 13
+mobilenet_v1                   | 321        | Conv2d_5_pointwise   | 43   | 8                | 21
+mobilenet_v1                   | 321        | Conv2d_6_pointwise   | 59   | 16               | 29
+mobilenet_v1                   | 321        | Conv2d_7_pointwise   | 91   | 16               | 45
+mobilenet_v1                   | 321        | Conv2d_8_pointwise   | 123  | 16               | 61
+mobilenet_v1                   | 321        | Conv2d_9_pointwise   | 155  | 16               | 77
+mobilenet_v1                   | 321        | Conv2d_10_pointwise  | 187  | 16               | 93
+mobilenet_v1                   | 321        | Conv2d_11_pointwise  | 219  | 16               | 109
+mobilenet_v1                   | 321        | Conv2d_12_pointwise  | 251  | 32               | 125
+mobilenet_v1                   | 321        | Conv2d_13_pointwise  | 315  | 32               | 157
+mobilenet_v1_075               | None       | Conv2d_0             | 3    | 2                | None
+mobilenet_v1_075               | None       | Conv2d_1_pointwise   | 7    | 2                | None
+mobilenet_v1_075               | None       | Conv2d_2_pointwise   | 11   | 4                | None
+mobilenet_v1_075               | None       | Conv2d_3_pointwise   | 19   | 4                | None
+mobilenet_v1_075               | None       | Conv2d_4_pointwise   | 27   | 8                | None
+mobilenet_v1_075               | None       | Conv2d_5_pointwise   | 43   | 8                | None
+mobilenet_v1_075               | None       | Conv2d_6_pointwise   | 59   | 16               | None
+mobilenet_v1_075               | None       | Conv2d_7_pointwise   | 91   | 16               | None
+mobilenet_v1_075               | None       | Conv2d_8_pointwise   | 123  | 16               | None
+mobilenet_v1_075               | None       | Conv2d_9_pointwise   | 155  | 16               | None
+mobilenet_v1_075               | None       | Conv2d_10_pointwise  | 187  | 16               | None
+mobilenet_v1_075               | None       | Conv2d_11_pointwise  | 219  | 16               | None
+mobilenet_v1_075               | None       | Conv2d_12_pointwise  | 251  | 32               | None
+mobilenet_v1_075               | None       | Conv2d_13_pointwise  | 315  | 32               | None
+mobilenet_v1_075               | 224        | Conv2d_0             | 3    | 2                | 0
+mobilenet_v1_075               | 224        | Conv2d_1_pointwise   | 7    | 2                | 2
+mobilenet_v1_075               | 224        | Conv2d_2_pointwise   | 11   | 4                | 2
+mobilenet_v1_075               | 224        | Conv2d_3_pointwise   | 19   | 4                | 6
+mobilenet_v1_075               | 224        | Conv2d_4_pointwise   | 27   | 8                | 6
+mobilenet_v1_075               | 224        | Conv2d_5_pointwise   | 43   | 8                | 14
+mobilenet_v1_075               | 224        | Conv2d_6_pointwise   | 59   | 16               | 14
+mobilenet_v1_075               | 224        | Conv2d_7_pointwise   | 91   | 16               | 30
+mobilenet_v1_075               | 224        | Conv2d_8_pointwise   | 123  | 16               | 46
+mobilenet_v1_075               | 224        | Conv2d_9_pointwise   | 155  | 16               | 62
+mobilenet_v1_075               | 224        | Conv2d_10_pointwise  | 187  | 16               | 78
+mobilenet_v1_075               | 224        | Conv2d_11_pointwise  | 219  | 16               | 94
+mobilenet_v1_075               | 224        | Conv2d_12_pointwise  | 251  | 32               | 94
+mobilenet_v1_075               | 224        | Conv2d_13_pointwise  | 315  | 32               | 126
+mobilenet_v1_075               | 321        | Conv2d_0             | 3    | 2                | 1
+mobilenet_v1_075               | 321        | Conv2d_1_pointwise   | 7    | 2                | 3
+mobilenet_v1_075               | 321        | Conv2d_2_pointwise   | 11   | 4                | 5
+mobilenet_v1_075               | 321        | Conv2d_3_pointwise   | 19   | 4                | 9
+mobilenet_v1_075               | 321        | Conv2d_4_pointwise   | 27   | 8                | 13
+mobilenet_v1_075               | 321        | Conv2d_5_pointwise   | 43   | 8                | 21
+mobilenet_v1_075               | 321        | Conv2d_6_pointwise   | 59   | 16               | 29
+mobilenet_v1_075               | 321        | Conv2d_7_pointwise   | 91   | 16               | 45
+mobilenet_v1_075               | 321        | Conv2d_8_pointwise   | 123  | 16               | 61
+mobilenet_v1_075               | 321        | Conv2d_9_pointwise   | 155  | 16               | 77
+mobilenet_v1_075               | 321        | Conv2d_10_pointwise  | 187  | 16               | 93
+mobilenet_v1_075               | 321        | Conv2d_11_pointwise  | 219  | 16               | 109
+mobilenet_v1_075               | 321        | Conv2d_12_pointwise  | 251  | 32               | 125
+mobilenet_v1_075               | 321        | Conv2d_13_pointwise  | 315  | 32               | 157
+resnet_v1_50                   | None       | resnet_v1_50/block1  | 35   | 8                | None
+resnet_v1_50                   | None       | resnet_v1_50/block2  | 99   | 16               | None
+resnet_v1_50                   | None       | resnet_v1_50/block3  | 291  | 32               | None
+resnet_v1_50                   | None       | resnet_v1_50/block4  | 483  | 32               | None
+resnet_v1_50                   | 224        | resnet_v1_50/block1  | 35   | 8                | 15
+resnet_v1_50                   | 224        | resnet_v1_50/block2  | 99   | 16               | 47
+resnet_v1_50                   | 224        | resnet_v1_50/block3  | 291  | 32               | 143
+resnet_v1_50                   | 224        | resnet_v1_50/block4  | 483  | 32               | 239
+resnet_v1_50                   | 321        | resnet_v1_50/block1  | 35   | 8                | 17
+resnet_v1_50                   | 321        | resnet_v1_50/block2  | 99   | 16               | 49
+resnet_v1_50                   | 321        | resnet_v1_50/block3  | 291  | 32               | 145
+resnet_v1_50                   | 321        | resnet_v1_50/block4  | 483  | 32               | 241
+resnet_v1_101                  | None       | resnet_v1_101/block1 | 35   | 8                | None
+resnet_v1_101                  | None       | resnet_v1_101/block2 | 99   | 16               | None
+resnet_v1_101                  | None       | resnet_v1_101/block3 | 835  | 32               | None
+resnet_v1_101                  | None       | resnet_v1_101/block4 | 1027 | 32               | None
+resnet_v1_101                  | 224        | resnet_v1_101/block1 | 35   | 8                | 15
+resnet_v1_101                  | 224        | resnet_v1_101/block2 | 99   | 16               | 47
+resnet_v1_101                  | 224        | resnet_v1_101/block3 | 835  | 32               | 415
+resnet_v1_101                  | 224        | resnet_v1_101/block4 | 1027 | 32               | 511
+resnet_v1_101                  | 321        | resnet_v1_101/block1 | 35   | 8                | 17
+resnet_v1_101                  | 321        | resnet_v1_101/block2 | 99   | 16               | 49
+resnet_v1_101                  | 321        | resnet_v1_101/block3 | 835  | 32               | 417
+resnet_v1_101                  | 321        | resnet_v1_101/block4 | 1027 | 32               | 513
+resnet_v1_152                  | None       | resnet_v1_152/block1 | 35   | 8                | None
+resnet_v1_152                  | None       | resnet_v1_152/block2 | 163  | 16               | None
+resnet_v1_152                  | None       | resnet_v1_152/block3 | 1315 | 32               | None
+resnet_v1_152                  | None       | resnet_v1_152/block4 | 1507 | 32               | None
+resnet_v1_152                  | 224        | resnet_v1_152/block1 | 35   | 8                | 15
+resnet_v1_152                  | 224        | resnet_v1_152/block2 | 163  | 16               | 79
+resnet_v1_152                  | 224        | resnet_v1_152/block3 | 1315 | 32               | 655
+resnet_v1_152                  | 224        | resnet_v1_152/block4 | 1507 | 32               | 751
+resnet_v1_152                  | 321        | resnet_v1_152/block1 | 35   | 8                | 17
+resnet_v1_152                  | 321        | resnet_v1_152/block2 | 163  | 16               | 81
+resnet_v1_152                  | 321        | resnet_v1_152/block3 | 1315 | 32               | 657
+resnet_v1_152                  | 321        | resnet_v1_152/block4 | 1507 | 32               | 753
+resnet_v1_200                  | None       | resnet_v1_200/block1 | 35   | 8                | None
+resnet_v1_200                  | None       | resnet_v1_200/block2 | 419  | 16               | None
+resnet_v1_200                  | None       | resnet_v1_200/block3 | 1571 | 32               | None
+resnet_v1_200                  | None       | resnet_v1_200/block4 | 1763 | 32               | None
+resnet_v1_200                  | 224        | resnet_v1_200/block1 | 35   | 8                | 15
+resnet_v1_200                  | 224        | resnet_v1_200/block2 | 419  | 16               | 207
+resnet_v1_200                  | 224        | resnet_v1_200/block3 | 1571 | 32               | 783
+resnet_v1_200                  | 224        | resnet_v1_200/block4 | 1763 | 32               | 879
+resnet_v1_200                  | 321        | resnet_v1_200/block1 | 35   | 8                | 17
+resnet_v1_200                  | 321        | resnet_v1_200/block2 | 419  | 16               | 209
+resnet_v1_200                  | 321        | resnet_v1_200/block3 | 1571 | 32               | 785
+resnet_v1_200                  | 321        | resnet_v1_200/block4 | 1763 | 32               | 881
+resnet_v2_50                   | None       | resnet_v2_50/block1  | 35   | 8                | None
+resnet_v2_50                   | None       | resnet_v2_50/block2  | 99   | 16               | None
+resnet_v2_50                   | None       | resnet_v2_50/block3  | 291  | 32               | None
+resnet_v2_50                   | None       | resnet_v2_50/block4  | 483  | 32               | None
+resnet_v2_50                   | 224        | resnet_v2_50/block1  | 35   | 8                | 15
+resnet_v2_50                   | 224        | resnet_v2_50/block2  | 99   | 16               | 47
+resnet_v2_50                   | 224        | resnet_v2_50/block3  | 291  | 32               | 143
+resnet_v2_50                   | 224        | resnet_v2_50/block4  | 483  | 32               | 239
+resnet_v2_50                   | 321        | resnet_v2_50/block1  | 35   | 8                | 17
+resnet_v2_50                   | 321        | resnet_v2_50/block2  | 99   | 16               | 49
+resnet_v2_50                   | 321        | resnet_v2_50/block3  | 291  | 32               | 145
+resnet_v2_50                   | 321        | resnet_v2_50/block4  | 483  | 32               | 241
+resnet_v2_101                  | None       | resnet_v2_101/block1 | 35   | 8                | None
+resnet_v2_101                  | None       | resnet_v2_101/block2 | 99   | 16               | None
+resnet_v2_101                  | None       | resnet_v2_101/block3 | 835  | 32               | None
+resnet_v2_101                  | None       | resnet_v2_101/block4 | 1027 | 32               | None
+resnet_v2_101                  | 224        | resnet_v2_101/block1 | 35   | 8                | 15
+resnet_v2_101                  | 224        | resnet_v2_101/block2 | 99   | 16               | 47
+resnet_v2_101                  | 224        | resnet_v2_101/block3 | 835  | 32               | 415
+resnet_v2_101                  | 224        | resnet_v2_101/block4 | 1027 | 32               | 511
+resnet_v2_101                  | 321        | resnet_v2_101/block1 | 35   | 8                | 17
+resnet_v2_101                  | 321        | resnet_v2_101/block2 | 99   | 16               | 49
+resnet_v2_101                  | 321        | resnet_v2_101/block3 | 835  | 32               | 417
+resnet_v2_101                  | 321        | resnet_v2_101/block4 | 1027 | 32               | 513
+resnet_v2_152                  | None       | resnet_v2_152/block1 | 35   | 8                | None
+resnet_v2_152                  | None       | resnet_v2_152/block2 | 163  | 16               | None
+resnet_v2_152                  | None       | resnet_v2_152/block3 | 1315 | 32               | None
+resnet_v2_152                  | None       | resnet_v2_152/block4 | 1507 | 32               | None
+resnet_v2_152                  | 224        | resnet_v2_152/block1 | 35   | 8                | 15
+resnet_v2_152                  | 224        | resnet_v2_152/block2 | 163  | 16               | 79
+resnet_v2_152                  | 224        | resnet_v2_152/block3 | 1315 | 32               | 655
+resnet_v2_152                  | 224        | resnet_v2_152/block4 | 1507 | 32               | 751
+resnet_v2_152                  | 321        | resnet_v2_152/block1 | 35   | 8                | 17
+resnet_v2_152                  | 321        | resnet_v2_152/block2 | 163  | 16               | 81
+resnet_v2_152                  | 321        | resnet_v2_152/block3 | 1315 | 32               | 657
+resnet_v2_152                  | 321        | resnet_v2_152/block4 | 1507 | 32               | 753
+resnet_v2_200                  | None       | resnet_v2_200/block1 | 35   | 8                | None
+resnet_v2_200                  | None       | resnet_v2_200/block2 | 419  | 16               | None
+resnet_v2_200                  | None       | resnet_v2_200/block3 | 1571 | 32               | None
+resnet_v2_200                  | None       | resnet_v2_200/block4 | 1763 | 32               | None
+resnet_v2_200                  | 224        | resnet_v2_200/block1 | 35   | 8                | 15
+resnet_v2_200                  | 224        | resnet_v2_200/block2 | 419  | 16               | 207
+resnet_v2_200                  | 224        | resnet_v2_200/block3 | 1571 | 32               | 783
+resnet_v2_200                  | 224        | resnet_v2_200/block4 | 1763 | 32               | 879
+resnet_v2_200                  | 321        | resnet_v2_200/block1 | 35   | 8                | 17
+resnet_v2_200                  | 321        | resnet_v2_200/block2 | 419  | 16               | 209
+resnet_v2_200                  | 321        | resnet_v2_200/block3 | 1571 | 32               | 785
+resnet_v2_200                  | 321        | resnet_v2_200/block4 | 1763 | 32               | 881
+
+## FAQ
+
+### What does a resolution of 'None' mean?
+
+In this case, the input resolution is undefined. For most models, the receptive
+field parameters can be computed even without knowing the input resolution.
+
+### For some networks, effective_padding shows as 'None' (e.g., for Inception_v2 or Mobilenet_v1 when input size is not specified). Why is that?
+
+This means that the padding for these networks depends on the input size. So,
+unless we know exactly the input image dimensionality to be used, it is not
+possible to determine the padding applied at the different layers. Look at the
+other entries where the input size is fixed; for those cases, effective_padding
+is not None.
+
+This happens due to TensorFlow's implementation of the 'SAME' padding mode,
+which may depend on the input feature map size to a given layer. For background
+on this, see [these notes from the TF
+documentation](https://www.tensorflow.org/versions/master/api_guides/python/nn#Notes_on_SAME_Convolution_Padding).
+
+Also, note that in this case the program is not able to check if the network is
+aligned (i.e., it could be that the different paths from input to output have
+receptive fields which are not consistently centered at the same position in the
+input image).
+
+So you should be aware that such networks might not be aligned -- the program
+has no way of checking it when the padding cannot be determined.
+
+### The receptive field parameters for network X seem different from what I expected... maybe your calculation is incorrect?
+
+First, note that the results presented here are based on the TensorFlow
+implementations from the [TF-Slim model
+library](https://github.com/tensorflow/models/tree/master/research/slim).
+
+So, it is possible that due to some implementation details the RF parameters are
+different.
+
+One common case of confusion is the TF-Slim Resnet implementation, which applies
+stride in the last residual unit of each block, instead of at the input
+activations in the first residual unit of each block (which is what is described
+in the Resnet paper) -- see [this
+comment](https://github.com/tensorflow/models/blob/master/research/slim/nets/resnet_utils.py#L30).
+This makes the stride with respect to each convolution block potentially
+different. In this case, though, note that a
+[flag](https://github.com/tensorflow/models/blob/master/research/slim/nets/resnet_v1.py#L150)
+may be used to recover the original striding convention.
+
+Second, it could be that we have a bug somewhere. While we include [many
+tests](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/receptive_field/python/util/receptive_field_test.py)
+in our library, it is always possible that we missed something. If you suspect
+this is happening, please file a GitHub issue
+[here](https://github.com/tensorflow/tensorflow/issues).
diff --git a/tensorflow/contrib/receptive_field/python/util/examples/csv_to_markdown_table.py b/tensorflow/contrib/receptive_field/python/util/examples/csv_to_markdown_table.py
new file mode 100644
index 0000000000..4495d74bbf
--- /dev/null
+++ b/tensorflow/contrib/receptive_field/python/util/examples/csv_to_markdown_table.py
@@ -0,0 +1,82 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Simple script to convert CSV output from rf_benchmark to Markdown format.
+
+The input CSV should have the following fields:
+- CNN
+- input resolution
+- end_point
+- RF size hor
+- RF size ver
+- effective stride hor
+- effective stride ver
+- effective padding hor
+- effective padding ver
+
+Since the parameters in the horizontal and vertical directions are usually the
+same, this script assumes so (and asserts it for every row) and only prints one
+of them to the Markdown file.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import csv
+import sys
+
+from tensorflow.python.platform import app
+
+cmd_args = None
+
+
+def main(unused_argv):
+  with open(cmd_args.markdown_path, 'w') as f:
+    # Write table header and column-alignment row.
+    f.write('CNN | resolution | end-point | RF | effective stride | '
+            'effective padding|\n')
+    f.write(
+        ':--------------------: | :----------: | :---------------: | :-----: |'
+        ' :----: | :----:|\n')
+    with open(cmd_args.csv_path) as csvfile:
+      reader = csv.DictReader(csvfile)
+      for row in reader:
+        # Make sure horizontal and vertical parameters are the same.
+        assert row['RF size hor'] == row['RF size ver']
+        assert row['effective stride hor'] == row['effective stride ver']
+        assert row['effective padding hor'] == row['effective padding ver']
+
+        f.write('%s|%s|%s|%s|%s|%s\n' %
+                (row['CNN'], row['input resolution'], row['end_point'],
+                 row['RF size hor'], row['effective stride hor'],
+                 row['effective padding hor']))
+
+
+if __name__ == '__main__':
+  parser = argparse.ArgumentParser()
+  parser.register('type', 'bool', lambda v: v.lower() == 'true')
+  parser.add_argument(
+      '--csv_path',
+      type=str,
+      default='/tmp/rf.csv',
+      help='Path where CSV output of rf_benchmark was saved.')
+  parser.add_argument(
+      '--markdown_path',
+      type=str,
+      default='/tmp/rf.md',
+      help='Path where Markdown output will be saved.')
+  cmd_args, unparsed = parser.parse_known_args()
+  app.run(main=main, argv=[sys.argv[0]] + unparsed)
-- 
GitLab


From 72f6b4d93059086c453d344103c3bfe308a4e90d Mon Sep 17 00:00:00 2001
From: Mark Daoust <markdaoust@google.com>
Date: Tue, 5 Jun 2018 09:18:14 -0700
Subject: [PATCH 305/610] Delete "RuntimeWarning" it is not having the intended
 effect.

These `RuntimeWarning` are being interpreted as arguments to the string formatting, raising "TypeError: not all arguments converted during string formatting" errors.

PiperOrigin-RevId: 199307228
---
 tensorflow/python/keras/callbacks.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/tensorflow/python/keras/callbacks.py b/tensorflow/python/keras/callbacks.py
index 36782728e8..8061d47295 100644
--- a/tensorflow/python/keras/callbacks.py
+++ b/tensorflow/python/keras/callbacks.py
@@ -424,7 +424,7 @@ class ModelCheckpoint(Callback):
 
     if mode not in ['auto', 'min', 'max']:
       logging.warning('ModelCheckpoint mode %s is unknown, '
-                      'fallback to auto mode.', (mode), RuntimeWarning)
+                      'fallback to auto mode.', mode)
       mode = 'auto'
 
     if mode == 'min':
@@ -451,7 +451,7 @@ class ModelCheckpoint(Callback):
         current = logs.get(self.monitor)
         if current is None:
           logging.warning('Can save best model only with %s available, '
-                          'skipping.', self.monitor, RuntimeWarning)
+                          'skipping.', self.monitor)
         else:
           if self.monitor_op(current, self.best):
             if self.verbose > 0:
@@ -515,7 +515,7 @@ class EarlyStopping(Callback):
 
     if mode not in ['auto', 'min', 'max']:
       logging.warning('EarlyStopping mode %s is unknown, '
-                      'fallback to auto mode.', mode, RuntimeWarning)
+                      'fallback to auto mode.', mode)
       mode = 'auto'
 
     if mode == 'min':
@@ -544,7 +544,7 @@ class EarlyStopping(Callback):
     if current is None:
       logging.warning('Early stopping conditioned on metric `%s` '
                       'which is not available. Available metrics are: %s',
-                      self.monitor, ','.join(list(logs.keys())), RuntimeWarning)
+                      self.monitor, ','.join(list(logs.keys())))
       return
     if self.monitor_op(current - self.min_delta, self.best):
       self.best = current
@@ -898,7 +898,7 @@ class ReduceLROnPlateau(Callback):
     """
     if self.mode not in ['auto', 'min', 'max']:
       logging.warning('Learning Rate Plateau Reducing mode %s is unknown, '
-                      'fallback to auto mode.', self.mode, RuntimeWarning)
+                      'fallback to auto mode.', self.mode)
       self.mode = 'auto'
     if (self.mode == 'min' or
         (self.mode == 'auto' and 'acc' not in self.monitor)):
@@ -920,7 +920,7 @@ class ReduceLROnPlateau(Callback):
     if current is None:
       logging.warning('Reduce LR on plateau conditioned on metric `%s` '
                       'which is not available. Available metrics are: %s',
-                      self.monitor, ','.join(list(logs.keys())), RuntimeWarning)
+                      self.monitor, ','.join(list(logs.keys())))
 
     else:
       if self.in_cooldown():
-- 
GitLab


From 16a4b1e09f45eb329bdfc9811a3ea84571c6380e Mon Sep 17 00:00:00 2001
From: Gunhan Gulsoy <gunan@google.com>
Date: Tue, 5 Jun 2018 09:25:57 -0700
Subject: [PATCH 306/610] Automated g4 rollback of changelist 199244092

PiperOrigin-RevId: 199308328
---
 .../xla/service/algebraic_simplifier_test.cc  | 47 ++++++++++---------
 .../xla/tests/hlo_verified_test_base.cc       | 20 +++-----
 .../xla/tests/hlo_verified_test_base.h        | 16 +------
 3 files changed, 32 insertions(+), 51 deletions(-)

diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
index 27eb48181e..cda157f9fa 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
@@ -1714,7 +1714,7 @@ TEST_F(AlgebraicSimplifierTest, RemoveNoopPad) {
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(), param);
 }
@@ -1759,7 +1759,7 @@ TEST_F(AlgebraicSimplifierTest, NegativePadding) {
   EXPECT_THAT(computation->root_instruction(), op::Pad(param, zero));
   EXPECT_TRUE(has_negative_padding(pad));
 
-  ASSERT_TRUE(simplifier.Run(module).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(), op::Slice(op::Pad(param, zero)));
   EXPECT_FALSE(
@@ -1781,7 +1781,7 @@ TEST_F(AlgebraicSimplifierTest, RemoveNoopReshape) {
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(), param);
 }
@@ -1804,7 +1804,7 @@ TEST_F(AlgebraicSimplifierTest, RemoveNoopSlice) {
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(), param);
 }
@@ -1932,8 +1932,7 @@ TEST_F(AlgebraicSimplifierTest, ConvertConvToMatmul) {
     b.AddInstruction(HloInstruction::CreateConvolve(out_shape, input, filter,
                                                     window, dnums));
 
-    // TODO(b/80488902): verify this module.
-    auto module = HloTestBase::CreateNewModule();
+    auto module = CreateNewModule();
     auto* computation = module->AddEntryComputation(b.Build());
 
     AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/true,
@@ -2061,7 +2060,7 @@ TEST_F(AlgebraicSimplifierTest, MaxMinToClamp) {
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Clamp(max_value, param0, min_value));
@@ -2091,7 +2090,7 @@ TEST_F(AlgebraicSimplifierTest, MinMaxToClamp) {
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Clamp(max_value, param0, min_value));
@@ -2122,7 +2121,7 @@ TEST_F(AlgebraicSimplifierTest, MinMaxWithBroadcastToClamp) {
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Clamp(max_value, param0, min_value));
@@ -2152,7 +2151,7 @@ TEST_F(AlgebraicSimplifierTest, MinMaxNotToClamp) {
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  EXPECT_FALSE(simplifier.Run(module).ValueOrDie());
+  EXPECT_FALSE(simplifier.Run(module.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Minimum(op::Maximum(param0, max_value), min_value));
@@ -2185,7 +2184,7 @@ TEST_F(AlgebraicSimplifierTest, MinEquationWithMaxNotToClamp) {
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  EXPECT_FALSE(simplifier.Run(module).ValueOrDie());
+  EXPECT_FALSE(simplifier.Run(module.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Minimum(op::Add(op::Maximum(param0, max_value), max_value),
@@ -2201,8 +2200,10 @@ TEST_F(AlgebraicSimplifierTest, ScalarBroadcastToSlice) {
       HloInstruction::CreateParameter(0, r0f32, "scalar_param"));
 
   Shape broadcast_shape = ShapeUtil::MakeShape(F32, {4, 5, 6, 7});
-  HloInstruction* broadcast = builder.AddInstruction(
-      HloInstruction::CreateBroadcast(broadcast_shape, scalar_param, {}));
+  HloInstruction* broadcast =
+      builder.AddInstruction(HloInstruction::CreateBroadcast(
+          broadcast_shape, scalar_param,
+          AsInt64Slice(broadcast_shape.dimensions())));
 
   Shape slice_shape = ShapeUtil::MakeShape(F32, {2, 2, 3, 3});
   HloInstruction* slice = builder.AddInstruction(HloInstruction::CreateSlice(
@@ -2218,10 +2219,10 @@ TEST_F(AlgebraicSimplifierTest, ScalarBroadcastToSlice) {
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
 
-  ASSERT_TRUE(simplifier.Run(module).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
 
   // Running simplification again should not result in any further changes.
-  ASSERT_FALSE(simplifier.Run(module).ValueOrDie());
+  ASSERT_FALSE(simplifier.Run(module.get()).ValueOrDie());
 
   root = computation->root_instruction();
   EXPECT_THAT(root, op::Broadcast(scalar_param));
@@ -2236,8 +2237,10 @@ TEST_F(AlgebraicSimplifierTest, ScalarBroadcastToTransposeReshape) {
       HloInstruction::CreateConstant(Literal::CreateR0<float>(42.0f)));
 
   Shape broadcast_shape = ShapeUtil::MakeShape(F32, {4, 5, 6});
-  HloInstruction* broadcast = builder.AddInstruction(
-      HloInstruction::CreateBroadcast(broadcast_shape, forty_two, {}));
+  HloInstruction* broadcast =
+      builder.AddInstruction(HloInstruction::CreateBroadcast(
+          broadcast_shape, forty_two,
+          AsInt64Slice(broadcast_shape.dimensions())));
 
   HloInstruction* transpose =
       builder.AddInstruction(HloInstruction::CreateTranspose(
@@ -2256,7 +2259,7 @@ TEST_F(AlgebraicSimplifierTest, ScalarBroadcastToTransposeReshape) {
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
 
   root = computation->root_instruction();
   EXPECT_THAT(root, op::Broadcast(forty_two));
@@ -2265,8 +2268,7 @@ TEST_F(AlgebraicSimplifierTest, ScalarBroadcastToTransposeReshape) {
 
 // Test that ReduceWindow(Pad(op, x), y) can simplify to ReduceWindow(op, x).
 TEST_F(AlgebraicSimplifierTest, FoldPadIntoReduceWindow) {
-  // TODO(b/80488902): verify this module.
-  auto module = HloTestBase::CreateNewModule();
+  auto module = CreateNewModule();
   HloComputation::Builder builder(TestName());
 
   // Create operand to the pad.
@@ -2347,8 +2349,7 @@ TEST_F(AlgebraicSimplifierTest, FoldPadIntoReduceWindow) {
 // Test that ReduceWindow(Convert(Pad(op, x)), y) can simplify to
 // ReduceWindow(Convert(op), x).
 TEST_F(AlgebraicSimplifierTest, FoldConvertedPadIntoReduceWindow) {
-  // TODO(b/80488902): verify this module.
-  auto module = HloTestBase::CreateNewModule();
+  auto module = CreateNewModule();
   HloComputation::Builder builder(TestName());
 
   // Create operand to the pad.
@@ -2443,7 +2444,7 @@ TEST_F(AlgebraicSimplifierTest, ReversalOfTrivialDimensionsToBitcast) {
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
 
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(a, root);
diff --git a/tensorflow/compiler/xla/tests/hlo_verified_test_base.cc b/tensorflow/compiler/xla/tests/hlo_verified_test_base.cc
index 22c664d142..c8a05c2e9e 100644
--- a/tensorflow/compiler/xla/tests/hlo_verified_test_base.cc
+++ b/tensorflow/compiler/xla/tests/hlo_verified_test_base.cc
@@ -41,17 +41,14 @@ void HloVerifiedTestBase::TearDown() {
       << "TearDown called more than once; it should be called exactly once.";
   tear_down_called_ = true;
   if (module_) {
-    VerifyModule(module_.get());
-  }
-  for (int i = 0; i < modules_.size(); ++i) {
-    VerifyModule(modules_.at(i).get());
+    VerifyModule();
   }
   HloTestBase::TearDown();
 }
 
-void HloVerifiedTestBase::VerifyModule(HloModule* module) {
-  HloVerifier verifier(/*allow_mixed_precision=*/true);
-  xla::StatusOr<bool> mutated = verifier.Run(module);
+void HloVerifiedTestBase::VerifyModule() {
+  HloVerifier verifier;
+  xla::StatusOr<bool> mutated = verifier.Run(module_.get());
   if (!mutated.ok()) {
     ADD_FAILURE() << "HloVerifier failed: " << mutated.status();
   } else {
@@ -62,20 +59,15 @@ void HloVerifiedTestBase::VerifyModule(HloModule* module) {
 
 HloModule& HloVerifiedTestBase::module() {
   if (!module_) {
-    module_ = HloTestBase::CreateNewModule();
+    module_ = CreateNewModule();
   }
   return *module_;
 }
 
-HloModule* HloVerifiedTestBase::CreateNewModule(const string& name) {
-  modules_.emplace_back(HloTestBase::CreateNewModule());
-  return modules_.back().get();
-}
-
 void HloVerifiedTestBase::ParseAndVerifyModule(
     tensorflow::StringPiece hlo_text) {
   CHECK(!module_) << "Called ParseModule when test already has a module.";
   TF_ASSERT_OK_AND_ASSIGN(module_, ParseHloString(hlo_text));
-  VerifyModule(module_.get());
+  VerifyModule();
 }
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/hlo_verified_test_base.h b/tensorflow/compiler/xla/tests/hlo_verified_test_base.h
index 5b59cc77f6..e5bb14a883 100644
--- a/tensorflow/compiler/xla/tests/hlo_verified_test_base.h
+++ b/tensorflow/compiler/xla/tests/hlo_verified_test_base.h
@@ -52,23 +52,11 @@ class HloVerifiedTestBase : public HloTestBase {
     shape_verifier_ = std::move(shape_verifier);
   }
 
-  // Creates a new module for a test, and stores it in modules_ so it can be
-  // verified. Intentionally hides HloTestBase::CreateNewModule, to prevent
-  // creation of unverified modules.
-  HloModule* CreateNewModule(const string& name = TestName());
-
-  // It is confusing to store modules created by module() and CreateNewModule()
-  // in different fields, but it allows us to migrate tests to
-  // HloVerifiedTestBase more easily, so it's a win because we can verify more
-  // modules. See b/80488902.
  private:
-  // Lazily populated. Access via module().
-  std::unique_ptr<HloModule> module_;
-  // Populated by calls to CreateNewModule.
-  std::vector<std::unique_ptr<HloModule>> modules_;
+  std::unique_ptr<HloModule> module_;  // Lazily populated. Access via module().
   std::unique_ptr<ShapeVerifier> shape_verifier_;
   bool tear_down_called_ = false;
-  static void VerifyModule(HloModule* module);
+  void VerifyModule();
 };
 
 }  // namespace xla
-- 
GitLab


From ad1fc6b020e08c7a1092bfb85a175a3c5ddf4405 Mon Sep 17 00:00:00 2001
From: Christopher Suter <cgs@google.com>
Date: Tue, 5 Jun 2018 09:26:45 -0700
Subject: [PATCH 307/610] Eliminate nested try/catch's in
 Distribution._call_prob and friends. These nested try/catches have the
 unintended effect of hiding any downstream NotImplementedErrors and replacing
 them with an earlier exception.

PiperOrigin-RevId: 199308457
---
 .../python/ops/distributions/distribution.py  | 61 ++++++-------------
 1 file changed, 17 insertions(+), 44 deletions(-)

diff --git a/tensorflow/python/ops/distributions/distribution.py b/tensorflow/python/ops/distributions/distribution.py
index 0db4749507..41dcd40188 100644
--- a/tensorflow/python/ops/distributions/distribution.py
+++ b/tensorflow/python/ops/distributions/distribution.py
@@ -722,11 +722,8 @@ class Distribution(_BaseDistribution):
       value = ops.convert_to_tensor(value, name="value")
       try:
         return self._log_prob(value, **kwargs)
-      except NotImplementedError as original_exception:
-        try:
-          return math_ops.log(self._prob(value, **kwargs))
-        except NotImplementedError:
-          raise original_exception
+      except NotImplementedError:
+        return math_ops.log(self._prob(value, **kwargs))
 
   def log_prob(self, value, name="log_prob"):
     """Log probability density/mass function.
@@ -749,11 +746,8 @@ class Distribution(_BaseDistribution):
       value = ops.convert_to_tensor(value, name="value")
       try:
         return self._prob(value, **kwargs)
-      except NotImplementedError as original_exception:
-        try:
-          return math_ops.exp(self._log_prob(value, **kwargs))
-        except NotImplementedError:
-          raise original_exception
+      except NotImplementedError:
+        return math_ops.exp(self._log_prob(value, **kwargs))
 
   def prob(self, value, name="prob"):
     """Probability density/mass function.
@@ -776,11 +770,8 @@ class Distribution(_BaseDistribution):
       value = ops.convert_to_tensor(value, name="value")
       try:
         return self._log_cdf(value, **kwargs)
-      except NotImplementedError as original_exception:
-        try:
-          return math_ops.log(self._cdf(value, **kwargs))
-        except NotImplementedError:
-          raise original_exception
+      except NotImplementedError:
+        return math_ops.log(self._cdf(value, **kwargs))
 
   def log_cdf(self, value, name="log_cdf"):
     """Log cumulative distribution function.
@@ -813,11 +804,8 @@ class Distribution(_BaseDistribution):
       value = ops.convert_to_tensor(value, name="value")
       try:
         return self._cdf(value, **kwargs)
-      except NotImplementedError as original_exception:
-        try:
-          return math_ops.exp(self._log_cdf(value, **kwargs))
-        except NotImplementedError:
-          raise original_exception
+      except NotImplementedError:
+        return math_ops.exp(self._log_cdf(value, **kwargs))
 
   def cdf(self, value, name="cdf"):
     """Cumulative distribution function.
@@ -846,11 +834,8 @@ class Distribution(_BaseDistribution):
       value = ops.convert_to_tensor(value, name="value")
       try:
         return self._log_survival_function(value, **kwargs)
-      except NotImplementedError as original_exception:
-        try:
-          return math_ops.log1p(-self.cdf(value, **kwargs))
-        except NotImplementedError:
-          raise original_exception
+      except NotImplementedError:
+        return math_ops.log1p(-self.cdf(value, **kwargs))
 
   def log_survival_function(self, value, name="log_survival_function"):
     """Log survival function.
@@ -884,11 +869,8 @@ class Distribution(_BaseDistribution):
       value = ops.convert_to_tensor(value, name="value")
       try:
         return self._survival_function(value, **kwargs)
-      except NotImplementedError as original_exception:
-        try:
-          return 1. - self.cdf(value, **kwargs)
-        except NotImplementedError:
-          raise original_exception
+      except NotImplementedError:
+        return 1. - self.cdf(value, **kwargs)
 
   def survival_function(self, value, name="survival_function"):
     """Survival function.
@@ -933,10 +915,7 @@ class Distribution(_BaseDistribution):
   def _call_quantile(self, value, name, **kwargs):
     with self._name_scope(name, values=[value]):
       value = ops.convert_to_tensor(value, name="value")
-      try:
-        return self._quantile(value, **kwargs)
-      except NotImplementedError as original_exception:
-        raise original_exception
+      return self._quantile(value, **kwargs)
 
   def quantile(self, value, name="quantile"):
     """Quantile function. Aka "inverse cdf" or "percent point function".
@@ -982,11 +961,8 @@ class Distribution(_BaseDistribution):
     with self._name_scope(name):
       try:
         return self._variance()
-      except NotImplementedError as original_exception:
-        try:
-          return math_ops.square(self._stddev())
-        except NotImplementedError:
-          raise original_exception
+      except NotImplementedError:
+        return math_ops.square(self._stddev())
 
   def _stddev(self):
     raise NotImplementedError("stddev is not implemented")
@@ -1014,11 +990,8 @@ class Distribution(_BaseDistribution):
     with self._name_scope(name):
       try:
         return self._stddev()
-      except NotImplementedError as original_exception:
-        try:
-          return math_ops.sqrt(self._variance())
-        except NotImplementedError:
-          raise original_exception
+      except NotImplementedError:
+        return math_ops.sqrt(self._variance())
 
   def _covariance(self):
     raise NotImplementedError("covariance is not implemented")
-- 
GitLab


From b8b93f363bbefb02e5a79757f1271e0086468261 Mon Sep 17 00:00:00 2001
From: Mark Daoust <markdaoust@google.com>
Date: Tue, 5 Jun 2018 09:38:46 -0700
Subject: [PATCH 308/610] Edit error message to make it clear which yaml module
 you need.

PiperOrigin-RevId: 199310214
---
 tensorflow/python/keras/engine/network.py | 3 ++-
 tensorflow/python/keras/engine/saving.py  | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/keras/engine/network.py b/tensorflow/python/keras/engine/network.py
index d43aba6875..c096669a5f 100644
--- a/tensorflow/python/keras/engine/network.py
+++ b/tensorflow/python/keras/engine/network.py
@@ -1457,7 +1457,8 @@ class Network(base_layer.Layer):
         ImportError: if yaml module is not found.
     """
     if yaml is None:
-      raise ImportError('Requires yaml module installed.')
+      raise ImportError(
+          'Requires yaml module installed (`pip install pyyaml`).')
     return yaml.dump(self._updated_config(), **kwargs)
 
   def summary(self, line_length=None, positions=None, print_fn=None):
diff --git a/tensorflow/python/keras/engine/saving.py b/tensorflow/python/keras/engine/saving.py
index 99ce64a469..40b693efde 100644
--- a/tensorflow/python/keras/engine/saving.py
+++ b/tensorflow/python/keras/engine/saving.py
@@ -323,7 +323,7 @@ def model_from_yaml(yaml_string, custom_objects=None):
       ImportError: if yaml module is not found.
   """
   if yaml is None:
-    raise ImportError('Requires yaml module installed.')
+    raise ImportError('Requires yaml module installed (`pip install pyyaml`).')
   config = yaml.load(yaml_string)
   from tensorflow.python.keras.layers import deserialize  # pylint: disable=g-import-not-at-top
   return deserialize(config, custom_objects=custom_objects)
-- 
GitLab


From 8c9afdf9c6c2e8139e2a0526bc41d5220be3b164 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 5 Jun 2018 09:45:40 -0700
Subject: [PATCH 309/610] Fix docstring formatting.

PiperOrigin-RevId: 199311231
---
 tensorflow/python/estimator/training.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/tensorflow/python/estimator/training.py b/tensorflow/python/estimator/training.py
index 522662cd32..fb6a68b4f7 100644
--- a/tensorflow/python/estimator/training.py
+++ b/tensorflow/python/estimator/training.py
@@ -295,6 +295,7 @@ def train_and_evaluate(estimator, train_spec, eval_spec):
   model will be trained with three epochs of training data instead of one epoch.
 
   Example of local (non-distributed) training:
+
   ```python
   # Set up feature columns.
   categorial_feature_a = categorial_column_with_hash_bucket(...)
@@ -339,12 +340,14 @@ def train_and_evaluate(estimator, train_spec, eval_spec):
 
   Setting environment variable depends on the platform. For example, on Linux,
   it can be done as follows (`$` is the shell prompt):
+
   ```
   $ TF_CONFIG='<replace_with_real_content>' python train_model.py
   ```
 
   For the content in `TF_CONFIG`, assume that the training cluster spec looks
   like:
+
   ```
   cluster = {"chief": ["host0:2222"],
              "worker": ["host1:2222", "host2:2222", "host3:2222"],
@@ -352,6 +355,7 @@ def train_and_evaluate(estimator, train_spec, eval_spec):
   ```
 
   Example of `TF_CONFIG` for chief training worker (must have one and only one):
+
   ```
   # This should be a JSON string, which is set as environment variable. Usually
   # the cluster manager handles that.
@@ -371,6 +375,7 @@ def train_and_evaluate(estimator, train_spec, eval_spec):
 
   Example of `TF_CONFIG` for non-chief training worker (optional, could be
   multiple):
+
   ```
   # This should be a JSON string, which is set as environment variable. Usually
   # the cluster manager handles that.
@@ -387,6 +392,7 @@ def train_and_evaluate(estimator, train_spec, eval_spec):
   for non-chief training workers.
 
   Example of `TF_CONFIG` for parameter server, aka ps (could be multiple):
+
   ```
   # This should be a JSON string, which is set as environment variable. Usually
   # the cluster manager handles that.
@@ -405,6 +411,7 @@ def train_and_evaluate(estimator, train_spec, eval_spec):
   Example of `TF_CONFIG` for evaluator task. Evaluator is a special task that is
   not part of the training cluster. There could be only one. It is used for
   model evaluation.
+
   ```
   # This should be a JSON string, which is set as environment variable. Usually
   # the cluster manager handles that.
-- 
GitLab


From c8090fa6acac1f9724671407964662137911921f Mon Sep 17 00:00:00 2001
From: Shashi Shekhar <shashishekhar@google.com>
Date: Tue, 5 Jun 2018 10:19:49 -0700
Subject: [PATCH 310/610] Internal change.

PiperOrigin-RevId: 199316885
---
 .../lite/tools/benchmark/command_line_flags.cc      |  2 +-
 .../lite/tools/benchmark/command_line_flags_test.cc | 13 +++++++++++++
 tensorflow/core/BUILD                               |  2 ++
 3 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/lite/tools/benchmark/command_line_flags.cc b/tensorflow/contrib/lite/tools/benchmark/command_line_flags.cc
index 723bf67e03..8195fc44be 100644
--- a/tensorflow/contrib/lite/tools/benchmark/command_line_flags.cc
+++ b/tensorflow/contrib/lite/tools/benchmark/command_line_flags.cc
@@ -35,7 +35,7 @@ bool ParseFlag(const std::string& arg, const std::string& flag,
   if (arg.find(flag_prefix) != 0) {
     return false;
   }
-  bool has_value = (arg.size() >= flag_prefix.size() + 1);
+  bool has_value = arg.size() >= flag_prefix.size();
   *value_parsing_ok = has_value;
   if (has_value) {
     *value_parsing_ok = parse_func(arg.substr(flag_prefix.size()));
diff --git a/tensorflow/contrib/lite/tools/benchmark/command_line_flags_test.cc b/tensorflow/contrib/lite/tools/benchmark/command_line_flags_test.cc
index 74cf59105b..9a931d5ddd 100644
--- a/tensorflow/contrib/lite/tools/benchmark/command_line_flags_test.cc
+++ b/tensorflow/contrib/lite/tools/benchmark/command_line_flags_test.cc
@@ -53,6 +53,19 @@ TEST(CommandLineFlagsTest, BasicUsage) {
   EXPECT_EQ(argc, 1);
 }
 
+TEST(CommandLineFlagsTest, EmptyStringFlag) {
+  int argc = 2;
+  std::string some_string = "invalid";
+  const char* argv_strings[] = {"program_name", "--some_string="};
+  bool parsed_ok =
+      Flags::Parse(&argc, reinterpret_cast<const char**>(argv_strings),
+                   {Flag("some_string", &some_string, "some string")});
+
+  EXPECT_EQ(true, parsed_ok);
+  EXPECT_EQ(some_string, "");
+  EXPECT_EQ(argc, 1);
+}
+
 TEST(CommandLineFlagsTest, BadIntValue) {
   int some_int = 10;
   int argc = 2;
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 6bde2a0a4a..f5cc6ef2a1 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -1439,6 +1439,7 @@ filegroup(
             "lib/png/**/*",
             "lib/gif/**/*",
             "util/events_writer.*",
+            "util/stats_calculator.*",
             "util/reporter.*",
             "platform/**/cuda_libdevice_path.*",
             "platform/default/test_benchmark.*",
@@ -1522,6 +1523,7 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         ":protos_all_cc_impl",
+        ":stats_calculator_portable",
         "//third_party/eigen3",
         "@double_conversion//:double-conversion",
         "@nsync//:nsync_cpp",
-- 
GitLab


From 13b3439fffad7057755dc88802064cbe4eec7bfa Mon Sep 17 00:00:00 2001
From: Amit Patankar <amitpatankar@google.com>
Date: Tue, 5 Jun 2018 10:28:38 -0700
Subject: [PATCH 311/610] Change order of installations.

---
 tensorflow/tools/ci_build/install/install_pip_packages.sh  | 7 ++++---
 .../ci_build/install/install_python3.5_pip_packages.sh     | 4 +++-
 .../ci_build/install/install_python3.6_pip_packages.sh     | 4 +++-
 3 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/tensorflow/tools/ci_build/install/install_pip_packages.sh b/tensorflow/tools/ci_build/install/install_pip_packages.sh
index bd6c50bce9..dba2dfc490 100755
--- a/tensorflow/tools/ci_build/install/install_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_pip_packages.sh
@@ -21,9 +21,6 @@ set -e
 easy_install -U pip==9.0.3
 easy_install3 -U pip==9.0.3
 
-pip2 install --upgrade setuptools==39.1.0
-pip3 install --upgrade setuptools==39.1.0
-
 # Install pip packages from whl files to avoid the time-consuming process of
 # building from source.
 
@@ -57,6 +54,10 @@ pip3 install --upgrade markdown==2.6.8
 pip2 install --upgrade protobuf==3.3.0
 pip3 install --upgrade protobuf==3.3.0
 
+# Install last working version of setuptools.
+pip2 install --upgrade setuptools==39.1.0
+pip3 install --upgrade setuptools==39.1.0
+
 # Remove obsolete version of six, which can sometimes confuse virtualenv.
 rm -rf /usr/lib/python3/dist-packages/six*
 
diff --git a/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
index 0844c48980..e1978cd7d8 100755
--- a/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
@@ -39,7 +39,6 @@ if [[ -z $pip35_version ]]; then
 fi
 
 set -e
-pip3.5 install --upgrade setuptools==39.1.0
 pip3.5 install --upgrade pip
 
 pip3.5 install --upgrade virtualenv
@@ -51,6 +50,9 @@ pip3.5 install --upgrade six==1.10.0
 # Install protobuf.
 pip3.5 install --upgrade protobuf==3.3.0
 
+# Install last working version of setuptools.
+pip3.5 install --upgrade setuptools==39.1.0
+
 # Remove obsolete version of six, which can sometimes confuse virtualenv.
 rm -rf /usr/lib/python3/dist-packages/six*
 
diff --git a/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh b/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh
index fb183b0e4f..0ffb8e67a4 100755
--- a/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh
@@ -49,7 +49,6 @@ cd Python-3.6.1
 make altinstall
 ln -s /usr/local/bin/pip3.6 /usr/local/bin/pip3
 
-pip3 install --upgrade setuptools==39.1.0
 pip3 install --upgrade pip
 
 pip3 install --upgrade virtualenv
@@ -63,6 +62,9 @@ pip3 install --upgrade six==1.10.0
 # Install protobuf.
 pip3 install --upgrade protobuf==3.3.0
 
+# Install last working version of setuptools.
+pip3 install --upgrade setuptools==39.1.0
+
 # Remove obsolete version of six, which can sometimes confuse virtualenv.
 rm -rf /usr/lib/python3/dist-packages/six*
 
-- 
GitLab


From 23825b76e508ac3c110d295b63e4e07f2cebbcf8 Mon Sep 17 00:00:00 2001
From: Amit Patankar <amitpatankar@google.com>
Date: Tue, 5 Jun 2018 10:31:47 -0700
Subject: [PATCH 312/610] Making setuptools the last install to ensure it's
 accurate.

---
 tensorflow/tools/ci_build/install/install_pip_packages.sh | 8 ++++----
 .../ci_build/install/install_python3.5_pip_packages.sh    | 6 +++---
 .../ci_build/install/install_python3.6_pip_packages.sh    | 6 +++---
 3 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/tensorflow/tools/ci_build/install/install_pip_packages.sh b/tensorflow/tools/ci_build/install/install_pip_packages.sh
index dba2dfc490..b3d3f23ec8 100755
--- a/tensorflow/tools/ci_build/install/install_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_pip_packages.sh
@@ -54,10 +54,6 @@ pip3 install --upgrade markdown==2.6.8
 pip2 install --upgrade protobuf==3.3.0
 pip3 install --upgrade protobuf==3.3.0
 
-# Install last working version of setuptools.
-pip2 install --upgrade setuptools==39.1.0
-pip3 install --upgrade setuptools==39.1.0
-
 # Remove obsolete version of six, which can sometimes confuse virtualenv.
 rm -rf /usr/lib/python3/dist-packages/six*
 
@@ -113,3 +109,7 @@ pip2 install --upgrade gast
 pip3 install --upgrade gast
 pip2 install --upgrade termcolor
 pip3 install --upgrade termcolor
+
+# Install last working version of setuptools.
+pip2 install --upgrade setuptools==39.1.0
+pip3 install --upgrade setuptools==39.1.0
diff --git a/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
index e1978cd7d8..61d34c7304 100755
--- a/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
@@ -50,9 +50,6 @@ pip3.5 install --upgrade six==1.10.0
 # Install protobuf.
 pip3.5 install --upgrade protobuf==3.3.0
 
-# Install last working version of setuptools.
-pip3.5 install --upgrade setuptools==39.1.0
-
 # Remove obsolete version of six, which can sometimes confuse virtualenv.
 rm -rf /usr/lib/python3/dist-packages/six*
 
@@ -84,4 +81,7 @@ pip3.5 install --upgrade astor
 pip3.5 install --upgrade gast
 pip3.5 install --upgrade termcolor
 
+# Install last working version of setuptools.
+pip3.5 install --upgrade setuptools==39.1.0
+
 # LINT.ThenChange(//tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh)
diff --git a/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh b/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh
index 0ffb8e67a4..fe2d2cf11c 100755
--- a/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh
@@ -62,9 +62,6 @@ pip3 install --upgrade six==1.10.0
 # Install protobuf.
 pip3 install --upgrade protobuf==3.3.0
 
-# Install last working version of setuptools.
-pip3 install --upgrade setuptools==39.1.0
-
 # Remove obsolete version of six, which can sometimes confuse virtualenv.
 rm -rf /usr/lib/python3/dist-packages/six*
 
@@ -100,4 +97,7 @@ pip3 install --upgrade astor
 pip3 install --upgrade gast
 pip3 install --upgrade termcolor
 
+# Install last working version of setuptools.
+pip3 install --upgrade setuptools==39.1.0
+
 # LINT.ThenChange(//tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh)
-- 
GitLab


From a7c026e08864417b35dbe3c9e4b246725ad6ba59 Mon Sep 17 00:00:00 2001
From: Anjali Sridhar <anjalisridhar@google.com>
Date: Tue, 5 Jun 2018 10:36:12 -0700
Subject: [PATCH 313/610] Respect name scopes opened in tower mode when
 creating vars in cross tower mode.

PiperOrigin-RevId: 199319758
---
 .../distribute/python/mirrored_strategy.py    | 35 +++++++---
 .../python/mirrored_strategy_multigpu_test.py | 68 +++++++++++++++++++
 2 files changed, 93 insertions(+), 10 deletions(-)

diff --git a/tensorflow/contrib/distribute/python/mirrored_strategy.py b/tensorflow/contrib/distribute/python/mirrored_strategy.py
index 6eadba976b..cef0a2907b 100644
--- a/tensorflow/contrib/distribute/python/mirrored_strategy.py
+++ b/tensorflow/contrib/distribute/python/mirrored_strategy.py
@@ -118,7 +118,10 @@ class MirroredStrategy(distribute_lib.DistributionStrategy):
           if i > 0:
             # Give replicas meaningful distinct names:
             var0name = index[devices[0]].name.split(":")[0]
-            kwargs["name"] = "%s/replica_%d" % (var0name, i)
+            # We append a / to variable names created on towers with id > 0 to
+            # ensure that we ignore the name scope and instead use the given
+            # name as the absolute name of the variable.
+            kwargs["name"] = "%s/replica_%d/" % (var0name, i)
             # Initialize replicas with the same value:
             if context.executing_eagerly():
               kwargs["initial_value"] = array_ops.identity(
@@ -258,8 +261,15 @@ class MirroredStrategy(distribute_lib.DistributionStrategy):
                 {t.device: t.merge_args for t in threads})
             merge_kwargs = values.regroup(
                 {t.device: t.merge_kwargs for t in threads})
-            merge_result = threads[0].merge_fn(
-                self, *merge_args, **merge_kwargs)
+            # We capture the name_scope of the MTT when we call merge_fn
+            # to ensure that if we have opened a name scope in the MTT,
+            # it will be respected when executing the merge function. We only
+            # capture the name_scope from the first MTT and assume it is
+            # the same for all other MTTs.
+            mtt_captured_name_scope = threads[0].captured_name_scope
+            with ops.name_scope(mtt_captured_name_scope):
+              merge_result = threads[0].merge_fn(
+                  self, *merge_args, **merge_kwargs)
             for t in threads:
               t.merge_result = values.select_device(t.device, merge_result)
     finally:
@@ -428,6 +438,7 @@ class MirroredStrategy(distribute_lib.DistributionStrategy):
       self.merge_args = None
       self.merge_kwargs = None
       self.merge_result = None
+      self.captured_name_scope = None
       # We use a thread.Event for the main thread to signal when this
       # thread should start running (`should_run`), and another for
       # this thread to transfer control back to the main thread
@@ -451,13 +462,13 @@ class MirroredStrategy(distribute_lib.DistributionStrategy):
       self._variable_creator_stack = self.graph._variable_creator_stack[:]
       self._captured_var_scope = variable_scope.get_variable_scope()
       # Adding a "/" at end lets us re-enter this scope later.
-      self._captured_name_scope = self.graph.get_name_scope()
-      if self._captured_name_scope:
-        self._captured_name_scope += "/"
+      self._name_scope = self.graph.get_name_scope()
+      if self._name_scope:
+        self._name_scope += "/"
       if self.tower_id > 0:
-        if not self._captured_name_scope:
-          self._captured_name_scope = ""
-        self._captured_name_scope += "tower_%d/" % self.tower_id
+        if not self._name_scope:
+          self._name_scope = ""
+        self._name_scope += "tower_%d/" % self.tower_id
 
     def run(self):
       # pylint: disable=protected-access
@@ -473,7 +484,7 @@ class MirroredStrategy(distribute_lib.DistributionStrategy):
             _enter_graph(self.graph), \
             MirroredTowerContext(self.distribution, self.tower_id), \
             ops.device(self.device), \
-            ops.name_scope(self._captured_name_scope), \
+            ops.name_scope(self._name_scope), \
             variable_scope.variable_scope(
                 self._captured_var_scope, reuse=self.tower_id > 0), \
             variable_scope.variable_creator_scope(self.variable_creator_fn):
@@ -499,6 +510,10 @@ class MirroredTowerContext(distribute_lib.TowerContext):
     t.merge_fn = fn
     t.merge_args = args
     t.merge_kwargs = kwargs
+    t.captured_name_scope = t.graph.get_name_scope()
+    # Adding a "/" at end lets us re-enter this scope later.
+    if t.captured_name_scope:
+      t.captured_name_scope += "/"
     t.has_paused.set()
     t.should_run.wait()
     t.should_run.clear()
diff --git a/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py b/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py
index 3f9a02b249..bccd278847 100644
--- a/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py
+++ b/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py
@@ -438,6 +438,74 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
         self.assertEquals("foo/" + name + ":0", v0.name)
         self.assertEquals("tower_1/foo/" + name + ":0", v1.name)
 
+  # variable_scope.variable() respects name scopes when creating
+  # variables. On the other hand variable_scope.get_variable() ignores name
+  # scopes when creating variables. We test both methods of creating variables
+  # to make sure that we have the same variable names in both cases.
+  def testNameScopeWithVariable(self):
+    def in_cross_tower(_):
+      c = variable_scope.variable(1.0, name="c")
+      return c
+
+    def model_fn():
+      b = variable_scope.variable(1.0, name="b")
+      with ops.name_scope("foo"):
+        c = distribute_lib.get_tower_context().merge_call(in_cross_tower)
+      return b, c
+
+    dist = mirrored_strategy.MirroredStrategy(
+        ["/device:GPU:0", "/device:CPU:0"])
+
+    with context.graph_mode(), dist.scope():
+      with ops.name_scope("main"):
+        a = variable_scope.variable(1.0, name="a")
+        result = dist.call_for_each_tower(model_fn, run_concurrently=False)
+      result_b = result[0]
+      result_c = result[1]
+      self.assertIsInstance(result_b, values.DistributedValues)
+      self.assertIsInstance(result_c, values.DistributedValues)
+      a0, a1 = dist.unwrap(a)
+      b0, b1 = dist.unwrap(result_b)
+      c0, c1 = dist.unwrap(result_c)
+      self.assertEquals("main/a:0", a0.name)
+      self.assertEquals("main/a/replica_1:0", a1.name)
+      self.assertEquals("main/b:0", b0.name)
+      self.assertEquals("main/b/replica_1:0", b1.name)
+      self.assertEquals("main/foo/c:0", c0.name)
+      self.assertEquals("main/foo/c/replica_1:0", c1.name)
+
+  def testNameScopeWithGetVariable(self):
+    def in_cross_tower(_):
+      c = variable_scope.get_variable("c", [1])
+      return c
+
+    def model_fn():
+      b = variable_scope.get_variable("b", [1])
+      with ops.name_scope("foo"):
+        c = distribute_lib.get_tower_context().merge_call(in_cross_tower)
+      return b, c
+
+    dist = mirrored_strategy.MirroredStrategy(
+        ["/device:GPU:0", "/device:CPU:0"])
+
+    with context.graph_mode(), dist.scope():
+      with ops.name_scope("main"):
+        a = variable_scope.get_variable("a", [1])
+        result = dist.call_for_each_tower(model_fn, run_concurrently=False)
+      result_b = result[0]
+      result_c = result[1]
+      self.assertIsInstance(result_b, values.DistributedValues)
+      self.assertIsInstance(result_c, values.DistributedValues)
+      a0, a1 = dist.unwrap(a)
+      b0, b1 = dist.unwrap(result_b)
+      c0, c1 = dist.unwrap(result_c)
+      self.assertEquals("a:0", a0.name)
+      self.assertEquals("a/replica_1:0", a1.name)
+      self.assertEquals("b:0", b0.name)
+      self.assertEquals("b/replica_1:0", b1.name)
+      self.assertEquals("c:0", c0.name)
+      self.assertEquals("c/replica_1:0", c1.name)
+
   def testDynamicRnnVariables(self):
     def model_fn():
       inputs = constant_op.constant(2 * [2 * [[0.0, 1.0, 2.0, 3.0, 4.0]]])
-- 
GitLab


From b2e56707ecbc6dc4b130a50424f5b85956f58720 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 5 Jun 2018 10:43:07 -0700
Subject: [PATCH 314/610] Do not enable tensor ops for cuDNN RNN unless
 explicitly specified.

PiperOrigin-RevId: 199321021
---
 tensorflow/stream_executor/cuda/cuda_dnn.cc | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc
index 55c1083a61..f6564df0d0 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.cc
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc
@@ -1031,7 +1031,15 @@ class CudnnRnnDescriptor : public dnn::RnnDescriptor {
                             rnn_mode, direction_mode, num_layers));
 
 #if CUDNN_VERSION >= 7000
-    if (RnnTensorOpMathEnabled()) {
+    // Require explicit algorithm config to enable tensor cores. Some configs
+    // return CUDNN_NOT_SUPPORTED when tensor ops are enabled (which is against
+    // the idiom that enabling tensor ops is only a hint: see nvbugs/2172799).
+    // We can only reasonably expect the user to handle the subsequent failure
+    // in profile mode, which is run with algorithms returned from
+    // GetRnnAlgorithms() (which are non-default and explicitly set whether to
+    // use tensor ops).
+    if (RnnTensorOpMathEnabled() &&
+        !algorithm_config.algorithm().is_default()) {
       cudnnMathType_t math_type =
           algorithm_config.algorithm().tensor_ops_enabled()
               ? CUDNN_TENSOR_OP_MATH
-- 
GitLab


From fdc085f021f98e7f4cba44e716f4f85cb9704447 Mon Sep 17 00:00:00 2001
From: Amit Patankar <amitpatankar@google.com>
Date: Tue, 5 Jun 2018 11:11:16 -0700
Subject: [PATCH 315/610] Fixing the adamax_test rtol to be more lenient.

---
 tensorflow/contrib/opt/python/training/adamax_test.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/opt/python/training/adamax_test.py b/tensorflow/contrib/opt/python/training/adamax_test.py
index 21bf3f5313..a059aae130 100644
--- a/tensorflow/contrib/opt/python/training/adamax_test.py
+++ b/tensorflow/contrib/opt/python/training/adamax_test.py
@@ -224,8 +224,8 @@ class AdaMaxOptimizerTest(test.TestCase):
           var1_np, m1, v1 = adamax_update_numpy(var1_np, grads1_np, t, m1, v1)
 
           # Validate updated params
-          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
-          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0), rtol=1e-2)
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1), rtol=1e-2)
           if use_resource:
             self.assertEqual("var0_%d/AdaMax:0" % (i,),
                              opt.get_slot(var=var0, name="m").name)
-- 
GitLab


From 938d46df199720784555af6dddc339f250b10008 Mon Sep 17 00:00:00 2001
From: Amit Patankar <amitpatankar@google.com>
Date: Tue, 5 Jun 2018 11:31:55 -0700
Subject: [PATCH 316/610] Fixing line too long.

---
 tensorflow/contrib/opt/python/training/adamax_test.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/opt/python/training/adamax_test.py b/tensorflow/contrib/opt/python/training/adamax_test.py
index a059aae130..915e6504e1 100644
--- a/tensorflow/contrib/opt/python/training/adamax_test.py
+++ b/tensorflow/contrib/opt/python/training/adamax_test.py
@@ -224,8 +224,10 @@ class AdaMaxOptimizerTest(test.TestCase):
           var1_np, m1, v1 = adamax_update_numpy(var1_np, grads1_np, t, m1, v1)
 
           # Validate updated params
-          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0), rtol=1e-2)
-          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1), rtol=1e-2)
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0),
+                                             rtol=1e-2)
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1),
+                                             rtol=1e-2)
           if use_resource:
             self.assertEqual("var0_%d/AdaMax:0" % (i,),
                              opt.get_slot(var=var0, name="m").name)
-- 
GitLab


From e86d969c07c14f8790f364d0b48724848db48d4e Mon Sep 17 00:00:00 2001
From: Shanqing Cai <cais@google.com>
Date: Tue, 5 Jun 2018 11:51:24 -0700
Subject: [PATCH 317/610] Fix bug in which uncompiled tf.keras.Models cannot be
 saved

This bug seems to be specific to tf.keras, i.e., it doesn't happen to keras.

PiperOrigin-RevId: 199334073
---
 tensorflow/python/keras/engine/saving.py      |  2 +-
 tensorflow/python/keras/engine/saving_test.py | 24 +++++++++++++++++++
 2 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/keras/engine/saving.py b/tensorflow/python/keras/engine/saving.py
index 40b693efde..b9a2e1f25f 100644
--- a/tensorflow/python/keras/engine/saving.py
+++ b/tensorflow/python/keras/engine/saving.py
@@ -106,7 +106,7 @@ def save_model(model, filepath, overwrite=True, include_optimizer=True):
     model_layers = model.layers
     save_weights_to_hdf5_group(model_weights_group, model_layers)
 
-    if include_optimizer and hasattr(model, 'optimizer'):
+    if include_optimizer and model.optimizer:
       if isinstance(model.optimizer, optimizers.TFOptimizer):
         logging.warning(
             'TensorFlow optimizers do not '
diff --git a/tensorflow/python/keras/engine/saving_test.py b/tensorflow/python/keras/engine/saving_test.py
index 5abca8a553..1470718a5e 100644
--- a/tensorflow/python/keras/engine/saving_test.py
+++ b/tensorflow/python/keras/engine/saving_test.py
@@ -288,6 +288,30 @@ class TestWholeModelSaving(test.TestCase):
       out2 = new_model.predict(x)
       self.assertAllClose(out, out2, atol=1e-05)
 
+  def test_sequential_model_saving_without_compile(self):
+    if h5py is None:
+      self.skipTest('h5py required to run this test')
+
+    with self.test_session():
+      model = keras.models.Sequential()
+      model.add(keras.layers.Dense(2, input_shape=(3,)))
+      model.add(keras.layers.RepeatVector(3))
+      model.add(keras.layers.TimeDistributed(keras.layers.Dense(3)))
+
+      x = np.random.random((1, 3))
+      out = model.predict(x)
+      fd, fname = tempfile.mkstemp('.h5')
+
+      # Save the model without any compilation or training.
+      keras.models.save_model(model, fname)
+
+      new_model = keras.models.load_model(fname)
+      os.close(fd)
+      os.remove(fname)
+
+      out2 = new_model.predict(x)
+      self.assertAllClose(out, out2, atol=1e-05)
+
   def test_sequential_model_saving_2(self):
     if h5py is None:
       self.skipTest('h5py required to run this test')
-- 
GitLab


From b1fd2ef4d02719cd929fa574796b2c080a21a9ee Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 5 Jun 2018 11:54:41 -0700
Subject: [PATCH 318/610] Add core/util/exec_on_stall.h a tool for debugging
 deadlocks with less logging.

PiperOrigin-RevId: 199334548
---
 tensorflow/core/BUILD                      | 31 ++++++--
 tensorflow/core/util/exec_on_stall.h       | 89 ++++++++++++++++++++++
 tensorflow/core/util/exec_on_stall_test.cc | 47 ++++++++++++
 3 files changed, 160 insertions(+), 7 deletions(-)
 create mode 100644 tensorflow/core/util/exec_on_stall.h
 create mode 100644 tensorflow/core/util/exec_on_stall_test.cc

diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index f5cc6ef2a1..28af3ce4ea 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -72,24 +72,23 @@ licenses(["notice"])  # Apache 2.0
 
 load(
     "//tensorflow:tensorflow.bzl",
+    "cc_header_only_library",
     "full_path",
     "if_android",
-    "if_not_android_mips_and_mips64",
     "if_ios",
     "if_linux_x86_64",
     "if_mobile",
     "if_not_mobile",
-    "if_windows",
     "if_not_windows",
-    "tf_copts",
+    "if_windows",
     "tf_cc_test",
     "tf_cc_tests",
+    "tf_copts",
     "tf_cuda_library",
     "tf_gen_op_libs",
     "tf_generate_proto_text_sources",
     "tf_genrule_cmd_append_to_srcs",
     "tf_opts_nortti_if_android",
-    "cc_header_only_library",
 )
 load("//tensorflow:tensorflow.bzl", "tf_cc_test_mkl")
 load("//tensorflow:tensorflow.bzl", "tf_cc_test_gpu")
@@ -113,11 +112,11 @@ load(
     "tf_additional_human_readable_json_deps",
     "tf_additional_lib_defines",
     "tf_additional_lib_deps",
+    "tf_additional_lib_hdrs",
+    "tf_additional_lib_srcs",
     "tf_additional_libdevice_data",
     "tf_additional_libdevice_deps",
     "tf_additional_libdevice_srcs",
-    "tf_additional_lib_hdrs",
-    "tf_additional_lib_srcs",
     "tf_additional_minimal_lib_srcs",
     "tf_additional_mpi_lib_defines",
     "tf_additional_proto_hdrs",
@@ -141,8 +140,8 @@ load(
 )
 load(
     "//tensorflow/core:platform/default/build_config_root.bzl",
-    "tf_cuda_tests_tags",
     "if_static",
+    "tf_cuda_tests_tags",
 )
 load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
 load("@io_bazel_rules_closure//closure:defs.bzl", "closure_proto_library")
@@ -887,6 +886,12 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "exec_on_stall",
+    hdrs = ["util/exec_on_stall.h"],
+    deps = [":framework_lite"],
+)
+
 cc_library(
     name = "ptr_util",
     hdrs = ["util/ptr_util.h"],
@@ -3252,6 +3257,18 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "exec_on_stall_test",
+    size = "small",
+    srcs = ["util/exec_on_stall_test.cc"],
+    deps = [
+        ":exec_on_stall",
+        ":framework_lite",
+        ":test",
+        ":test_main",
+    ],
+)
+
 tf_cc_test(
     name = "lib_jpeg_jpeg_mem_unittest",
     srcs = ["lib/jpeg/jpeg_mem_unittest.cc"],
diff --git a/tensorflow/core/util/exec_on_stall.h b/tensorflow/core/util/exec_on_stall.h
new file mode 100644
index 0000000000..5c8f9d2324
--- /dev/null
+++ b/tensorflow/core/util/exec_on_stall.h
@@ -0,0 +1,89 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_UTIL_EXEC_ON_STALL_H_
+#define TENSORFLOW_CORE_UTIL_EXEC_ON_STALL_H_
+
+#include <functional>
+
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/mutex.h"
+
+namespace tensorflow {
+
+// An object that executes a particular function only if it
+// is not deleted within the allotted number of seconds.
+//
+// This can be useful in diagnosing deadlocks, stalls and memory leaks
+// without logging too aggressively.
+class ExecuteOnStall {
+ public:
+  // delay_secs: If the object still exists after this many seconds,
+  //     execute f.
+  // f: The function to be executed, for example a detailed log of the
+  //    state of an object to which this is attached.
+  // poll_microseconds: The interval at which the spawned thread wakes to
+  //    test whether the destructor has been invoked.
+  ExecuteOnStall(int delay_secs, std::function<void()> f,
+                 int32 poll_microseconds = 100)
+      : disabled_(false),
+        joined_(false),
+        env_(Env::Default()),
+        f_(f),
+        poll_microseconds_(poll_microseconds) {
+    deadline_ = env_->NowMicros() + 1000000 * delay_secs;
+    env_->SchedClosure([this]() {
+      while (env_->NowMicros() < deadline_) {
+        {
+          mutex_lock l(mu_);
+          if (disabled_) {
+            break;
+          }
+        }
+        env_->SleepForMicroseconds(poll_microseconds_);
+      }
+      {
+        mutex_lock l(mu_);
+        if (!disabled_) {
+          f_();
+        }
+        joined_ = true;
+        cond_var_.notify_all();
+      }
+    });
+  }
+
+  ~ExecuteOnStall() {
+    // Wait for spawned thread to terminate.
+    mutex_lock l(mu_);
+    disabled_ = true;
+    if (!joined_) {
+      cond_var_.wait(l);
+    }
+  }
+
+ private:
+  mutex mu_;
+  condition_variable cond_var_;
+  bool disabled_ GUARDED_BY(mu_);
+  bool joined_ GUARDED_BY(mu_);
+  Env* env_;
+  std::function<void()> f_;
+  int64 deadline_;
+  int32 poll_microseconds_;
+};
+
+}  // namespace tensorflow
+#endif  // TENSORFLOW_CORE_UTIL_EXEC_ON_STALL_H_
diff --git a/tensorflow/core/util/exec_on_stall_test.cc b/tensorflow/core/util/exec_on_stall_test.cc
new file mode 100644
index 0000000000..df8118d611
--- /dev/null
+++ b/tensorflow/core/util/exec_on_stall_test.cc
@@ -0,0 +1,47 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/util/exec_on_stall.h"
+
+#include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace {
+
+struct Chunk {
+  std::unique_ptr<ExecuteOnStall> stall_closure;
+};
+
+Chunk* NewChunk(int stall_seconds, std::function<void()> f) {
+  Chunk* c = new Chunk;
+  c->stall_closure.reset(new ExecuteOnStall(stall_seconds, std::move(f)));
+  return c;
+}
+
+TEST(ExecuteOnStallTest, BothWays) {
+  bool a_triggered = false;
+  bool b_triggered = false;
+  Chunk* a = NewChunk(1, [&a_triggered]() { a_triggered = true; });
+  Chunk* b = NewChunk(1, [&b_triggered]() { b_triggered = true; });
+  delete a;
+  Env::Default()->SleepForMicroseconds(2000000);
+  EXPECT_FALSE(a_triggered);
+  EXPECT_TRUE(b_triggered);
+  delete b;
+}
+
+}  // namespace
+}  // namespace tensorflow
-- 
GitLab


From 62a70dd873bc8488b10df5ad55254119173a5d0c Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 5 Jun 2018 11:58:16 -0700
Subject: [PATCH 319/610] Extend and refactor reader_ops_test

PiperOrigin-RevId: 199335030
---
 .../python/kernel_tests/reader_ops_test.py    | 352 ++++++++----------
 1 file changed, 163 insertions(+), 189 deletions(-)

diff --git a/tensorflow/python/kernel_tests/reader_ops_test.py b/tensorflow/python/kernel_tests/reader_ops_test.py
index 82a27eebee..7be473a5e7 100644
--- a/tensorflow/python/kernel_tests/reader_ops_test.py
+++ b/tensorflow/python/kernel_tests/reader_ops_test.py
@@ -77,6 +77,69 @@ _TEXT = b"""Gaily bedight,
     """
 
 
+class TFCompressionTestCase(test.TestCase):
+
+  def setUp(self):
+    super(TFCompressionTestCase, self).setUp()
+    self._num_files = 2
+    self._num_records = 7
+
+  def _Record(self, f, r):
+    return compat.as_bytes("Record %d of file %d" % (r, f))
+
+  def _CreateFiles(self, options=None, prefix=""):
+    filenames = []
+    for i in range(self._num_files):
+      name = prefix + "tfrecord.%d.txt" % i
+      records = [self._Record(i, j) for j in range(self._num_records)]
+      fn = self._WriteRecordsToFile(records, name, options)
+      filenames.append(fn)
+    return filenames
+
+  def _WriteRecordsToFile(self, records, name="tfrecord", options=None):
+    fn = os.path.join(self.get_temp_dir(), name)
+    with tf_record.TFRecordWriter(fn, options=options) as writer:
+      for r in records:
+        writer.write(r)
+    return fn
+
+  def _ZlibCompressFile(self, infile, name="tfrecord.z"):
+    # zlib compress the file and write compressed contents to file.
+    with open(infile, "rb") as f:
+      cdata = zlib.compress(f.read())
+
+    zfn = os.path.join(self.get_temp_dir(), name)
+    with open(zfn, "wb") as f:
+      f.write(cdata)
+    return zfn
+
+  def _GzipCompressFile(self, infile, name="tfrecord.gz"):
+    # gzip compress the file and write compressed contents to file.
+    with open(infile, "rb") as f:
+      cdata = f.read()
+
+    gzfn = os.path.join(self.get_temp_dir(), name)
+    with gzip.GzipFile(gzfn, "wb") as f:
+      f.write(cdata)
+    return gzfn
+
+  def _ZlibDecompressFile(self, infile, name="tfrecord"):
+    with open(infile, "rb") as f:
+      cdata = zlib.decompress(f.read())
+    fn = os.path.join(self.get_temp_dir(), name)
+    with open(fn, "wb") as f:
+      f.write(cdata)
+    return fn
+
+  def _GzipDecompressFile(self, infile, name="tfrecord"):
+    with gzip.GzipFile(infile, "rb") as f:
+      cdata = f.read()
+    fn = os.path.join(self.get_temp_dir(), name)
+    with open(fn, "wb") as f:
+      f.write(cdata)
+    return fn
+
+
 class IdentityReaderTest(test.TestCase):
 
   def _ExpectRead(self, sess, key, value, expected):
@@ -348,7 +411,7 @@ class TextLineReaderTest(test.TestCase):
         k, v = sess.run([key, value])
 
 
-class FixedLengthRecordReaderTest(test.TestCase):
+class FixedLengthRecordReaderTest(TFCompressionTestCase):
 
   def setUp(self):
     super(FixedLengthRecordReaderTest, self).setUp()
@@ -407,40 +470,18 @@ class FixedLengthRecordReaderTest(test.TestCase):
 
   # gap_bytes=hop_bytes-record_bytes
   def _CreateGzipFiles(self, num_records, gap_bytes):
-    filenames = []
-    for i in range(self._num_files):
-      fn = os.path.join(self.get_temp_dir(), "fixed_length_record.%d.txt" % i)
-      filenames.append(fn)
-      with gzip.GzipFile(fn, "wb") as f:
-        f.write(b"H" * self._header_bytes)
-        if num_records > 0:
-          f.write(self._Record(i, 0))
-        for j in range(1, num_records):
-          if gap_bytes > 0:
-            f.write(b"G" * gap_bytes)
-          f.write(self._Record(i, j))
-        f.write(b"F" * self._footer_bytes)
+    filenames = self._CreateFiles(num_records, gap_bytes)
+    for fn in filenames:
+      # compress inplace.
+      self._GzipCompressFile(fn, fn)
     return filenames
 
   # gap_bytes=hop_bytes-record_bytes
   def _CreateZlibFiles(self, num_records, gap_bytes):
-    filenames = []
-    for i in range(self._num_files):
-      fn = os.path.join(self.get_temp_dir(), "fixed_length_record.%d.txt" % i)
-      filenames.append(fn)
-      with open(fn + ".tmp", "wb") as f:
-        f.write(b"H" * self._header_bytes)
-        if num_records > 0:
-          f.write(self._Record(i, 0))
-        for j in range(1, num_records):
-          if gap_bytes > 0:
-            f.write(b"G" * gap_bytes)
-          f.write(self._Record(i, j))
-        f.write(b"F" * self._footer_bytes)
-      with open(fn + ".tmp", "rb") as f:
-        cdata = zlib.compress(f.read())
-        with open(fn, "wb") as zf:
-          zf.write(cdata)
+    filenames = self._CreateFiles(num_records, gap_bytes)
+    for fn in filenames:
+      # compress inplace.
+      self._ZlibCompressFile(fn, fn)
     return filenames
 
   def _CreateGzipOverlappedRecordFiles(self, num_overlapped_records):
@@ -477,10 +518,7 @@ class FixedLengthRecordReaderTest(test.TestCase):
           ])
           f.write(compat.as_bytes(all_records_str))
         f.write(b"F" * self._footer_bytes)
-      with open(fn + ".tmp", "rb") as f:
-        cdata = zlib.compress(f.read())
-        with open(fn, "wb") as zf:
-          zf.write(cdata)
+      self._ZlibCompressFile(fn + ".tmp", fn)
     return filenames
 
   # gap_bytes=hop_bytes-record_bytes
@@ -529,7 +567,6 @@ class FixedLengthRecordReaderTest(test.TestCase):
       for i in range(self._num_files):
         for j in range(num_overlapped_records):
           k, v = sess.run([key, value])
-          print(v)
           self.assertAllEqual("%s:%d" % (files[i], j), compat.as_text(k))
           self.assertAllEqual(self._OverlappedRecord(i, j), v)
 
@@ -579,25 +616,10 @@ class FixedLengthRecordReaderTest(test.TestCase):
           files, num_overlapped_records, encoding="ZLIB")
 
 
-class TFRecordReaderTest(test.TestCase):
+class TFRecordReaderTest(TFCompressionTestCase):
 
   def setUp(self):
     super(TFRecordReaderTest, self).setUp()
-    self._num_files = 2
-    self._num_records = 7
-
-  def _Record(self, f, r):
-    return compat.as_bytes("Record %d of file %d" % (r, f))
-
-  def _CreateFiles(self):
-    filenames = []
-    for i in range(self._num_files):
-      fn = os.path.join(self.get_temp_dir(), "tf_record.%d.txt" % i)
-      filenames.append(fn)
-      writer = tf_record.TFRecordWriter(fn)
-      for j in range(self._num_records):
-        writer.write(self._Record(i, j))
-    return filenames
 
   def testOneEpoch(self):
     files = self._CreateFiles()
@@ -647,107 +669,106 @@ class TFRecordReaderTest(test.TestCase):
       self.assertEqual(self._num_files * self._num_records, num_v)
 
   def testReadZlibFiles(self):
-    files = self._CreateFiles()
-    zlib_files = []
-    for i, fn in enumerate(files):
-      with open(fn, "rb") as f:
-        cdata = zlib.compress(f.read())
-
-        zfn = os.path.join(self.get_temp_dir(), "tfrecord_%s.z" % i)
-        with open(zfn, "wb") as f:
-          f.write(cdata)
-        zlib_files.append(zfn)
+    options = tf_record.TFRecordOptions(TFRecordCompressionType.ZLIB)
+    files = self._CreateFiles(options)
 
     with self.test_session() as sess:
-      options = tf_record.TFRecordOptions(TFRecordCompressionType.ZLIB)
       reader = io_ops.TFRecordReader(name="test_reader", options=options)
       queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
       key, value = reader.read(queue)
 
-      queue.enqueue_many([zlib_files]).run()
+      queue.enqueue_many([files]).run()
       queue.close().run()
       for i in range(self._num_files):
         for j in range(self._num_records):
           k, v = sess.run([key, value])
-          self.assertTrue(compat.as_text(k).startswith("%s:" % zlib_files[i]))
+          self.assertTrue(compat.as_text(k).startswith("%s:" % files[i]))
           self.assertAllEqual(self._Record(i, j), v)
 
   def testReadGzipFiles(self):
-    files = self._CreateFiles()
-    gzip_files = []
-    for i, fn in enumerate(files):
-      with open(fn, "rb") as f:
-        cdata = f.read()
-
-        zfn = os.path.join(self.get_temp_dir(), "tfrecord_%s.gz" % i)
-        with gzip.GzipFile(zfn, "wb") as f:
-          f.write(cdata)
-        gzip_files.append(zfn)
+    options = tf_record.TFRecordOptions(TFRecordCompressionType.GZIP)
+    files = self._CreateFiles(options)
 
     with self.test_session() as sess:
-      options = tf_record.TFRecordOptions(TFRecordCompressionType.GZIP)
       reader = io_ops.TFRecordReader(name="test_reader", options=options)
       queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
       key, value = reader.read(queue)
 
-      queue.enqueue_many([gzip_files]).run()
+      queue.enqueue_many([files]).run()
       queue.close().run()
       for i in range(self._num_files):
         for j in range(self._num_records):
           k, v = sess.run([key, value])
-          self.assertTrue(compat.as_text(k).startswith("%s:" % gzip_files[i]))
+          self.assertTrue(compat.as_text(k).startswith("%s:" % files[i]))
           self.assertAllEqual(self._Record(i, j), v)
 
 
-class TFRecordWriterZlibTest(test.TestCase):
+class TFRecordWriterTest(TFCompressionTestCase):
 
   def setUp(self):
-    super(TFRecordWriterZlibTest, self).setUp()
-    self._num_files = 2
-    self._num_records = 7
+    super(TFRecordWriterTest, self).setUp()
+
+  def _AssertFilesEqual(self, a, b, equal):
+    for an, bn in zip(a, b):
+      with open(an, "rb") as af, open(bn, "rb") as bf:
+        if equal:
+          self.assertEqual(af.read(), bf.read())
+        else:
+          self.assertNotEqual(af.read(), bf.read())
+
+  def testWriteReadZLibFiles(self):
+    # Write uncompressed then compress manually.
+    options = tf_record.TFRecordOptions(TFRecordCompressionType.NONE)
+    files = self._CreateFiles(options, prefix="uncompressed")
+    zlib_files = [
+        self._ZlibCompressFile(fn, "tfrecord_%s.z" % i)
+        for i, fn in enumerate(files)
+    ]
+    self._AssertFilesEqual(files, zlib_files, False)
 
-  def _Record(self, f, r):
-    return compat.as_bytes("Record %d of file %d" % (r, f))
+    # Now write compressed and verify same.
+    options = tf_record.TFRecordOptions(TFRecordCompressionType.ZLIB)
+    compressed_files = self._CreateFiles(options, prefix="compressed")
+    self._AssertFilesEqual(compressed_files, zlib_files, True)
 
-  def _CreateFiles(self):
-    filenames = []
-    for i in range(self._num_files):
-      fn = os.path.join(self.get_temp_dir(), "tf_record.%d.txt" % i)
-      filenames.append(fn)
-      options = tf_record.TFRecordOptions(
-          compression_type=TFRecordCompressionType.ZLIB)
-      writer = tf_record.TFRecordWriter(fn, options=options)
-      for j in range(self._num_records):
-        writer.write(self._Record(i, j))
-      writer.close()
-      del writer
+    # Decompress the compressed files and verify same.
+    uncompressed_files = [
+        self._ZlibDecompressFile(fn, "tfrecord_%s.z" % i)
+        for i, fn in enumerate(compressed_files)
+    ]
+    self._AssertFilesEqual(uncompressed_files, files, True)
+
+  def testWriteReadGzipFiles(self):
+    # Write uncompressed then compress manually.
+    options = tf_record.TFRecordOptions(TFRecordCompressionType.NONE)
+    files = self._CreateFiles(options, prefix="uncompressed")
+    gzip_files = [
+        self._GzipCompressFile(fn, "tfrecord_%s.gz" % i)
+        for i, fn in enumerate(files)
+    ]
+    self._AssertFilesEqual(files, gzip_files, False)
 
-    return filenames
+    # Now write compressed and verify same.
+    options = tf_record.TFRecordOptions(TFRecordCompressionType.GZIP)
+    compressed_files = self._CreateFiles(options, prefix="compressed")
 
-  def _WriteRecordsToFile(self, records, name="tf_record"):
-    fn = os.path.join(self.get_temp_dir(), name)
-    writer = tf_record.TFRecordWriter(fn, options=None)
-    for r in records:
-      writer.write(r)
-    writer.close()
-    del writer
-    return fn
+    # Note: Gzips written by TFRecordWriter add 'tfrecord_0' so
+    # compressed_files can't be compared with gzip_files
 
-  def _ZlibCompressFile(self, infile, name="tfrecord.z"):
-    # zlib compress the file and write compressed contents to file.
-    with open(infile, "rb") as f:
-      cdata = zlib.compress(f.read())
+    # Decompress the compressed files and verify same.
+    uncompressed_files = [
+        self._GzipDecompressFile(fn, "tfrecord_%s.gz" % i)
+        for i, fn in enumerate(compressed_files)
+    ]
+    self._AssertFilesEqual(uncompressed_files, files, True)
 
-    zfn = os.path.join(self.get_temp_dir(), name)
-    with open(zfn, "wb") as f:
-      f.write(cdata)
-    return zfn
+
+class TFRecordWriterZlibTest(TFCompressionTestCase):
 
   def testOneEpoch(self):
-    files = self._CreateFiles()
+    options = tf_record.TFRecordOptions(TFRecordCompressionType.ZLIB)
+    files = self._CreateFiles(options)
     with self.test_session() as sess:
-      options = tf_record.TFRecordOptions(
-          compression_type=TFRecordCompressionType.ZLIB)
       reader = io_ops.TFRecordReader(name="test_reader", options=options)
       queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
       key, value = reader.read(queue)
@@ -788,8 +809,7 @@ class TFRecordWriterZlibTest(test.TestCase):
       h.write(output)
 
     with self.test_session() as sess:
-      options = tf_record.TFRecordOptions(
-          compression_type=TFRecordCompressionType.ZLIB)
+      options = tf_record.TFRecordOptions(TFRecordCompressionType.ZLIB)
       reader = io_ops.TFRecordReader(name="test_reader", options=options)
       queue = data_flow_ops.FIFOQueue(1, [dtypes.string], shapes=())
       key, value = reader.read(queue)
@@ -808,9 +828,7 @@ class TFRecordWriterZlibTest(test.TestCase):
     # read the compressed contents and verify.
     actual = []
     for r in tf_record.tf_record_iterator(
-        zfn,
-        options=tf_record.TFRecordOptions(
-            tf_record.TFRecordCompressionType.ZLIB)):
+        zfn, options=tf_record.TFRecordOptions(TFRecordCompressionType.ZLIB)):
       actual.append(r)
     self.assertEqual(actual, original)
 
@@ -822,12 +840,9 @@ class TFRecordWriterZlibTest(test.TestCase):
     fn = self._WriteRecordsToFile(original, "zlib_read_write_large.tfrecord")
     zfn = self._ZlibCompressFile(fn, "zlib_read_write_large.tfrecord.z")
 
-    # read the compressed contents and verify.
     actual = []
     for r in tf_record.tf_record_iterator(
-        zfn,
-        options=tf_record.TFRecordOptions(
-            tf_record.TFRecordCompressionType.ZLIB)):
+        zfn, options=tf_record.TFRecordOptions(TFRecordCompressionType.ZLIB)):
       actual.append(r)
     self.assertEqual(actual, original)
 
@@ -835,13 +850,7 @@ class TFRecordWriterZlibTest(test.TestCase):
     """Verify that files produced are gzip compatible."""
     original = [b"foo", b"bar"]
     fn = self._WriteRecordsToFile(original, "gzip_read_write.tfrecord")
-
-    # gzip compress the file and write compressed contents to file.
-    with open(fn, "rb") as f:
-      cdata = f.read()
-    gzfn = os.path.join(self.get_temp_dir(), "tf_record.gz")
-    with gzip.GzipFile(gzfn, "wb") as f:
-      f.write(cdata)
+    gzfn = self._GzipCompressFile(fn, "tfrecord.gz")
 
     actual = []
     for r in tf_record.tf_record_iterator(
@@ -850,89 +859,54 @@ class TFRecordWriterZlibTest(test.TestCase):
     self.assertEqual(actual, original)
 
 
-class TFRecordIteratorTest(test.TestCase):
+class TFRecordIteratorTest(TFCompressionTestCase):
 
   def setUp(self):
     super(TFRecordIteratorTest, self).setUp()
     self._num_records = 7
 
-  def _Record(self, r):
-    return compat.as_bytes("Record %d" % r)
-
-  def _WriteCompressedRecordsToFile(
-      self,
-      records,
-      name="tfrecord.z",
-      compression_type=tf_record.TFRecordCompressionType.ZLIB):
-    fn = os.path.join(self.get_temp_dir(), name)
-    options = tf_record.TFRecordOptions(compression_type=compression_type)
-    writer = tf_record.TFRecordWriter(fn, options=options)
-    for r in records:
-      writer.write(r)
-    writer.close()
-    del writer
-    return fn
-
-  def _ZlibDecompressFile(self, infile, name="tfrecord", wbits=zlib.MAX_WBITS):
-    with open(infile, "rb") as f:
-      cdata = zlib.decompress(f.read(), wbits)
-    zfn = os.path.join(self.get_temp_dir(), name)
-    with open(zfn, "wb") as f:
-      f.write(cdata)
-    return zfn
-
   def testIterator(self):
-    fn = self._WriteCompressedRecordsToFile(
-        [self._Record(i) for i in range(self._num_records)],
-        "compressed_records")
-    options = tf_record.TFRecordOptions(
-        compression_type=TFRecordCompressionType.ZLIB)
+    records = [self._Record(0, i) for i in range(self._num_records)]
+    options = tf_record.TFRecordOptions(TFRecordCompressionType.ZLIB)
+    fn = self._WriteRecordsToFile(records, "compressed_records", options)
+
     reader = tf_record.tf_record_iterator(fn, options)
-    for i in range(self._num_records):
+    for expected in records:
       record = next(reader)
-      self.assertAllEqual(self._Record(i), record)
+      self.assertAllEqual(expected, record)
     with self.assertRaises(StopIteration):
       record = next(reader)
 
   def testWriteZlibRead(self):
     """Verify compression with TFRecordWriter is zlib library compatible."""
     original = [b"foo", b"bar"]
-    fn = self._WriteCompressedRecordsToFile(original,
-                                            "write_zlib_read.tfrecord.z")
+    options = tf_record.TFRecordOptions(TFRecordCompressionType.ZLIB)
+    fn = self._WriteRecordsToFile(original, "write_zlib_read.tfrecord.z",
+                                  options)
+
     zfn = self._ZlibDecompressFile(fn, "write_zlib_read.tfrecord")
-    actual = []
-    for r in tf_record.tf_record_iterator(zfn):
-      actual.append(r)
+    actual = list(tf_record.tf_record_iterator(zfn))
     self.assertEqual(actual, original)
 
   def testWriteZlibReadLarge(self):
     """Verify compression for large records is zlib library compatible."""
     # Make it large (about 5MB)
     original = [_TEXT * 10240]
-    fn = self._WriteCompressedRecordsToFile(original,
-                                            "write_zlib_read_large.tfrecord.z")
-    zfn = self._ZlibDecompressFile(fn, "write_zlib_read_large.tf_record")
-    actual = []
-    for r in tf_record.tf_record_iterator(zfn):
-      actual.append(r)
+    options = tf_record.TFRecordOptions(TFRecordCompressionType.ZLIB)
+    fn = self._WriteRecordsToFile(original, "write_zlib_read_large.tfrecord.z",
+                                  options)
+    zfn = self._ZlibDecompressFile(fn, "write_zlib_read_large.tfrecord")
+    actual = list(tf_record.tf_record_iterator(zfn))
     self.assertEqual(actual, original)
 
   def testWriteGzipRead(self):
     original = [b"foo", b"bar"]
-    fn = self._WriteCompressedRecordsToFile(
-        original,
-        "write_gzip_read.tfrecord.gz",
-        compression_type=TFRecordCompressionType.GZIP)
-
-    with gzip.GzipFile(fn, "rb") as f:
-      cdata = f.read()
-    zfn = os.path.join(self.get_temp_dir(), "tf_record")
-    with open(zfn, "wb") as f:
-      f.write(cdata)
+    options = tf_record.TFRecordOptions(TFRecordCompressionType.GZIP)
+    fn = self._WriteRecordsToFile(original, "write_gzip_read.tfrecord.gz",
+                                  options)
 
-    actual = []
-    for r in tf_record.tf_record_iterator(zfn):
-      actual.append(r)
+    gzfn = self._GzipDecompressFile(fn, "write_gzip_read.tfrecord")
+    actual = list(tf_record.tf_record_iterator(gzfn))
     self.assertEqual(actual, original)
 
   def testBadFile(self):
-- 
GitLab


From 920df27282b3f5d03d79f54ef05cea305c2a30d7 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 5 Jun 2018 12:11:17 -0700
Subject: [PATCH 320/610] Implementation of the symmetrically quantized LSTM
 TFLite Op.

PiperOrigin-RevId: 199337082
---
 .../lite/kernels/internal/kernel_utils.cc     |  262 ++-
 .../lite/kernels/internal/kernel_utils.h      |   83 +
 tensorflow/contrib/lite/kernels/lstm.cc       |  454 ++++-
 tensorflow/contrib/lite/kernels/lstm_test.cc  | 1769 ++++++++++-------
 4 files changed, 1791 insertions(+), 777 deletions(-)

diff --git a/tensorflow/contrib/lite/kernels/internal/kernel_utils.cc b/tensorflow/contrib/lite/kernels/internal/kernel_utils.cc
index 67e3810479..6e62183975 100644
--- a/tensorflow/contrib/lite/kernels/internal/kernel_utils.cc
+++ b/tensorflow/contrib/lite/kernels/internal/kernel_utils.cc
@@ -63,6 +63,8 @@ void RnnBatchStep(const float* input_ptr_batch, const int8_t* input_weights_ptr,
     // Quantize input from float to uint8 + quantization params (scaling
     // factor).
     float unused_min, unused_max;
+    // TODO(mirkov,raziel): replace this for-loop with a MACRO (or function)
+    // whichever is faster.
     for (int b = 0; b < batch_size; ++b) {
       const int offset = b * input_size;
       tensor_utils::SymmetricQuantizeFloats(
@@ -147,6 +149,7 @@ void LstmStep(
         input_to_input_weights_ptr, n_cell, n_input, input_ptr_batch, n_batch,
         input_gate_scratch, /*result_stride=*/1);
   }
+
   tensor_utils::MatrixBatchVectorMultiplyAccumulate(
       input_to_forget_weights_ptr, n_cell, n_input, input_ptr_batch, n_batch,
       forget_gate_scratch, /*result_stride=*/1);
@@ -161,8 +164,7 @@ void LstmStep(
   if (!use_cifg) {
     tensor_utils::MatrixBatchVectorMultiplyAccumulate(
         recurrent_to_input_weights_ptr, n_cell, n_output, output_state_ptr,
-        n_batch, input_gate_scratch,
-        /*result_stride=*/1);
+        n_batch, input_gate_scratch, /*result_stride=*/1);
   }
   tensor_utils::MatrixBatchVectorMultiplyAccumulate(
       recurrent_to_forget_weights_ptr, n_cell, n_output, output_state_ptr,
@@ -253,5 +255,261 @@ void LstmStep(
                            output_state_ptr);
 }
 
+// TODO(alanchiao): move this to tensor_utils.
+void VectorMultiply(const int8_t* vector, const int v_size, const float scale,
+                    float* result) {
+  for (int i = 0; i < v_size; ++i) {
+    *result++ = scale * *vector++;
+  }
+}
+
+void LstmStep(
+    const float* input_ptr_batch, const int8_t* input_to_input_weights_ptr,
+    float input_to_input_weights_scale,
+    const int8_t* input_to_forget_weights_ptr,
+    float input_to_forget_weights_scale,
+    const int8_t* input_to_cell_weights_ptr, float input_to_cell_weights_scale,
+    const int8_t* input_to_output_weights_ptr,
+    float input_to_output_weights_scale,
+    const int8_t* recurrent_to_input_weights_ptr,
+    float recurrent_to_input_weights_scale,
+    const int8_t* recurrent_to_forget_weights_ptr,
+    float recurrent_to_forget_weights_scale,
+    const int8_t* recurrent_to_cell_weights_ptr,
+    float recurrent_to_cell_weights_scale,
+    const int8_t* recurrent_to_output_weights_ptr,
+    float recurrent_to_output_weights_scale,
+    const int8_t* cell_to_input_weights_ptr, float cell_to_input_weights_scale,
+    const int8_t* cell_to_forget_weights_ptr,
+    float cell_to_forget_weights_scale,
+    const int8_t* cell_to_output_weights_ptr,
+    float cell_to_output_weights_scale, const float* input_gate_bias_ptr,
+    const float* forget_gate_bias_ptr, const float* cell_bias_ptr,
+    const float* output_gate_bias_ptr, const int8_t* projection_weights_ptr,
+    float projection_weights_scale, const float* projection_bias_ptr,
+    const TfLiteLSTMParams* params, int n_batch, int n_cell, int n_input,
+    int n_output, float* input_gate_scratch, float* forget_gate_scratch,
+    float* cell_scratch, float* output_gate_scratch, float* scaling_factors,
+    float* product_scaling_factors, float* recovered_cell_weights,
+    int8_t* quantized_input_ptr_batch, int8_t* quantized_output_state_ptr,
+    int8_t* quantized_cell_state_ptr, float* output_state_ptr,
+    float* cell_state_ptr, float* output_ptr_batch) {
+  // Since we have already checked that weights are all there or none, we can
+  // check the existence of only one to get the condition.
+  const bool use_cifg = (input_to_input_weights_ptr == nullptr);
+  const bool use_peephole = (cell_to_output_weights_ptr != nullptr);
+  // Initialize scratch buffers with bias.
+  if (!use_cifg) {
+    tensor_utils::VectorBatchVectorAssign(input_gate_bias_ptr, n_cell, n_batch,
+                                          input_gate_scratch);
+  }
+  tensor_utils::VectorBatchVectorAssign(forget_gate_bias_ptr, n_cell, n_batch,
+                                        forget_gate_scratch);
+  tensor_utils::VectorBatchVectorAssign(cell_bias_ptr, n_cell, n_batch,
+                                        cell_scratch);
+  tensor_utils::VectorBatchVectorAssign(output_gate_bias_ptr, n_cell, n_batch,
+                                        output_gate_scratch);
+
+  if (!tensor_utils::IsZeroVector(input_ptr_batch, n_batch * n_input)) {
+    // Save quantization and matmul computation for all zero input.
+    float unused_min, unused_max;
+    for (int b = 0; b < n_batch; ++b) {
+      const int offset = b * n_input;
+      tensor_utils::SymmetricQuantizeFloats(
+          input_ptr_batch + offset, n_input, quantized_input_ptr_batch + offset,
+          &unused_min, &unused_max, &scaling_factors[b]);
+    }
+    // For each batch and cell: compute input_weight * input.
+    if (!use_cifg) {
+      for (int b = 0; b < n_batch; ++b) {
+        product_scaling_factors[b] =
+            scaling_factors[b] * input_to_input_weights_scale;
+      }
+      tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+          input_to_input_weights_ptr, n_cell, n_input,
+          quantized_input_ptr_batch, product_scaling_factors, n_batch,
+          input_gate_scratch, /*result_stride=*/1);
+    }
+
+    for (int b = 0; b < n_batch; ++b) {
+      product_scaling_factors[b] =
+          scaling_factors[b] * input_to_forget_weights_scale;
+    }
+    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+        input_to_forget_weights_ptr, n_cell, n_input, quantized_input_ptr_batch,
+        product_scaling_factors, n_batch, forget_gate_scratch,
+        /*result_stride=*/1);
+
+    for (int b = 0; b < n_batch; ++b) {
+      product_scaling_factors[b] =
+          scaling_factors[b] * input_to_cell_weights_scale;
+    }
+    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+        input_to_cell_weights_ptr, n_cell, n_input, quantized_input_ptr_batch,
+        product_scaling_factors, n_batch, cell_scratch, /*result_stride=*/1);
+
+    for (int b = 0; b < n_batch; ++b) {
+      product_scaling_factors[b] =
+          scaling_factors[b] * input_to_cell_weights_scale;
+    }
+    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+        input_to_output_weights_ptr, n_cell, n_input, quantized_input_ptr_batch,
+        product_scaling_factors, n_batch, output_gate_scratch,
+        /*result_stride=*/1);
+  }
+
+  if (!tensor_utils::IsZeroVector(output_state_ptr, n_batch * n_output)) {
+    // Save quantization and matmul computation for all zero input.
+    float unused_min, unused_max;
+    for (int b = 0; b < n_batch; ++b) {
+      const int offset = b * n_output;
+      tensor_utils::SymmetricQuantizeFloats(output_state_ptr + offset, n_output,
+                                            quantized_output_state_ptr + offset,
+                                            &unused_min, &unused_max,
+                                            &scaling_factors[b]);
+    }
+    // For each batch and cell: compute recurrent_weight * output_state.
+    if (!use_cifg) {
+      for (int b = 0; b < n_batch; ++b) {
+        product_scaling_factors[b] =
+            scaling_factors[b] * recurrent_to_input_weights_scale;
+      }
+      tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+          recurrent_to_input_weights_ptr, n_cell, n_output,
+          quantized_output_state_ptr, product_scaling_factors, n_batch,
+          input_gate_scratch, /*result_stride=*/1);
+    }
+
+    for (int b = 0; b < n_batch; ++b) {
+      product_scaling_factors[b] =
+          scaling_factors[b] * recurrent_to_forget_weights_scale;
+    }
+    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+        recurrent_to_forget_weights_ptr, n_cell, n_output,
+        quantized_output_state_ptr, product_scaling_factors, n_batch,
+        forget_gate_scratch, /*result_stride=*/1);
+
+    for (int b = 0; b < n_batch; ++b) {
+      product_scaling_factors[b] =
+          scaling_factors[b] * recurrent_to_cell_weights_scale;
+    }
+    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+        recurrent_to_cell_weights_ptr, n_cell, n_output,
+        quantized_output_state_ptr, product_scaling_factors, n_batch,
+        cell_scratch, /*result_stride=*/1);
+
+    for (int b = 0; b < n_batch; ++b) {
+      product_scaling_factors[b] =
+          scaling_factors[b] * recurrent_to_output_weights_scale;
+    }
+    tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+        recurrent_to_output_weights_ptr, n_cell, n_output,
+        quantized_output_state_ptr, product_scaling_factors, n_batch,
+        output_gate_scratch, /*result_stride=*/1);
+  }
+
+  // Save quantization and matmul computation for all zero input.
+  const bool is_cell_state_all_zeros =
+      tensor_utils::IsZeroVector(cell_state_ptr, n_batch * n_cell);
+
+  // For each batch and cell: update input gate.
+  if (!use_cifg) {
+    if (use_peephole && !is_cell_state_all_zeros) {
+      VectorMultiply(cell_to_input_weights_ptr, n_cell,
+                     1. / cell_to_input_weights_scale, recovered_cell_weights);
+      tensor_utils::VectorBatchVectorCwiseProductAccumulate(
+          recovered_cell_weights, n_cell, cell_state_ptr, n_batch,
+          input_gate_scratch);
+    }
+    tensor_utils::ApplySigmoidToVector(input_gate_scratch, n_cell * n_batch,
+                                       input_gate_scratch);
+  }
+
+  // For each batch and cell: update forget gate.
+  if (use_peephole && !is_cell_state_all_zeros) {
+    VectorMultiply(cell_to_forget_weights_ptr, n_cell,
+                   1. / cell_to_forget_weights_scale, recovered_cell_weights);
+    tensor_utils::VectorBatchVectorCwiseProductAccumulate(
+        recovered_cell_weights, n_cell, cell_state_ptr, n_batch,
+        forget_gate_scratch);
+  }
+  tensor_utils::ApplySigmoidToVector(forget_gate_scratch, n_cell * n_batch,
+                                     forget_gate_scratch);
+
+  // For each batch and cell: update the cell.
+  tensor_utils::VectorVectorCwiseProduct(forget_gate_scratch, cell_state_ptr,
+                                         n_batch * n_cell, cell_state_ptr);
+  tensor_utils::ApplyActivationToVector(cell_scratch, n_batch * n_cell,
+                                        params->activation, cell_scratch);
+  if (use_cifg) {
+    tensor_utils::Sub1Vector(forget_gate_scratch, n_batch * n_cell,
+                             forget_gate_scratch);
+    tensor_utils::VectorVectorCwiseProductAccumulate(
+        cell_scratch, forget_gate_scratch, n_batch * n_cell, cell_state_ptr);
+  } else {
+    tensor_utils::VectorVectorCwiseProductAccumulate(
+        cell_scratch, input_gate_scratch, n_batch * n_cell, cell_state_ptr);
+  }
+  if (params->cell_clip > 0.0) {
+    tensor_utils::ClipVector(cell_state_ptr, n_batch * n_cell,
+                             params->cell_clip, cell_state_ptr);
+  }
+
+  // For each batch and cell: update the output gate.
+  if (use_peephole && !is_cell_state_all_zeros) {
+    VectorMultiply(cell_to_output_weights_ptr, n_cell,
+                   1. / cell_to_output_weights_scale, recovered_cell_weights);
+    tensor_utils::VectorBatchVectorCwiseProductAccumulate(
+        recovered_cell_weights, n_cell, cell_state_ptr, n_batch,
+        output_gate_scratch);
+  }
+  tensor_utils::ApplySigmoidToVector(output_gate_scratch, n_batch * n_cell,
+                                     output_gate_scratch);
+  tensor_utils::ApplyActivationToVector(cell_state_ptr, n_batch * n_cell,
+                                        params->activation, cell_scratch);
+  tensor_utils::VectorVectorCwiseProduct(output_gate_scratch, cell_scratch,
+                                         n_batch * n_cell, output_gate_scratch);
+
+  // For each batch: update the projection and output_state.
+  const bool use_projection_weight = (projection_weights_ptr != nullptr);
+  const bool use_projection_bias = (projection_bias_ptr != nullptr);
+  if (use_projection_weight) {
+    if (use_projection_bias) {
+      tensor_utils::VectorBatchVectorAssign(projection_bias_ptr, n_output,
+                                            n_batch, output_ptr_batch);
+    } else {
+      tensor_utils::ZeroVector(output_ptr_batch, n_batch * n_output);
+    }
+    if (!tensor_utils::IsZeroVector(output_gate_scratch, n_batch * n_cell)) {
+      // Save quantization and matmul computation for all zero input.
+      float unused_min, unused_max;
+      for (int b = 0; b < n_batch; ++b) {
+        const int offset = b * n_cell;
+        tensor_utils::SymmetricQuantizeFloats(
+            output_gate_scratch + offset, n_cell,
+            quantized_cell_state_ptr + offset, &unused_min, &unused_max,
+            &scaling_factors[b]);
+      }
+      for (int b = 0; b < n_batch; ++b) {
+        product_scaling_factors[b] =
+            scaling_factors[b] * projection_weights_scale;
+      }
+      tensor_utils::MatrixBatchVectorMultiplyAccumulate(
+          projection_weights_ptr, n_output, n_cell, quantized_cell_state_ptr,
+          product_scaling_factors, n_batch, output_ptr_batch,
+          /*result_stride=*/1);
+    }
+    if (params->proj_clip > 0.0) {
+      tensor_utils::ClipVector(output_ptr_batch, n_batch * n_output,
+                               params->proj_clip, output_ptr_batch);
+    }
+  } else {
+    tensor_utils::CopyVector(output_gate_scratch, n_batch * n_output,
+                             output_ptr_batch);
+  }
+  tensor_utils::CopyVector(output_ptr_batch, n_batch * n_output,
+                           output_state_ptr);
+}
+
 }  // namespace kernel_utils
 }  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/internal/kernel_utils.h b/tensorflow/contrib/lite/kernels/internal/kernel_utils.h
index f3f42f0840..2a11b37a60 100644
--- a/tensorflow/contrib/lite/kernels/internal/kernel_utils.h
+++ b/tensorflow/contrib/lite/kernels/internal/kernel_utils.h
@@ -92,6 +92,89 @@ void LstmStep(
     float* forget_gate_scratch, float* cell_scratch, float* output_gate_scratch,
     float* output_ptr_batch);
 
+// Same as above but with quantized weight matrices. In detail:
+// Input of size 'n_batch * n_input':
+//   input_ptr_batch
+//
+// LSTM weights:
+// Quantized input weights of size 'n_cell * n_input':
+//   input_to_input_weights            - optional (can be nullptr)
+//   input_to_forget_weights
+//   input_to_cell_weights
+//   input_to_output_weights
+// Quantized recurrent weights of size 'n_cell * n_output':
+//   recurrent_to_input_weights        - optional
+//   recurrent_to_forget_weights
+//   recurrent_to_cell_weights
+//   recurrent_to_output_weights
+// Quantized peephole weights of size 'n_cell', representing diagonal matrices.
+//   cell_to_input_weights             - optional
+//   cell_to_forget_weights            - optional
+//   cell_to_output_weights            - optional
+// Quantized projection weights of size 'n_output * n_cell'
+//   projection_weights_ptr            - optional
+// Weight scales (scalars) for each of the weights above.
+//   input_to_input_weights_scale      - optional
+//   input_to_forget_weights_scale
+//   input_to_cell_weights_scale
+//   input_to_output_weights_scale
+//   recurrent_to_input_weights_scale  - optional
+//   recurrent_to_forget_weights_scale
+//   recurrent_to_cell_weights_scale
+//   recurrent_to_output_weights_scale
+//   cell_to_input_weights_scale
+//   cell_to_forget_weights_scale
+//   cell_to_output_weights_scale
+//   projection_weights_scale          - optional
+// Gate biases of size 'n_cell':
+//   input_gate_bias_ptr               - optional
+//   forget_gate_bias_ptr
+//   cell_bias_ptr
+//   output_gate_bias_ptr
+//
+// Temporary pre-allocated storage for quantized values:
+//   quantized_input_ptr_batch (same size as input_ptr_batch)
+//   quantized_output_state_ptr (same size as output_state_ptr)
+//   quantized_cell_state_ptr (same size as cell_state_ptr)
+// Temporary pre-allocated storage for recovered values:
+//   recovered_cell_weights (same size as cell_to_*_weights)
+//
+// Outputs:
+//   output_state_ptr - size 'n_batch * n_output'
+//   cell_state_ptr   - size 'n_batch * n_cell'
+//   output_ptr_batch - size 'n_batch * n_output'
+void LstmStep(
+    const float* input_ptr_batch, const int8_t* input_to_input_weights_ptr,
+    float input_to_input_weights_scale,
+    const int8_t* input_to_forget_weights_ptr,
+    float input_to_forget_weights_scale,
+    const int8_t* input_to_cell_weights_ptr, float input_to_cell_weights_scale,
+    const int8_t* input_to_output_weights_ptr,
+    float input_to_output_weights_scale,
+    const int8_t* recurrent_to_input_weights_ptr,
+    float recurrent_to_input_weights_scale,
+    const int8_t* recurrent_to_forget_weights_ptr,
+    float recurrent_to_forget_weights_scale,
+    const int8_t* recurrent_to_cell_weights_ptr,
+    float recurrent_to_cell_weights_scale,
+    const int8_t* recurrent_to_output_weights_ptr,
+    float recurrent_to_output_weights_scale,
+    const int8_t* cell_to_input_weights_ptr, float cell_to_input_weights_scale,
+    const int8_t* cell_to_forget_weights_ptr,
+    float cell_to_forget_weights_scale,
+    const int8_t* cell_to_output_weights_ptr,
+    float cell_to_output_weights_scale, const float* input_gate_bias_ptr,
+    const float* forget_gate_bias_ptr, const float* cell_bias_ptr,
+    const float* output_gate_bias_ptr, const int8_t* projection_weights_ptr,
+    float projection_weights_scale, const float* projection_bias_ptr,
+    const TfLiteLSTMParams* params, int n_batch, int n_cell, int n_input,
+    int n_output, float* input_gate_scratch, float* forget_gate_scratch,
+    float* cell_scratch, float* output_gate_scratch, float* scaling_factors,
+    float* product_scaling_factors, float* recovered_cell_weights,
+    int8_t* quantized_input_ptr_batch, int8_t* quantized_output_state_ptr,
+    int8_t* quantized_cell_state_ptr, float* output_state_ptr,
+    float* cell_state_ptr, float* output_ptr_batch);
+
 }  // namespace kernel_utils
 }  // namespace tflite
 #endif  // TENSORFLOW_CONTRIB_LITE_KERNELS_INTERNAL_KERNEL_UTILS_H_
diff --git a/tensorflow/contrib/lite/kernels/lstm.cc b/tensorflow/contrib/lite/kernels/lstm.cc
index 9aae3e571b..eb26a02455 100644
--- a/tensorflow/contrib/lite/kernels/lstm.cc
+++ b/tensorflow/contrib/lite/kernels/lstm.cc
@@ -86,7 +86,8 @@ constexpr int kOutputTensor = 2;
 void* Init(TfLiteContext* context, const char* buffer, size_t length) {
   auto* op_data = new OpData;
   op_data->kernel_type = kTfLiteLSTMFullKernel;
-  context->AddTensors(context, 1, &op_data->scratch_tensor_index);
+  context->AddTensors(context, /*tensors_to_add=*/7,
+                      &op_data->scratch_tensor_index);
   return op_data;
 }
 
@@ -94,7 +95,7 @@ void* Init(TfLiteContext* context, const char* buffer, size_t length) {
 TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context,
                                         TfLiteNode* node, int n_input,
                                         int n_output, int n_cell) {
-  auto* params = reinterpret_cast<TfLiteLSTMParams*>(node->builtin_data);
+  const auto* params = reinterpret_cast<TfLiteLSTMParams*>(node->builtin_data);
 
   // Making sure clipping parameters have valid values.
   // == 0 means no clipping
@@ -104,7 +105,7 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context,
 
   const TfLiteTensor* input_to_input_weights =
       GetOptionalInputTensor(context, node, kInputToInputWeightsTensor);
-  if (input_to_input_weights) {
+  if (input_to_input_weights != nullptr) {
     TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->size, 2);
     TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->data[0], n_cell);
     TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->data[1], n_input);
@@ -124,7 +125,7 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context,
 
   const TfLiteTensor* recurrent_to_input_weights =
       GetOptionalInputTensor(context, node, kRecurrentToInputWeightsTensor);
-  if (recurrent_to_input_weights) {
+  if (recurrent_to_input_weights != nullptr) {
     TF_LITE_ENSURE_EQ(context, recurrent_to_input_weights->dims->size, 2);
     TF_LITE_ENSURE_EQ(context, recurrent_to_input_weights->dims->data[0],
                       n_cell);
@@ -214,7 +215,7 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context,
 
   const TfLiteTensor* projection_weights =
       GetOptionalInputTensor(context, node, kProjectionWeightsTensor);
-  if (projection_weights) {
+  if (projection_weights != nullptr) {
     TF_LITE_ENSURE_EQ(context, projection_weights->dims->size, 2);
     TF_LITE_ENSURE_EQ(context, projection_weights->dims->data[0], n_output);
     TF_LITE_ENSURE_EQ(context, projection_weights->dims->data[1], n_cell);
@@ -222,7 +223,7 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context,
 
   const TfLiteTensor* projection_bias =
       GetOptionalInputTensor(context, node, kProjectionBiasTensor);
-  if (projection_bias) {
+  if (projection_bias != nullptr) {
     TF_LITE_ENSURE_EQ(context, projection_bias->dims->size, 1);
     TF_LITE_ENSURE_EQ(context, projection_bias->dims->data[0], n_output);
   }
@@ -252,6 +253,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   // Inferring batch size, number of outputs and number of cells from the
   // input tensors.
   const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32);
   TF_LITE_ENSURE(context, input->dims->size > 1);
   const int n_batch = input->dims->data[0];
   const int n_input = input->dims->data[1];
@@ -296,86 +298,148 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE_OK(context,
                     context->ResizeTensor(context, cell_state, cell_size));
 
-  // Create a scratch buffer tensor.
+  // Mark state tensors as persistent tensors.
+  output_state->allocation_type = kTfLiteArenaRwPersistent;
+  cell_state->allocation_type = kTfLiteArenaRwPersistent;
+
+  // The weights are of consistent type, so it suffices to check one.
+  // TODO(mirkov): create a utility/macro for this check, so all Ops can use it.
+  const bool is_hybrid_op = (input_to_output_weights->type == kTfLiteUInt8 &&
+                             input->type == kTfLiteFloat32);
+
   TfLiteIntArrayFree(node->temporaries);
-  node->temporaries = TfLiteIntArrayCreate(1);
+  if (is_hybrid_op) {
+    node->temporaries = TfLiteIntArrayCreate(7);
+  } else {
+    node->temporaries = TfLiteIntArrayCreate(1);
+  }
   node->temporaries->data[0] = op_data->scratch_tensor_index;
+
+  // Create a scratch buffer tensor.
   TfLiteTensor* scratch_buffer = GetTemporary(context, node, /*index=*/0);
   scratch_buffer->type = input->type;
   scratch_buffer->allocation_type = kTfLiteArenaRw;
 
-  // Mark state tensors as persistent tensors.
-  output_state->allocation_type = kTfLiteArenaRwPersistent;
-  cell_state->allocation_type = kTfLiteArenaRwPersistent;
-
   const TfLiteTensor* input_to_input_weights =
       GetOptionalInputTensor(context, node, kInputToInputWeightsTensor);
   const bool use_cifg = (input_to_input_weights == nullptr);
+  TfLiteIntArray* scratch_buffer_size = TfLiteIntArrayCreate(2);
+  scratch_buffer_size->data[0] = n_batch;
   if (use_cifg) {
-    TfLiteIntArray* scratch_buffer_size = TfLiteIntArrayCreate(2);
-    scratch_buffer_size->data[0] = n_batch;
     // Reserving space for Cell, Forget, Output gates
     scratch_buffer_size->data[1] = n_cell * 3;
-    TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, scratch_buffer,
-                                                     scratch_buffer_size));
   } else {
-    TfLiteIntArray* scratch_buffer_size = TfLiteIntArrayCreate(2);
-    scratch_buffer_size->data[0] = n_batch;
     // Reserving space for Input, Cell, Forget, Output gates
     scratch_buffer_size->data[1] = n_cell * 4;
-    TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, scratch_buffer,
-                                                     scratch_buffer_size));
+  }
+  TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, scratch_buffer,
+                                                   scratch_buffer_size));
+
+  if (is_hybrid_op) {
+    // Allocate temporary tensors to store quantized values of input,
+    // output_state and cell_state tensors.
+    node->temporaries->data[1] = op_data->scratch_tensor_index + 1;
+    TfLiteTensor* input_quantized = GetTemporary(context, node, /*index=*/1);
+    input_quantized->type = kTfLiteUInt8;
+    input_quantized->allocation_type = kTfLiteArenaRw;
+    if (!TfLiteIntArrayEqual(input_quantized->dims, input->dims)) {
+      TfLiteIntArray* input_quantized_size = TfLiteIntArrayCopy(input->dims);
+      TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, input_quantized,
+                                                       input_quantized_size));
+    }
+    node->temporaries->data[2] = op_data->scratch_tensor_index + 2;
+    TfLiteTensor* output_state_quantized =
+        GetTemporary(context, node, /*index=*/2);
+    output_state_quantized->type = kTfLiteUInt8;
+    output_state_quantized->allocation_type = kTfLiteArenaRw;
+    if (!TfLiteIntArrayEqual(output_state_quantized->dims,
+                             output_state->dims)) {
+      TfLiteIntArray* output_state_quantized_size =
+          TfLiteIntArrayCopy(output_state->dims);
+      TF_LITE_ENSURE_OK(context,
+                        context->ResizeTensor(context, output_state_quantized,
+                                              output_state_quantized_size));
+    }
+    node->temporaries->data[3] = op_data->scratch_tensor_index + 3;
+    TfLiteTensor* cell_state_quantized =
+        GetTemporary(context, node, /*index=*/3);
+    cell_state_quantized->type = kTfLiteUInt8;
+    cell_state_quantized->allocation_type = kTfLiteArenaRw;
+    if (!TfLiteIntArrayEqual(cell_state_quantized->dims, cell_state->dims)) {
+      TfLiteIntArray* cell_state_quantized_size =
+          TfLiteIntArrayCopy(cell_state->dims);
+      TF_LITE_ENSURE_OK(context,
+                        context->ResizeTensor(context, cell_state_quantized,
+                                              cell_state_quantized_size));
+    }
+
+    // Allocate temporary tensors to store scaling factors and product scaling
+    // factors. The latter is a convenience storage which allows quantizing
+    // a vector once (which produces the scaling factors) and multiply it with
+    // different matrices (which requires multiplying the scaling factors with
+    // the scaling factor of the matrix).
+    node->temporaries->data[4] = op_data->scratch_tensor_index + 4;
+    TfLiteTensor* scaling_factors = GetTemporary(context, node, /*index=*/4);
+    scaling_factors->type = kTfLiteFloat32;
+    scaling_factors->allocation_type = kTfLiteArenaRw;
+    TfLiteIntArray* scaling_factors_size = TfLiteIntArrayCreate(1);
+    scaling_factors_size->data[0] = n_batch;
+    if (!TfLiteIntArrayEqual(scaling_factors->dims, scaling_factors_size)) {
+      TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, scaling_factors,
+                                                       scaling_factors_size));
+    }
+    node->temporaries->data[5] = op_data->scratch_tensor_index + 5;
+    TfLiteTensor* prod_scaling_factors =
+        GetTemporary(context, node, /*index=*/5);
+    prod_scaling_factors->type = kTfLiteFloat32;
+    prod_scaling_factors->allocation_type = kTfLiteArenaRw;
+    TfLiteIntArray* prod_scaling_factors_size = TfLiteIntArrayCreate(1);
+    prod_scaling_factors_size->data[0] = n_batch;
+    if (!TfLiteIntArrayEqual(prod_scaling_factors->dims,
+                             prod_scaling_factors_size)) {
+      TF_LITE_ENSURE_OK(context,
+                        context->ResizeTensor(context, prod_scaling_factors,
+                                              prod_scaling_factors_size));
+    }
+
+    // Allocate a temporary tensor to store the recovered cell weights. Since
+    // this is used for diagonal matrices, only need to store n_cell values.
+    node->temporaries->data[6] = op_data->scratch_tensor_index + 6;
+    TfLiteTensor* recovered_cell_weights =
+        GetTemporary(context, node, /*index=*/6);
+    recovered_cell_weights->type = kTfLiteFloat32;
+    recovered_cell_weights->allocation_type = kTfLiteArenaRw;
+    TfLiteIntArray* recovered_cell_weights_size = TfLiteIntArrayCreate(1);
+    recovered_cell_weights_size->data[0] = n_cell;
+    if (!TfLiteIntArrayEqual(recovered_cell_weights->dims,
+                             recovered_cell_weights_size)) {
+      TF_LITE_ENSURE_OK(context,
+                        context->ResizeTensor(context, recovered_cell_weights,
+                                              recovered_cell_weights_size));
+    }
   }
   return kTfLiteOk;
 }
 
 // The LSTM Op engine.
-TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
-  auto* params = reinterpret_cast<TfLiteLSTMParams*>(node->builtin_data);
-  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
-
-  const TfLiteTensor* input_to_input_weights =
-      GetOptionalInputTensor(context, node, kInputToInputWeightsTensor);
-  const TfLiteTensor* input_to_forget_weights =
-      GetInput(context, node, kInputToForgetWeightsTensor);
-  const TfLiteTensor* input_to_cell_weights =
-      GetInput(context, node, kInputToCellWeightsTensor);
-  const TfLiteTensor* input_to_output_weights =
-      GetInput(context, node, kInputToOutputWeightsTensor);
-
-  const TfLiteTensor* recurrent_to_input_weights =
-      GetOptionalInputTensor(context, node, kRecurrentToInputWeightsTensor);
-  const TfLiteTensor* recurrent_to_forget_weights =
-      GetInput(context, node, kRecurrentToForgetWeightsTensor);
-  const TfLiteTensor* recurrent_to_cell_weights =
-      GetInput(context, node, kRecurrentToCellWeightsTensor);
-  const TfLiteTensor* recurrent_to_output_weights =
-      GetInput(context, node, kRecurrentToOutputWeightsTensor);
-
-  const TfLiteTensor* cell_to_input_weights =
-      GetOptionalInputTensor(context, node, kCellToInputWeightsTensor);
-  const TfLiteTensor* cell_to_forget_weights =
-      GetOptionalInputTensor(context, node, kCellToForgetWeightsTensor);
-  const TfLiteTensor* cell_to_output_weights =
-      GetOptionalInputTensor(context, node, kCellToOutputWeightsTensor);
-
-  const TfLiteTensor* input_gate_bias =
-      GetOptionalInputTensor(context, node, kInputGateBiasTensor);
-  const TfLiteTensor* forget_gate_bias =
-      GetInput(context, node, kForgetGateBiasTensor);
-  const TfLiteTensor* cell_bias = GetInput(context, node, kCellGateBiasTensor);
-  const TfLiteTensor* output_gate_bias =
-      GetInput(context, node, kOutputGateBiasTensor);
-
-  const TfLiteTensor* projection_weights =
-      GetOptionalInputTensor(context, node, kProjectionWeightsTensor);
-  const TfLiteTensor* projection_bias =
-      GetOptionalInputTensor(context, node, kProjectionBiasTensor);
-
-  TfLiteTensor* output_state = GetOutput(context, node, kOutputStateTensor);
-  TfLiteTensor* cell_state = GetOutput(context, node, kCellStateTensor);
-  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
-
+TfLiteStatus EvalFloat(
+    const TfLiteTensor* input, const TfLiteTensor* input_to_input_weights,
+    const TfLiteTensor* input_to_forget_weights,
+    const TfLiteTensor* input_to_cell_weights,
+    const TfLiteTensor* input_to_output_weights,
+    const TfLiteTensor* recurrent_to_input_weights,
+    const TfLiteTensor* recurrent_to_forget_weights,
+    const TfLiteTensor* recurrent_to_cell_weights,
+    const TfLiteTensor* recurrent_to_output_weights,
+    const TfLiteTensor* cell_to_input_weights,
+    const TfLiteTensor* cell_to_forget_weights,
+    const TfLiteTensor* cell_to_output_weights,
+    const TfLiteTensor* input_gate_bias, const TfLiteTensor* forget_gate_bias,
+    const TfLiteTensor* cell_bias, const TfLiteTensor* output_gate_bias,
+    const TfLiteTensor* projection_weights, const TfLiteTensor* projection_bias,
+    const TfLiteLSTMParams* params, TfLiteTensor* scratch_buffer,
+    TfLiteTensor* output_state, TfLiteTensor* cell_state,
+    TfLiteTensor* output) {
   const int n_batch = input->dims->data[0];
   const int n_input = input->dims->data[1];
   // n_cell and n_output will be the same size when there is no projection.
@@ -387,9 +451,6 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   const bool use_cifg = (input_to_input_weights == nullptr);
   const bool use_peephole = (cell_to_output_weights != nullptr);
 
-  // Index the scratch buffers pointers to the global scratch buffer.
-  TfLiteTensor* scratch_buffer = GetTemporary(context, node, /*index=*/0);
-
   float* input_gate_scratch = nullptr;
   float* cell_scratch = nullptr;
   float* forget_gate_scratch = nullptr;
@@ -457,6 +518,259 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   return kTfLiteOk;
 }
 
+TfLiteStatus EvalHybrid(
+    const TfLiteTensor* input, const TfLiteTensor* input_to_input_weights,
+    const TfLiteTensor* input_to_forget_weights,
+    const TfLiteTensor* input_to_cell_weights,
+    const TfLiteTensor* input_to_output_weights,
+    const TfLiteTensor* recurrent_to_input_weights,
+    const TfLiteTensor* recurrent_to_forget_weights,
+    const TfLiteTensor* recurrent_to_cell_weights,
+    const TfLiteTensor* recurrent_to_output_weights,
+    const TfLiteTensor* cell_to_input_weights,
+    const TfLiteTensor* cell_to_forget_weights,
+    const TfLiteTensor* cell_to_output_weights,
+    const TfLiteTensor* input_gate_bias, const TfLiteTensor* forget_gate_bias,
+    const TfLiteTensor* cell_bias, const TfLiteTensor* output_gate_bias,
+    const TfLiteTensor* projection_weights, const TfLiteTensor* projection_bias,
+    const TfLiteLSTMParams* params, TfLiteTensor* scratch_buffer,
+    TfLiteTensor* scaling_factors, TfLiteTensor* prod_scaling_factors,
+    TfLiteTensor* recovered_cell_weights, TfLiteTensor* input_quantized,
+    TfLiteTensor* output_state_quantized, TfLiteTensor* cell_state_quantized,
+    TfLiteTensor* output_state, TfLiteTensor* cell_state,
+    TfLiteTensor* output) {
+  const int n_batch = input->dims->data[0];
+  const int n_input = input->dims->data[1];
+  // n_cell and n_output will be the same size when there is no projection.
+  const int n_cell = input_to_output_weights->dims->data[0];
+  const int n_output = recurrent_to_output_weights->dims->data[1];
+
+  // Since we have already checked that weights are all there or none, we can
+  // check the existence of only one to get the condition.
+  const bool use_cifg = (input_to_input_weights == nullptr);
+  const bool use_peephole = (cell_to_output_weights != nullptr);
+
+  float* input_gate_scratch = nullptr;
+  float* cell_scratch = nullptr;
+  float* forget_gate_scratch = nullptr;
+  float* output_gate_scratch = nullptr;
+  if (use_cifg) {
+    cell_scratch = scratch_buffer->data.f;
+    forget_gate_scratch = scratch_buffer->data.f + n_cell * n_batch;
+    output_gate_scratch = scratch_buffer->data.f + 2 * n_cell * n_batch;
+  } else {
+    input_gate_scratch = scratch_buffer->data.f;
+    cell_scratch = scratch_buffer->data.f + n_cell * n_batch;
+    forget_gate_scratch = scratch_buffer->data.f + 2 * n_cell * n_batch;
+    output_gate_scratch = scratch_buffer->data.f + 3 * n_cell * n_batch;
+  }
+
+  // Check optional tensors, the respective pointers can be null.
+  int8_t* input_to_input_weights_ptr = nullptr;
+  float input_to_input_weights_scale = 1.0f;
+  int8_t* recurrent_to_input_weights_ptr = nullptr;
+  float recurrent_to_input_weights_scale = 1.0f;
+  float* input_gate_bias_ptr = nullptr;
+  if (!use_cifg) {
+    input_to_input_weights_ptr =
+        reinterpret_cast<int8_t*>(input_to_input_weights->data.uint8);
+    recurrent_to_input_weights_ptr =
+        reinterpret_cast<int8_t*>(recurrent_to_input_weights->data.uint8);
+    input_gate_bias_ptr = input_gate_bias->data.f;
+    input_to_input_weights_scale = input_to_input_weights->params.scale;
+    recurrent_to_input_weights_scale = recurrent_to_input_weights->params.scale;
+  }
+
+  int8_t* cell_to_input_weights_ptr = nullptr;
+  int8_t* cell_to_forget_weights_ptr = nullptr;
+  int8_t* cell_to_output_weights_ptr = nullptr;
+  float cell_to_input_weights_scale = 1.0f;
+  float cell_to_forget_weights_scale = 1.0f;
+  float cell_to_output_weights_scale = 1.0f;
+  if (use_peephole) {
+    if (!use_cifg) {
+      cell_to_input_weights_ptr =
+          reinterpret_cast<int8_t*>(cell_to_input_weights->data.uint8);
+      cell_to_input_weights_scale = cell_to_input_weights->params.scale;
+    }
+    cell_to_forget_weights_ptr =
+        reinterpret_cast<int8_t*>(cell_to_forget_weights->data.uint8);
+    cell_to_output_weights_ptr =
+        reinterpret_cast<int8_t*>(cell_to_output_weights->data.uint8);
+    cell_to_forget_weights_scale = cell_to_forget_weights->params.scale;
+    cell_to_output_weights_scale = cell_to_output_weights->params.scale;
+  }
+
+  const int8_t* projection_weights_ptr =
+      (projection_weights == nullptr)
+          ? nullptr
+          : reinterpret_cast<int8_t*>(projection_weights->data.uint8);
+  const float projection_weights_scale =
+      (projection_weights == nullptr) ? 1.0f : projection_weights->params.scale;
+  const float* projection_bias_ptr =
+      (projection_bias == nullptr) ? nullptr : projection_bias->data.f;
+
+  // Required tensors, pointers are non-null.
+  const float* input_ptr_batch = input->data.f;
+  const int8_t* input_to_forget_weights_ptr =
+      reinterpret_cast<int8_t*>(input_to_forget_weights->data.uint8);
+  const float input_to_forget_weights_scale =
+      input_to_forget_weights->params.scale;
+  const int8_t* input_to_cell_weights_ptr =
+      reinterpret_cast<int8_t*>(input_to_cell_weights->data.uint8);
+  const float input_to_cell_weights_scale = input_to_cell_weights->params.scale;
+  const int8_t* input_to_output_weights_ptr =
+      reinterpret_cast<int8_t*>(input_to_output_weights->data.uint8);
+  const float input_to_output_weights_scale =
+      input_to_output_weights->params.scale;
+  const int8_t* recurrent_to_forget_weights_ptr =
+      reinterpret_cast<int8_t*>(recurrent_to_forget_weights->data.uint8);
+  const float recurrent_to_forget_weights_scale =
+      recurrent_to_forget_weights->params.scale;
+  const int8_t* recurrent_to_cell_weights_ptr =
+      reinterpret_cast<int8_t*>(recurrent_to_cell_weights->data.uint8);
+  const float recurrent_to_cell_weights_scale =
+      recurrent_to_cell_weights->params.scale;
+  const int8_t* recurrent_to_output_weights_ptr =
+      reinterpret_cast<int8_t*>(recurrent_to_output_weights->data.uint8);
+  const float recurrent_to_output_weights_scale =
+      recurrent_to_output_weights->params.scale;
+  const float* forget_gate_bias_ptr = forget_gate_bias->data.f;
+  const float* cell_bias_ptr = cell_bias->data.f;
+  const float* output_gate_bias_ptr = output_gate_bias->data.f;
+
+  float* output_state_ptr = output_state->data.f;
+  float* cell_state_ptr = cell_state->data.f;
+  float* output_ptr_batch = output->data.f;
+
+  // Temporary storage for quantized values and scaling factors.
+  int8_t* quantized_input_ptr =
+      reinterpret_cast<int8_t*>(input_quantized->data.uint8);
+  int8_t* quantized_output_state_ptr =
+      reinterpret_cast<int8_t*>(output_state_quantized->data.uint8);
+  int8_t* quantized_cell_state_ptr =
+      reinterpret_cast<int8_t*>(cell_state_quantized->data.uint8);
+  float* scaling_factors_ptr = scaling_factors->data.f;
+  float* prod_scaling_factors_ptr = prod_scaling_factors->data.f;
+  float* recovered_cell_weights_ptr = recovered_cell_weights->data.f;
+
+  kernel_utils::LstmStep(
+      input_ptr_batch, input_to_input_weights_ptr, input_to_input_weights_scale,
+      input_to_forget_weights_ptr, input_to_forget_weights_scale,
+      input_to_cell_weights_ptr, input_to_cell_weights_scale,
+      input_to_output_weights_ptr, input_to_output_weights_scale,
+      recurrent_to_input_weights_ptr, recurrent_to_input_weights_scale,
+      recurrent_to_forget_weights_ptr, recurrent_to_forget_weights_scale,
+      recurrent_to_cell_weights_ptr, recurrent_to_cell_weights_scale,
+      recurrent_to_output_weights_ptr, recurrent_to_output_weights_scale,
+      cell_to_input_weights_ptr, cell_to_input_weights_scale,
+      cell_to_forget_weights_ptr, cell_to_forget_weights_scale,
+      cell_to_output_weights_ptr, cell_to_output_weights_scale,
+      input_gate_bias_ptr, forget_gate_bias_ptr, cell_bias_ptr,
+      output_gate_bias_ptr, projection_weights_ptr, projection_weights_scale,
+      projection_bias_ptr, params, n_batch, n_cell, n_input, n_output,
+      input_gate_scratch, forget_gate_scratch, cell_scratch,
+      output_gate_scratch, scaling_factors_ptr, prod_scaling_factors_ptr,
+      recovered_cell_weights_ptr, quantized_input_ptr,
+      quantized_output_state_ptr, quantized_cell_state_ptr, output_state_ptr,
+      cell_state_ptr, output_ptr_batch);
+
+  return kTfLiteOk;
+}
+// Gathers the op's tensors and dispatches to EvalFloat or EvalHybrid.
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  const auto* params = reinterpret_cast<TfLiteLSTMParams*>(node->builtin_data);
+  const TfLiteTensor* input = GetInput(context, node, kInputTensor);
+  // Input-to-gate weights; only the input-gate ones are fetched as optional.
+  const TfLiteTensor* input_to_input_weights =
+      GetOptionalInputTensor(context, node, kInputToInputWeightsTensor);
+  const TfLiteTensor* input_to_forget_weights =
+      GetInput(context, node, kInputToForgetWeightsTensor);
+  const TfLiteTensor* input_to_cell_weights =
+      GetInput(context, node, kInputToCellWeightsTensor);
+  const TfLiteTensor* input_to_output_weights =
+      GetInput(context, node, kInputToOutputWeightsTensor);
+  // Recurrent-to-gate weights; only the input-gate ones are fetched as optional.
+  const TfLiteTensor* recurrent_to_input_weights =
+      GetOptionalInputTensor(context, node, kRecurrentToInputWeightsTensor);
+  const TfLiteTensor* recurrent_to_forget_weights =
+      GetInput(context, node, kRecurrentToForgetWeightsTensor);
+  const TfLiteTensor* recurrent_to_cell_weights =
+      GetInput(context, node, kRecurrentToCellWeightsTensor);
+  const TfLiteTensor* recurrent_to_output_weights =
+      GetInput(context, node, kRecurrentToOutputWeightsTensor);
+  // Cell-to-gate weights; all three are fetched as optional tensors.
+  const TfLiteTensor* cell_to_input_weights =
+      GetOptionalInputTensor(context, node, kCellToInputWeightsTensor);
+  const TfLiteTensor* cell_to_forget_weights =
+      GetOptionalInputTensor(context, node, kCellToForgetWeightsTensor);
+  const TfLiteTensor* cell_to_output_weights =
+      GetOptionalInputTensor(context, node, kCellToOutputWeightsTensor);
+  // Gate biases; only the input-gate bias is fetched as optional.
+  const TfLiteTensor* input_gate_bias =
+      GetOptionalInputTensor(context, node, kInputGateBiasTensor);
+  const TfLiteTensor* forget_gate_bias =
+      GetInput(context, node, kForgetGateBiasTensor);
+  const TfLiteTensor* cell_bias = GetInput(context, node, kCellGateBiasTensor);
+  const TfLiteTensor* output_gate_bias =
+      GetInput(context, node, kOutputGateBiasTensor);
+  // Projection weights and bias; both are fetched as optional tensors.
+  const TfLiteTensor* projection_weights =
+      GetOptionalInputTensor(context, node, kProjectionWeightsTensor);
+  const TfLiteTensor* projection_bias =
+      GetOptionalInputTensor(context, node, kProjectionBiasTensor);
+
+  // Temporary 0 is the scratch buffer handed to both evaluation paths.
+  TfLiteTensor* scratch_buffer = GetTemporary(context, node, /*index=*/0);
+
+  TfLiteTensor* output_state = GetOutput(context, node, kOutputStateTensor);
+  TfLiteTensor* cell_state = GetOutput(context, node, kCellStateTensor);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+  // The type of input_to_output_weights alone selects the evaluation path.
+  // TODO(mirkov): add a check that weights are all uint8s or all floats.
+  switch (input_to_output_weights->type) {
+    case kTfLiteFloat32: {
+      return EvalFloat(input, input_to_input_weights, input_to_forget_weights,
+                       input_to_cell_weights, input_to_output_weights,
+                       recurrent_to_input_weights, recurrent_to_forget_weights,
+                       recurrent_to_cell_weights, recurrent_to_output_weights,
+                       cell_to_input_weights, cell_to_forget_weights,
+                       cell_to_output_weights, input_gate_bias,
+                       forget_gate_bias, cell_bias, output_gate_bias,
+                       projection_weights, projection_bias, params,
+                       scratch_buffer, output_state, cell_state, output);
+    }
+    case kTfLiteUInt8: {  // Hybrid path: additionally uses temporaries 1-6.
+      TfLiteTensor* input_quantized = GetTemporary(context, node, /*index=*/1);
+      TfLiteTensor* output_state_quantized =
+          GetTemporary(context, node, /*index=*/2);
+      TfLiteTensor* cell_state_quantized =
+          GetTemporary(context, node, /*index=*/3);
+      TfLiteTensor* scaling_factors = GetTemporary(context, node, /*index=*/4);
+      TfLiteTensor* prod_scaling_factors =
+          GetTemporary(context, node, /*index=*/5);
+      TfLiteTensor* recovered_cell_weights =
+          GetTemporary(context, node, /*index=*/6);
+      return EvalHybrid(
+          input, input_to_input_weights, input_to_forget_weights,
+          input_to_cell_weights, input_to_output_weights,
+          recurrent_to_input_weights, recurrent_to_forget_weights,
+          recurrent_to_cell_weights, recurrent_to_output_weights,
+          cell_to_input_weights, cell_to_forget_weights, cell_to_output_weights,
+          input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias,
+          projection_weights, projection_bias, params, scratch_buffer,
+          scaling_factors, prod_scaling_factors, recovered_cell_weights,
+          input_quantized, output_state_quantized, cell_state_quantized,
+          output_state, cell_state, output);
+    }
+    default:
+      context->ReportError(context, "Type %d is not currently supported.",
+                           input_to_output_weights->type);
+      return kTfLiteError;
+  }
+  return kTfLiteOk;  // Not reached: every switch branch above returns.
+}
+
 }  // namespace full
 
 // For basic kernel (5-inputs).
@@ -491,7 +805,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE(context, node->inputs->size == kInputNum);
   TF_LITE_ENSURE(context, node->outputs->size == kOutputNum);
 
-  // Only Float32 is supportted currently.
+  // Only Float32 is supported currently.
   // TODO(ycling): Implement quantize uint8 support.
   for (int index = 0; index < node->inputs->size; ++index) {
     TfLiteTensor* tensor = &context->tensors[node->inputs->data[index]];
diff --git a/tensorflow/contrib/lite/kernels/lstm_test.cc b/tensorflow/contrib/lite/kernels/lstm_test.cc
index d81220d8d3..6da29a4a92 100644
--- a/tensorflow/contrib/lite/kernels/lstm_test.cc
+++ b/tensorflow/contrib/lite/kernels/lstm_test.cc
@@ -14,7 +14,6 @@ limitations under the License.
 ==============================================================================*/
 // Unit test for TFLite LSTM op.
 
-#include <iomanip>
 #include <memory>
 #include <vector>
 
@@ -35,7 +34,8 @@ class LSTMOpModel : public SingleOpModel {
   LSTMOpModel(int n_batch, int n_input, int n_cell, int n_output, bool use_cifg,
               bool use_peephole, bool use_projection_weights,
               bool use_projection_bias, float cell_clip, float proj_clip,
-              const std::vector<std::vector<int>>& input_shapes)
+              const std::vector<std::vector<int>>& input_shapes,
+              const TensorType& weight_type = TensorType_FLOAT32)
       : n_batch_(n_batch),
         n_input_(n_input),
         n_cell_(n_cell),
@@ -45,31 +45,31 @@ class LSTMOpModel : public SingleOpModel {
     if (use_cifg) {
       input_to_input_weights_ = AddNullInput();
     } else {
-      input_to_input_weights_ = AddInput(TensorType_FLOAT32);
+      input_to_input_weights_ = AddInput(weight_type);
     }
 
-    input_to_forget_weights_ = AddInput(TensorType_FLOAT32);
-    input_to_cell_weights_ = AddInput(TensorType_FLOAT32);
-    input_to_output_weights_ = AddInput(TensorType_FLOAT32);
+    input_to_forget_weights_ = AddInput(weight_type);
+    input_to_cell_weights_ = AddInput(weight_type);
+    input_to_output_weights_ = AddInput(weight_type);
 
     if (use_cifg) {
       recurrent_to_input_weights_ = AddNullInput();
     } else {
-      recurrent_to_input_weights_ = AddInput(TensorType_FLOAT32);
+      recurrent_to_input_weights_ = AddInput(weight_type);
     }
 
-    recurrent_to_forget_weights_ = AddInput(TensorType_FLOAT32);
-    recurrent_to_cell_weights_ = AddInput(TensorType_FLOAT32);
-    recurrent_to_output_weights_ = AddInput(TensorType_FLOAT32);
+    recurrent_to_forget_weights_ = AddInput(weight_type);
+    recurrent_to_cell_weights_ = AddInput(weight_type);
+    recurrent_to_output_weights_ = AddInput(weight_type);
 
     if (use_peephole) {
       if (use_cifg) {
         cell_to_input_weights_ = AddNullInput();
       } else {
-        cell_to_input_weights_ = AddInput(TensorType_FLOAT32);
+        cell_to_input_weights_ = AddInput(weight_type);
       }
-      cell_to_forget_weights_ = AddInput(TensorType_FLOAT32);
-      cell_to_output_weights_ = AddInput(TensorType_FLOAT32);
+      cell_to_forget_weights_ = AddInput(weight_type);
+      cell_to_output_weights_ = AddInput(weight_type);
     } else {
       cell_to_input_weights_ = AddNullInput();
       cell_to_forget_weights_ = AddNullInput();
@@ -86,7 +86,7 @@ class LSTMOpModel : public SingleOpModel {
     output_gate_bias_ = AddInput(TensorType_FLOAT32);
 
     if (use_projection_weights) {
-      projection_weights_ = AddInput(TensorType_FLOAT32);
+      projection_weights_ = AddInput(weight_type);
       if (use_projection_bias) {
         projection_bias_ = AddInput(TensorType_FLOAT32);
       } else {
@@ -192,8 +192,9 @@ class LSTMOpModel : public SingleOpModel {
                    zero_buffer.get() + zero_buffer_size);
   }
 
-  void SetInput(int offset, float* begin, float* end) {
-    PopulateTensor(input_, offset, begin, end);
+  void SetInput(int offset, const float* begin, const float* end) {
+    PopulateTensor(input_, offset, const_cast<float*>(begin),
+                   const_cast<float*>(end));
   }
 
   std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
@@ -203,7 +204,7 @@ class LSTMOpModel : public SingleOpModel {
   int num_cells() { return n_cell_; }
   int num_batches() { return n_batch_; }
 
- private:
+ protected:
   int input_;
   int input_to_input_weights_;
   int input_to_forget_weights_;
@@ -237,7 +238,182 @@ class LSTMOpModel : public SingleOpModel {
   int n_output_;
 };
 
-TEST(LSTMOpTest, BlackBoxTestNoCifgNoPeepholeNoProjectionNoClipping) {
+class HybridLSTMOpModel : public LSTMOpModel {  // LSTM model, uint8 weights.
+ public:
+  HybridLSTMOpModel(int n_batch, int n_input, int n_cell, int n_output,
+                    bool use_cifg, bool use_peephole,
+                    bool use_projection_weights, bool use_projection_bias,
+                    float cell_clip, float proj_clip,
+                    const std::vector<std::vector<int>>& input_shapes)
+      : LSTMOpModel(n_batch, n_input, n_cell, n_output, use_cifg, use_peephole,
+                    use_projection_weights, use_projection_bias, cell_clip,
+                    proj_clip, input_shapes, TensorType_UINT8) {}
+  // Weight setters: each passes its floats to SymmetricQuantizeAndPopulate.
+  void SetInputToInputWeights(std::initializer_list<float> f) {
+    SymmetricQuantizeAndPopulate(input_to_input_weights_, f);
+  }
+
+  void SetInputToForgetWeights(std::initializer_list<float> f) {
+    SymmetricQuantizeAndPopulate(input_to_forget_weights_, f);
+  }
+
+  void SetInputToCellWeights(std::initializer_list<float> f) {
+    SymmetricQuantizeAndPopulate(input_to_cell_weights_, f);
+  }
+
+  void SetInputToOutputWeights(std::initializer_list<float> f) {
+    SymmetricQuantizeAndPopulate(input_to_output_weights_, f);
+  }
+
+  void SetRecurrentToInputWeights(std::initializer_list<float> f) {
+    SymmetricQuantizeAndPopulate(recurrent_to_input_weights_, f);
+  }
+
+  void SetRecurrentToForgetWeights(std::initializer_list<float> f) {
+    SymmetricQuantizeAndPopulate(recurrent_to_forget_weights_, f);
+  }
+
+  void SetRecurrentToCellWeights(std::initializer_list<float> f) {
+    SymmetricQuantizeAndPopulate(recurrent_to_cell_weights_, f);
+  }
+
+  void SetRecurrentToOutputWeights(std::initializer_list<float> f) {
+    SymmetricQuantizeAndPopulate(recurrent_to_output_weights_, f);
+  }
+
+  void SetCellToInputWeights(std::initializer_list<float> f) {
+    SymmetricQuantizeAndPopulate(cell_to_input_weights_, f);
+  }
+
+  void SetCellToForgetWeights(std::initializer_list<float> f) {
+    SymmetricQuantizeAndPopulate(cell_to_forget_weights_, f);
+  }
+
+  void SetCellToOutputWeights(std::initializer_list<float> f) {
+    SymmetricQuantizeAndPopulate(cell_to_output_weights_, f);
+  }
+
+  void SetProjectionWeights(std::initializer_list<float> f) {
+    SymmetricQuantizeAndPopulate(projection_weights_, f);
+  }
+};
+
+// Fixture base for the LSTM tests below: holds the model weights, inputs and
+// golden outputs that derived fixtures fill in from SetUp().
+class BaseLstmTest : public ::testing::Test {
+ protected:
+  // Weights of the LSTM model. Some are optional.
+  // NOTE(review): SetUp() assigns braced lists to these; the backing array of
+  // such a list is a temporary, so the members may dangle. TODO confirm.
+  std::initializer_list<float> input_to_input_weights_;
+  std::initializer_list<float> input_to_cell_weights_;
+  std::initializer_list<float> input_to_forget_weights_;
+  std::initializer_list<float> input_to_output_weights_;
+  std::initializer_list<float> input_gate_bias_;
+  std::initializer_list<float> cell_gate_bias_;
+  std::initializer_list<float> forget_gate_bias_;
+  std::initializer_list<float> output_gate_bias_;
+  std::initializer_list<float> recurrent_to_input_weights_;
+  std::initializer_list<float> recurrent_to_cell_weights_;
+  std::initializer_list<float> recurrent_to_forget_weights_;
+  std::initializer_list<float> recurrent_to_output_weights_;
+  std::initializer_list<float> cell_to_input_weights_;
+  std::initializer_list<float> cell_to_forget_weights_;
+  std::initializer_list<float> cell_to_output_weights_;
+  std::initializer_list<float> projection_weights_;
+
+  // LSTM input is stored as num_batch x num_inputs vector.
+  std::vector<std::vector<float>> lstm_input_;
+  // LSTM output is stored as num_batch x num_outputs vector.
+  std::vector<std::vector<float>> lstm_golden_output_;
+
+  // Feeds `input` to `lstm` one time step at a time and checks each step's
+  // output against the corresponding slice of `output`, up to `tolerance`.
+  void VerifyGoldens(const std::vector<std::vector<float>>& input,
+                     const std::vector<std::vector<float>>& output,
+                     LSTMOpModel* lstm, float tolerance = 1e-5) {
+    // Fatal checks: the code below indexes input[0] and divides by num_inputs.
+    const int num_batches = input.size();
+    ASSERT_GT(num_batches, 0);
+    ASSERT_GE(output.size(), input.size());
+    const int num_inputs = lstm->num_inputs();
+    ASSERT_GT(num_inputs, 0);
+    const int input_sequence_size = input[0].size() / num_inputs;
+    EXPECT_GT(input_sequence_size, 0);
+    for (int i = 0; i < input_sequence_size; ++i) {
+      for (int b = 0; b < num_batches; ++b) {
+        const float* batch_start = input[b].data() + i * num_inputs;
+        const float* batch_end = batch_start + num_inputs;
+
+        lstm->SetInput(b * lstm->num_inputs(), batch_start, batch_end);
+      }
+
+      lstm->Invoke();
+
+      // Concatenate the golden outputs of step i across all batches.
+      const int num_outputs = lstm->num_outputs();
+      std::vector<float> expected;
+      for (int b = 0; b < num_batches; ++b) {
+        const float* golden_start_batch = output[b].data() + i * num_outputs;
+        const float* golden_end_batch = golden_start_batch + num_outputs;
+        expected.insert(expected.end(), golden_start_batch, golden_end_batch);
+      }
+      EXPECT_THAT(lstm->GetOutput(),
+                  ElementsAreArray(ArrayFloatNear(expected, tolerance)));
+    }
+  }
+};
+
+class NoCifgNoPeepholeNoProjectionNoClippingLstmTest : public BaseLstmTest {
+  void SetUp() override {  // Fills the BaseLstmTest data members.
+    input_to_input_weights_ = {-0.45018822, -0.02338299, -0.0870589,
+                               -0.34550029, 0.04266912,  -0.15680569,
+                               -0.34856534, 0.43890524};
+    input_to_cell_weights_ = {-0.50013041, 0.1370284,  0.11810488, 0.2013163,
+                              -0.20583314, 0.44344562, 0.22077113, -0.29909778};
+    input_to_forget_weights_ = {0.09701663,  0.20334584,  -0.50592935,
+                                -0.31343272, -0.40032279, 0.44781327,
+                                0.01387155,  -0.35593212};
+    input_to_output_weights_ = {-0.25065863, -0.28290087, 0.04613829,
+                                0.40525138,  0.44272184,  0.03897077,
+                                -0.1556896,  0.19487578};
+    input_gate_bias_ = {0., 0., 0., 0.};
+    cell_gate_bias_ = {0., 0., 0., 0.};
+    forget_gate_bias_ = {1., 1., 1., 1.};
+    output_gate_bias_ = {0., 0., 0., 0.};
+    // Recurrent weights: 16 values per gate.
+    recurrent_to_input_weights_ = {
+        -0.0063535,  -0.2042388,  0.31454784,  -0.35746509,
+        0.28902304,  0.08183324,  -0.16555229, 0.02286911,
+        -0.13566875, 0.03034258,  0.48091322,  -0.12528998,
+        0.24077177,  -0.51332325, -0.33502164, 0.10629296};
+
+    recurrent_to_cell_weights_ = {
+        -0.3407414,  0.24443203,  -0.2078532,  0.26320225,
+        0.05695659,  -0.00123841, -0.4744786,  -0.35869038,
+        -0.06418842, -0.13502428, -0.501764,   0.22830659,
+        -0.46367589, 0.26016325,  -0.03894562, -0.16368064};
+
+    recurrent_to_forget_weights_ = {
+        -0.48684245, -0.06655136, 0.42224967,  0.2112639,
+        0.27654213,  0.20864892,  -0.07646349, 0.45877004,
+        0.00141793,  -0.14609534, 0.36447752,  0.09196436,
+        0.28053468,  0.01560611,  -0.20127171, -0.01140004};
+
+    recurrent_to_output_weights_ = {
+        0.43385774,  -0.17194885, 0.2718237,  0.09215671,
+        0.24107647,  -0.39835793, 0.18212086, 0.01301402,
+        0.48572797,  -0.50656658, 0.20047462, -0.20607421,
+        -0.51818722, -0.15390486, 0.0468148,  0.39922136};
+    // Single batch: 6 input values and 12 golden output values.
+    lstm_input_ = {{2., 3., 3., 4., 1., 1.}};
+    lstm_golden_output_ = {{-0.02973187, 0.1229473, 0.20885126, -0.15358765,
+                            -0.03716109, 0.12507336, 0.41193449, -0.20860538,
+                            -0.15053082, 0.09120187, 0.24278517, -0.12222792}};
+  }
+};
+
+TEST_F(NoCifgNoPeepholeNoProjectionNoClippingLstmTest, LstmBlackBoxTest) {
   const int n_batch = 1;
   const int n_input = 2;
   // n_cell and n_output have the same size when there is no projection.
@@ -257,10 +433,10 @@ TEST(LSTMOpTest, BlackBoxTestNoCifgNoPeepholeNoProjectionNoClipping) {
                        {n_cell, n_input},  // input_to_cell_weight tensor
                        {n_cell, n_input},  // input_to_output_weight tensor
 
-                       {n_cell, n_output},  // recurrent_to_input_weight tensor
-                       {n_cell, n_output},  // recurrent_to_forget_weight tensor
-                       {n_cell, n_output},  // recurrent_to_cell_weight tensor
-                       {n_cell, n_output},  // recurrent_to_output_weight tensor
+                       {n_cell, n_output},  // recurrent_to_input_weight_tensor
+                       {n_cell, n_output},  // recurrent_to_forget_weight_tensor
+                       {n_cell, n_output},  // recurrent_to_cell_weight_tensor
+                       {n_cell, n_output},  // recurrent_to_output_weight_tensor
 
                        {0},  // cell_to_input_weight tensor
                        {0},  // cell_to_forget_weight tensor
@@ -275,79 +451,137 @@ TEST(LSTMOpTest, BlackBoxTestNoCifgNoPeepholeNoProjectionNoClipping) {
                        {0},     // projection_bias tensor
                    });
 
-  lstm.SetInputToInputWeights({-0.45018822, -0.02338299, -0.0870589,
-                               -0.34550029, 0.04266912, -0.15680569,
-                               -0.34856534, 0.43890524});
-
-  lstm.SetInputToCellWeights({-0.50013041, 0.1370284, 0.11810488, 0.2013163,
-                              -0.20583314, 0.44344562, 0.22077113,
-                              -0.29909778});
-
-  lstm.SetInputToForgetWeights({0.09701663, 0.20334584, -0.50592935,
-                                -0.31343272, -0.40032279, 0.44781327,
-                                0.01387155, -0.35593212});
-
-  lstm.SetInputToOutputWeights({-0.25065863, -0.28290087, 0.04613829,
-                                0.40525138, 0.44272184, 0.03897077, -0.1556896,
-                                0.19487578});
+  lstm.SetInputToInputWeights(input_to_input_weights_);
+  lstm.SetInputToCellWeights(input_to_cell_weights_);
+  lstm.SetInputToForgetWeights(input_to_forget_weights_);
+  lstm.SetInputToOutputWeights(input_to_output_weights_);
 
-  lstm.SetInputGateBias({0., 0., 0., 0.});
+  lstm.SetInputGateBias(input_gate_bias_);
+  lstm.SetCellBias(cell_gate_bias_);
+  lstm.SetForgetGateBias(forget_gate_bias_);
+  lstm.SetOutputGateBias(output_gate_bias_);
 
-  lstm.SetCellBias({0., 0., 0., 0.});
+  lstm.SetRecurrentToInputWeights(recurrent_to_input_weights_);
+  lstm.SetRecurrentToCellWeights(recurrent_to_cell_weights_);
+  lstm.SetRecurrentToForgetWeights(recurrent_to_forget_weights_);
+  lstm.SetRecurrentToOutputWeights(recurrent_to_output_weights_);
 
-  lstm.SetForgetGateBias({1., 1., 1., 1.});
-
-  lstm.SetOutputGateBias({0., 0., 0., 0.});
-
-  lstm.SetRecurrentToInputWeights(
-      {-0.0063535, -0.2042388, 0.31454784, -0.35746509, 0.28902304, 0.08183324,
-       -0.16555229, 0.02286911, -0.13566875, 0.03034258, 0.48091322,
-       -0.12528998, 0.24077177, -0.51332325, -0.33502164, 0.10629296});
-
-  lstm.SetRecurrentToCellWeights(
-      {-0.3407414, 0.24443203, -0.2078532, 0.26320225, 0.05695659, -0.00123841,
-       -0.4744786, -0.35869038, -0.06418842, -0.13502428, -0.501764, 0.22830659,
-       -0.46367589, 0.26016325, -0.03894562, -0.16368064});
+  // Resetting cell_state and output_state
+  lstm.ResetCellState();
+  lstm.ResetOutputState();
 
-  lstm.SetRecurrentToForgetWeights(
-      {-0.48684245, -0.06655136, 0.42224967, 0.2112639, 0.27654213, 0.20864892,
-       -0.07646349, 0.45877004, 0.00141793, -0.14609534, 0.36447752, 0.09196436,
-       0.28053468, 0.01560611, -0.20127171, -0.01140004});
+  VerifyGoldens(lstm_input_, lstm_golden_output_, &lstm);
+}
 
-  lstm.SetRecurrentToOutputWeights(
-      {0.43385774, -0.17194885, 0.2718237, 0.09215671, 0.24107647, -0.39835793,
-       0.18212086, 0.01301402, 0.48572797, -0.50656658, 0.20047462, -0.20607421,
-       -0.51818722, -0.15390486, 0.0468148, 0.39922136});
+TEST_F(NoCifgNoPeepholeNoProjectionNoClippingLstmTest, HybridLstmBlackBoxTest) {
+  const int n_batch = 1;
+  const int n_input = 2;
+  // n_cell and n_output have the same size when there is no projection.
+  const int n_cell = 4;
+  const int n_output = 4;
 
-  static float lstm_input[] = {2., 3., 3., 4., 1., 1.};
-  static float lstm_golden_output[] = {-0.02973187, 0.1229473,   0.20885126,
-                                       -0.15358765, -0.03716109, 0.12507336,
-                                       0.41193449,  -0.20860538, -0.15053082,
-                                       0.09120187,  0.24278517,  -0.12222792};
+  HybridLSTMOpModel lstm(
+      n_batch, n_input, n_cell, n_output,
+      /*use_cifg=*/false, /*use_peephole=*/false,
+      /*use_projection_weights=*/false,
+      /*use_projection_bias=*/false, /*cell_clip=*/0.0, /*proj_clip=*/0.0,
+      {
+          {n_batch, n_input},  // input tensor
+
+          {n_cell, n_input},  // input_to_input_weight tensor
+          {n_cell, n_input},  // input_to_forget_weight tensor
+          {n_cell, n_input},  // input_to_cell_weight tensor
+          {n_cell, n_input},  // input_to_output_weight tensor
+
+          {n_cell, n_output},  // recurrent_to_input_weight tensor
+          {n_cell, n_output},  // recurrent_to_forget_weight tensor
+          {n_cell, n_output},  // recurrent_to_cell_weight tensor
+          {n_cell, n_output},  // recurrent_to_output_weight tensor
+
+          {0},  // cell_to_input_weight tensor
+          {0},  // cell_to_forget_weight tensor
+          {0},  // cell_to_output_weight tensor
+
+          {n_cell},  // input_gate_bias tensor
+          {n_cell},  // forget_gate_bias tensor
+          {n_cell},  // cell_bias tensor
+          {n_cell},  // output_gate_bias tensor
+
+          {0, 0},  // projection_weight tensor
+          {0},     // projection_bias tensor
+      });
+
+  lstm.SetInputToInputWeights(input_to_input_weights_);
+  lstm.SetInputToCellWeights(input_to_cell_weights_);
+  lstm.SetInputToForgetWeights(input_to_forget_weights_);
+  lstm.SetInputToOutputWeights(input_to_output_weights_);
+
+  lstm.SetInputGateBias(input_gate_bias_);
+  lstm.SetCellBias(cell_gate_bias_);
+  lstm.SetForgetGateBias(forget_gate_bias_);
+  lstm.SetOutputGateBias(output_gate_bias_);
+
+  lstm.SetRecurrentToInputWeights(recurrent_to_input_weights_);
+  lstm.SetRecurrentToCellWeights(recurrent_to_cell_weights_);
+  lstm.SetRecurrentToForgetWeights(recurrent_to_forget_weights_);
+  lstm.SetRecurrentToOutputWeights(recurrent_to_output_weights_);
 
   // Resetting cell_state and output_state
   lstm.ResetCellState();
   lstm.ResetOutputState();
 
-  const int input_sequence_size =
-      sizeof(lstm_input) / sizeof(float) / (lstm.num_inputs());
-  for (int i = 0; i < input_sequence_size; i++) {
-    float* batch0_start = lstm_input + i * lstm.num_inputs();
-    float* batch0_end = batch0_start + lstm.num_inputs();
+  VerifyGoldens(lstm_input_, lstm_golden_output_, &lstm,
+                /*tolerance=*/0.0157651);
+}
 
-    lstm.SetInput(0, batch0_start, batch0_end);
+class CifgNoPeepholeNoProjectionNoClippingLstmTest : public BaseLstmTest {
+  void SetUp() override {
+    input_to_cell_weights_ = {-0.49770179, -0.27711356, -0.09624726,
+                              0.05100781,  0.04717243,  0.48944736,
+                              -0.38535351, -0.17212132};
 
-    lstm.Invoke();
+    input_to_forget_weights_ = {-0.55291498, -0.42866567, 0.13056988,
+                                -0.3633365,  -0.22755712, 0.28253698,
+                                0.24407166,  0.33826375};
 
-    float* golden_start = lstm_golden_output + i * lstm.num_outputs();
-    float* golden_end = golden_start + lstm.num_outputs();
-    std::vector<float> expected;
-    expected.insert(expected.end(), golden_start, golden_end);
-    EXPECT_THAT(lstm.GetOutput(), ElementsAreArray(ArrayFloatNear(expected)));
+    input_to_output_weights_ = {0.10725588,  -0.02335852, -0.55932593,
+                                -0.09426838, -0.44257352, 0.54939759,
+                                0.01533556,  0.42751634};
+    cell_gate_bias_ = {0., 0., 0., 0.};
+    forget_gate_bias_ = {1., 1., 1., 1.};
+    output_gate_bias_ = {0., 0., 0., 0.};
+
+    recurrent_to_cell_weights_ = {
+        0.54066205,  -0.32668582, -0.43562764, -0.56094903,
+        0.42957711,  0.01841056,  -0.32764608, -0.33027974,
+        -0.10826075, 0.20675004,  0.19069612,  -0.03026325,
+        -0.54532051, 0.33003211,  0.44901288,  0.21193194};
+
+    recurrent_to_forget_weights_ = {
+        -0.13832897, -0.0515101,  -0.2359007, -0.16661474,
+        -0.14340827, 0.36986142,  0.23414481, 0.55899,
+        0.10798943,  -0.41174671, 0.17751795, -0.34484994,
+        -0.35874045, -0.11352962, 0.27268326, 0.54058349};
+
+    recurrent_to_output_weights_ = {
+        0.41613156, 0.42610586,  -0.16495961, -0.5663873,
+        0.30579174, -0.05115908, -0.33941799, 0.23364776,
+        0.11178309, 0.09481031,  -0.26424935, 0.46261835,
+        0.50248802, 0.26114327,  -0.43736315, 0.33149987};
+
+    cell_to_forget_weights_ = {0.47485286, -0.51955009, -0.24458408,
+                               0.31544167};
+    cell_to_output_weights_ = {-0.17135078, 0.82760304, 0.85573703,
+                               -0.77109635};
+
+    lstm_input_ = {{2., 3., 3., 4., 1., 1.}};
+    lstm_golden_output_ = {{-0.36444446, -0.00352185, 0.12886585, -0.05163646,
+                            -0.42312205, -0.01218222, 0.24201041, -0.08124574,
+                            -0.358325, -0.04621704, 0.21641694, -0.06471302}};
   }
-}
+};
 
-TEST(LSTMOpTest, BlackBoxTestWithCifgWithPeepholeNoProjectionNoClipping) {
+TEST_F(CifgNoPeepholeNoProjectionNoClippingLstmTest, LstmBlackBoxTest) {  // Loads the fixture's weights into `lstm` and checks it against the fixture's goldens.
   const int n_batch = 1;
   const int n_input = 2;
   // n_cell and n_output have the same size when there is no projection.
@@ -385,74 +619,689 @@ TEST(LSTMOpTest, BlackBoxTestWithCifgWithPeepholeNoProjectionNoClipping) {
                        {0},     // projection_bias tensor
                    });
 
-  lstm.SetInputToCellWeights({-0.49770179, -0.27711356, -0.09624726, 0.05100781,
-                              0.04717243, 0.48944736, -0.38535351,
-                              -0.17212132});
-
-  lstm.SetInputToForgetWeights({-0.55291498, -0.42866567, 0.13056988,
-                                -0.3633365, -0.22755712, 0.28253698, 0.24407166,
-                                0.33826375});
-
-  lstm.SetInputToOutputWeights({0.10725588, -0.02335852, -0.55932593,
-                                -0.09426838, -0.44257352, 0.54939759,
-                                0.01533556, 0.42751634});
-
-  lstm.SetCellBias({0., 0., 0., 0.});
+  lstm.SetInputToCellWeights(input_to_cell_weights_);
+  lstm.SetInputToForgetWeights(input_to_forget_weights_);
+  lstm.SetInputToOutputWeights(input_to_output_weights_);
 
-  lstm.SetForgetGateBias({1., 1., 1., 1.});
+  lstm.SetCellBias(cell_gate_bias_);
+  lstm.SetForgetGateBias(forget_gate_bias_);
+  lstm.SetOutputGateBias(output_gate_bias_);
 
-  lstm.SetOutputGateBias({0., 0., 0., 0.});
+  lstm.SetRecurrentToCellWeights(recurrent_to_cell_weights_);
+  lstm.SetRecurrentToForgetWeights(recurrent_to_forget_weights_);
+  lstm.SetRecurrentToOutputWeights(recurrent_to_output_weights_);
 
-  lstm.SetRecurrentToCellWeights(
-      {0.54066205, -0.32668582, -0.43562764, -0.56094903, 0.42957711,
-       0.01841056, -0.32764608, -0.33027974, -0.10826075, 0.20675004,
-       0.19069612, -0.03026325, -0.54532051, 0.33003211, 0.44901288,
-       0.21193194});
+  lstm.SetCellToForgetWeights(cell_to_forget_weights_);  // NOTE(review): cell_to_* weights are set and the replaced test was named "...WithPeephole...", yet the fixture name says "NoPeephole" -- confirm the fixture name.
+  lstm.SetCellToOutputWeights(cell_to_output_weights_);
 
-  lstm.SetRecurrentToForgetWeights(
-      {-0.13832897, -0.0515101, -0.2359007, -0.16661474, -0.14340827,
-       0.36986142, 0.23414481, 0.55899, 0.10798943, -0.41174671, 0.17751795,
-       -0.34484994, -0.35874045, -0.11352962, 0.27268326, 0.54058349});
+  // Reset cell_state and output_state before checking against the goldens.
+  lstm.ResetCellState();
+  lstm.ResetOutputState();
 
-  lstm.SetRecurrentToOutputWeights(
-      {0.41613156, 0.42610586, -0.16495961, -0.5663873, 0.30579174, -0.05115908,
-       -0.33941799, 0.23364776, 0.11178309, 0.09481031, -0.26424935, 0.46261835,
-       0.50248802, 0.26114327, -0.43736315, 0.33149987});
+  VerifyGoldens(lstm_input_, lstm_golden_output_, &lstm);  // No explicit tolerance argument here, unlike HybridLstmBlackBoxTest.
+}
 
-  lstm.SetCellToForgetWeights(
-      {0.47485286, -0.51955009, -0.24458408, 0.31544167});
-  lstm.SetCellToOutputWeights(
-      {-0.17135078, 0.82760304, 0.85573703, -0.77109635});
+TEST_F(CifgNoPeepholeNoProjectionNoClippingLstmTest, HybridLstmBlackBoxTest) {  // Runs the fixture's weights and goldens through HybridLSTMOpModel.
+  const int n_batch = 1;
+  const int n_input = 2;
+  // n_cell and n_output have the same size when there is no projection.
+  const int n_cell = 4;
+  const int n_output = 4;
 
-  static float lstm_input[] = {2., 3., 3., 4., 1., 1.};
-  static float lstm_golden_output[] = {-0.36444446, -0.00352185, 0.12886585,
-                                       -0.05163646, -0.42312205, -0.01218222,
-                                       0.24201041,  -0.08124574, -0.358325,
-                                       -0.04621704, 0.21641694,  -0.06471302};
+  HybridLSTMOpModel lstm(
+      n_batch, n_input, n_cell, n_output,
+      /*use_cifg=*/true, /*use_peephole=*/true,  // NOTE(review): peephole is enabled although the fixture name says "NoPeephole" -- confirm the fixture name.
+      /*use_projection_weights=*/false,
+      /*use_projection_bias=*/false,
+      /*cell_clip=*/0.0, /*proj_clip=*/0.0,
+      {
+          {n_batch, n_input},  // input tensor
+
+          {0, 0},             // input_to_input_weight tensor
+          {n_cell, n_input},  // input_to_forget_weight tensor
+          {n_cell, n_input},  // input_to_cell_weight tensor
+          {n_cell, n_input},  // input_to_output_weight tensor
+
+          {0, 0},              // recurrent_to_input_weight tensor
+          {n_cell, n_output},  // recurrent_to_forget_weight tensor
+          {n_cell, n_output},  // recurrent_to_cell_weight tensor
+          {n_cell, n_output},  // recurrent_to_output_weight tensor
+
+          {0},       // cell_to_input_weight tensor
+          {n_cell},  // cell_to_forget_weight tensor
+          {n_cell},  // cell_to_output_weight tensor
+
+          {0},       // input_gate_bias tensor
+          {n_cell},  // forget_gate_bias tensor
+          {n_cell},  // cell_bias tensor
+          {n_cell},  // output_gate_bias tensor
+
+          {0, 0},  // projection_weight tensor
+          {0},     // projection_bias tensor
+      });
+
+  lstm.SetInputToCellWeights(input_to_cell_weights_);
+  lstm.SetInputToForgetWeights(input_to_forget_weights_);
+  lstm.SetInputToOutputWeights(input_to_output_weights_);
+
+  lstm.SetCellBias(cell_gate_bias_);
+  lstm.SetForgetGateBias(forget_gate_bias_);
+  lstm.SetOutputGateBias(output_gate_bias_);
+
+  lstm.SetRecurrentToCellWeights(recurrent_to_cell_weights_);
+  lstm.SetRecurrentToForgetWeights(recurrent_to_forget_weights_);
+  lstm.SetRecurrentToOutputWeights(recurrent_to_output_weights_);
+
+  lstm.SetCellToForgetWeights(cell_to_forget_weights_);
+  lstm.SetCellToOutputWeights(cell_to_output_weights_);
 
   // Resetting cell_state and output_state
   lstm.ResetCellState();
   lstm.ResetOutputState();
 
-  const int input_sequence_size =
-      sizeof(lstm_input) / sizeof(float) / (lstm.num_inputs());
-  for (int i = 0; i < input_sequence_size; i++) {
-    float* batch0_start = lstm_input + i * lstm.num_inputs();
-    float* batch0_end = batch0_start + lstm.num_inputs();
-
-    lstm.SetInput(0, batch0_start, batch0_end);
-
-    lstm.Invoke();
+  VerifyGoldens(lstm_input_, lstm_golden_output_, &lstm, /*tolerance=*/0.03573);  // Explicit tolerance; LstmBlackBoxTest passes none. NOTE(review): presumably loosened for the hybrid model -- confirm how 0.03573 was chosen.
+}
 
-    float* golden_start = lstm_golden_output + i * lstm.num_outputs();
-    float* golden_end = golden_start + lstm.num_outputs();
-    std::vector<float> expected;
-    expected.insert(expected.end(), golden_start, golden_end);
-    EXPECT_THAT(lstm.GetOutput(), ElementsAreArray(ArrayFloatNear(expected)));
+class NoCifgPeepholeProjectionClippingLstmTest : public BaseLstmTest {
+  void SetUp() override {
+    input_to_input_weights_ = {
+        0.021393683,  0.06124551,    0.046905167,  -0.014657677,  -0.03149463,
+        0.09171803,   0.14647801,    0.10797193,   -0.0057968358, 0.0019193048,
+        -0.2726754,   0.10154029,    -0.018539885, 0.080349885,   -0.10262385,
+        -0.022599787, -0.09121155,   -0.008675967, -0.045206103,  -0.0821282,
+        -0.008045952, 0.015478081,   0.055217247,  0.038719587,   0.044153627,
+        -0.06453243,  0.05031825,    -0.046935108, -0.008164439,  0.014574226,
+        -0.1671009,   -0.15519552,   -0.16819797,  -0.13971269,   -0.11953059,
+        0.25005487,   -0.22790983,   0.009855087,  -0.028140958,  -0.11200698,
+        0.11295408,   -0.0035217577, 0.054485075,  0.05184695,    0.064711206,
+        0.10989193,   0.11674786,    0.03490607,   0.07727357,    0.11390585,
+        -0.1863375,   -0.1034451,    -0.13945189,  -0.049401227,  -0.18767063,
+        0.042483903,  0.14233552,    0.13832581,   0.18350165,    0.14545603,
+        -0.028545704, 0.024939531,   0.050929718,  0.0076203286,  -0.0029723682,
+        -0.042484224, -0.11827596,   -0.09171104,  -0.10808628,   -0.16327988,
+        -0.2273378,   -0.0993647,    -0.017155107, 0.0023917493,  0.049272764,
+        0.0038534778, 0.054764505,   0.089753784,  0.06947234,    0.08014476,
+        -0.04544234,  -0.0497073,    -0.07135631,  -0.048929106,  -0.004042012,
+        -0.009284026, 0.018042054,   0.0036860977, -0.07427302,   -0.11434604,
+        -0.018995456, 0.031487543,   0.012834908,  0.019977754,   0.044256654,
+        -0.39292613,  -0.18519334,   -0.11651281,  -0.06809892,   0.011373677};
+
+    input_to_forget_weights_ = {
+        -0.0018401089, -0.004852237, 0.03698424,    0.014181704,
+        0.028273236,   -0.016726194, -0.05249759,   -0.10204261,
+        0.00861066,    -0.040979505, -0.009899187,  0.01923892,
+        -0.028177269,  -0.08535103,  -0.14585495,   0.10662567,
+        -0.01909731,   -0.017883534, -0.0047269356, -0.045103323,
+        0.0030784295,  0.076784775,  0.07463696,    0.094531395,
+        0.0814421,     -0.12257899,  -0.033945758,  -0.031303465,
+        0.045630626,   0.06843887,   -0.13492945,   -0.012480007,
+        -0.0811829,    -0.07224499,  -0.09628791,   0.045100946,
+        0.0012300825,  0.013964662,  0.099372394,   0.02543059,
+        0.06958324,    0.034257296,  0.0482646,     0.06267997,
+        0.052625068,   0.12784666,   0.07077897,    0.025725935,
+        0.04165009,    0.07241905,   0.018668644,   -0.037377294,
+        -0.06277783,   -0.08833636,  -0.040120605,  -0.011405586,
+        -0.007808335,  -0.010301386, -0.005102167,  0.027717464,
+        0.05483423,    0.11449111,   0.11289652,    0.10939839,
+        0.13396506,    -0.08402166,  -0.01901462,   -0.044678304,
+        -0.07720565,   0.014350063,  -0.11757958,   -0.0652038,
+        -0.08185733,   -0.076754324, -0.092614375,  0.10405491,
+        0.052960336,   0.035755895,  0.035839386,   -0.012540553,
+        0.036881298,   0.02913376,   0.03420159,    0.05448447,
+        -0.054523353,  0.02582715,   0.02327355,    -0.011857179,
+        -0.0011980024, -0.034641717, -0.026125094,  -0.17582615,
+        -0.15923657,   -0.27486774,  -0.0006143371, 0.0001771948,
+        -8.470171e-05, 0.02651807,   0.045790765,   0.06956496};
+
+    input_to_cell_weights_ = {
+        -0.04580283,   -0.09549462,   -0.032418985,  -0.06454633,
+        -0.043528453,  0.043018587,   -0.049152344,  -0.12418144,
+        -0.078985475,  -0.07596889,   0.019484362,   -0.11434962,
+        -0.0074034138, -0.06314844,   -0.092981495,  0.0062155537,
+        -0.025034338,  -0.0028890965, 0.048929527,   0.06235075,
+        0.10665918,    -0.032036792,  -0.08505916,   -0.10843358,
+        -0.13002433,   -0.036816437,  -0.02130134,   -0.016518239,
+        0.0047691227,  -0.0025825808, 0.066017866,   0.029991534,
+        -0.10652836,   -0.1037554,    -0.13056071,   -0.03266643,
+        -0.033702414,  -0.006473424,  -0.04611692,   0.014419339,
+        -0.025174323,  0.0396852,     0.081777506,   0.06157468,
+        0.10210095,    -0.009658194,  0.046511717,   0.03603906,
+        0.0069369148,  0.015960095,   -0.06507666,   0.09551598,
+        0.053568836,   0.06408714,    0.12835667,    -0.008714329,
+        -0.20211966,   -0.12093674,   0.029450472,   0.2849013,
+        -0.029227901,  0.1164364,     -0.08560263,   0.09941786,
+        -0.036999565,  -0.028842626,  -0.0033637602, -0.017012902,
+        -0.09720865,   -0.11193351,   -0.029155117,  -0.017936034,
+        -0.009768936,  -0.04223324,   -0.036159635,  0.06505112,
+        -0.021742892,  -0.023377212,  -0.07221364,   -0.06430552,
+        0.05453865,    0.091149814,   0.06387331,    0.007518393,
+        0.055960953,   0.069779344,   0.046411168,   0.10509911,
+        0.07463894,    0.0075130584,  0.012850982,   0.04555431,
+        0.056955688,   0.06555285,    0.050801456,   -0.009862683,
+        0.00826772,    -0.026555609,  -0.0073611983, -0.0014897042};
+
+    input_to_output_weights_ = {
+        -0.0998932,   -0.07201956,  -0.052803773,  -0.15629593,  -0.15001918,
+        -0.07650751,  0.02359855,   -0.075155355,  -0.08037709,  -0.15093534,
+        0.029517552,  -0.04751393,  0.010350531,   -0.02664851,  -0.016839722,
+        -0.023121163, 0.0077019283, 0.012851257,   -0.05040649,  -0.0129761,
+        -0.021737747, -0.038305793, -0.06870586,   -0.01481247,  -0.001285394,
+        0.10124236,   0.083122835,  0.053313006,   -0.062235646, -0.075637154,
+        -0.027833903, 0.029774971,  0.1130802,     0.09218906,   0.09506135,
+        -0.086665764, -0.037162706, -0.038880914,  -0.035832845, -0.014481564,
+        -0.09825003,  -0.12048569,  -0.097665586,  -0.05287633,  -0.0964047,
+        -0.11366429,  0.035777505,  0.13568819,    0.052451383,  0.050649304,
+        0.05798951,   -0.021852335, -0.099848844,  0.014740475,  -0.078897946,
+        0.04974699,   0.014160473,  0.06973932,    0.04964942,   0.033364646,
+        0.08190124,   0.025535367,  0.050893165,   0.048514254,  0.06945813,
+        -0.078907564, -0.06707616,  -0.11844508,   -0.09986688,  -0.07509403,
+        0.06263226,   0.14925587,   0.20188436,    0.12098451,   0.14639415,
+        0.0015017595, -0.014267382, -0.03417257,   0.012711468,  0.0028300495,
+        -0.024758482, -0.05098548,  -0.0821182,    0.014225672,  0.021544158,
+        0.08949725,   0.07505268,   -0.0020780868, 0.04908258,   0.06476295,
+        -0.022907063, 0.027562456,  0.040185735,   0.019567577,  -0.015598739,
+        -0.049097303, -0.017121866, -0.083368234,  -0.02332002,  -0.0840956};
+
+    input_gate_bias_ = {0.02234832,   0.14757581,  0.18176508,  0.10380666,
+                        0.053110216,  -0.06928846, -0.13942584, -0.11816189,
+                        0.19483899,   0.03652339,  -0.10250295, 0.036714908,
+                        -0.18426876,  0.036065217, 0.21810818,  0.02383196,
+                        -0.043370757, 0.08690144,  -0.04444982, 0.00030581196};
+
+    forget_gate_bias_ = {0.035185695, -0.042891346, -0.03032477, 0.23027696,
+                         0.11098921,  0.15378423,   0.09263801,  0.09790885,
+                         0.09508917,  0.061199076,  0.07665568,  -0.015443159,
+                         -0.03499149, 0.046190713,  0.08895977,  0.10899629,
+                         0.40694186,  0.06030037,   0.012413437, -0.06108739};
+
+    cell_gate_bias_ = {-0.024379363, 0.0055531194, 0.23377132,   0.033463873,
+                       -0.1483596,   -0.10639995,  -0.091433935, 0.058573797,
+                       -0.06809782,  -0.07889636,  -0.043246906, -0.09829136,
+                       -0.4279842,   0.034901652,  0.18797937,   0.0075234566,
+                       0.016178843,  0.1749513,    0.13975595,   0.92058027};
+
+    output_gate_bias_ = {0.046159424, -0.0012809046, 0.03563469,   0.12648113,
+                         0.027195795, 0.35373217,    -0.018957434, 0.008907322,
+                         -0.0762701,  0.12018895,    0.04216877,   0.0022856654,
+                         0.040952638, 0.3147856,     0.08225149,   -0.057416286,
+                         -0.14995944, -0.008040261,  0.13208859,   0.029760877};
+
+    recurrent_to_input_weights_ = {
+        -0.001374326,   -0.078856036,   0.10672688,    0.029162422,
+        -0.11585556,    0.02557986,     -0.13446963,   -0.035785314,
+        -0.01244275,    0.025961924,    -0.02337298,   -0.044228926,
+        -0.055839065,   -0.046598054,   -0.010546039,  -0.06900766,
+        0.027239809,    0.022582639,    -0.013296484,  -0.05459212,
+        0.08981,        -0.045407712,   0.08682226,    -0.06867011,
+        -0.14390695,    -0.02916037,    0.000996957,   0.091420636,
+        0.14283475,     -0.07390571,    -0.06402044,   0.062524505,
+        -0.093129106,   0.04860203,     -0.08364217,   -0.08119002,
+        0.009352075,    0.22920375,     0.0016303885,  0.11583097,
+        -0.13732095,    0.012405723,    -0.07551853,   0.06343048,
+        0.12162708,     -0.031923793,   -0.014335606,  0.01790974,
+        -0.10650317,    -0.0724401,     0.08554849,    -0.05727212,
+        0.06556731,     -0.042729504,   -0.043227166,  0.011683251,
+        -0.013082158,   -0.029302018,   -0.010899579,  -0.062036745,
+        -0.022509435,   -0.00964907,    -0.01567329,   0.04260106,
+        -0.07787477,    -0.11576462,    0.017356863,   0.048673786,
+        -0.017577527,   -0.05527947,    -0.082487635,  -0.040137455,
+        -0.10820036,    -0.04666372,    0.022746278,   -0.07851417,
+        0.01068115,     0.032956902,    0.022433773,   0.0026891115,
+        0.08944216,     -0.0685835,     0.010513544,   0.07228705,
+        0.02032331,     -0.059686817,   -0.0005566496, -0.086984694,
+        0.040414046,    -0.1380399,     0.094208956,   -0.05722982,
+        0.012092817,    -0.04989123,    -0.086576,     -0.003399834,
+        -0.04696032,    -0.045747425,   0.10091314,    0.048676282,
+        -0.029037097,   0.031399418,    -0.0040285117, 0.047237843,
+        0.09504992,     0.041799378,    -0.049185462,  -0.031518843,
+        -0.10516937,    0.026374253,    0.10058866,    -0.0033195973,
+        -0.041975245,   0.0073591834,   0.0033782164,  -0.004325073,
+        -0.10167381,    0.042500053,    -0.01447153,   0.06464186,
+        -0.017142897,   0.03312627,     0.009205989,   0.024138335,
+        -0.011337001,   0.035530265,    -0.010912711,  0.0706555,
+        -0.005894094,   0.051841937,    -0.1401738,    -0.02351249,
+        0.0365468,      0.07590991,     0.08838724,    0.021681072,
+        -0.10086113,    0.019608743,    -0.06195883,   0.077335775,
+        0.023646897,    -0.095322326,   0.02233014,    0.09756986,
+        -0.048691444,   -0.009579111,   0.07595467,    0.11480546,
+        -0.09801813,    0.019894179,    0.08502348,    0.004032281,
+        0.037211012,    0.068537936,    -0.048005626,  -0.091520436,
+        -0.028379958,   -0.01556313,    0.06554592,    -0.045599163,
+        -0.01672207,    -0.020169014,   -0.011877351,  -0.20212261,
+        0.010889619,    0.0047078193,   0.038385306,   0.08540671,
+        -0.017140968,   -0.0035865551,  0.016678626,   0.005633034,
+        0.015963363,    0.00871737,     0.060130805,   0.028611384,
+        0.10109069,     -0.015060172,   -0.07894427,   0.06401885,
+        0.011584063,    -0.024466386,   0.0047652307,  -0.09041358,
+        0.030737216,    -0.0046374933,  0.14215417,    -0.11823516,
+        0.019899689,    0.006106124,    -0.027092824,  0.0786356,
+        0.05052217,     -0.058925,      -0.011402121,  -0.024987547,
+        -0.0013661642,  -0.06832946,    -0.015667673,  -0.1083353,
+        -0.00096863037, -0.06988685,    -0.053350925,  -0.027275559,
+        -0.033664223,   -0.07978348,    -0.025200296,  -0.017207067,
+        -0.058403496,   -0.055697463,   0.005798788,   0.12965427,
+        -0.062582195,   0.0013350133,   -0.10482091,   0.0379771,
+        0.072521195,    -0.0029455067,  -0.13797039,   -0.03628521,
+        0.013806405,    -0.017858358,   -0.01008298,   -0.07700066,
+        -0.017081132,   0.019358726,    0.0027079724,  0.004635139,
+        0.062634714,    -0.02338735,    -0.039547626,  -0.02050681,
+        0.03385117,     -0.083611414,   0.002862572,   -0.09421313,
+        0.058618143,    -0.08598433,    0.00972939,    0.023867095,
+        -0.053934585,   -0.023203006,   0.07452513,    -0.048767887,
+        -0.07314807,    -0.056307215,   -0.10433547,   -0.06440842,
+        0.04328182,     0.04389765,     -0.020006588,  -0.09076438,
+        -0.11652589,    -0.021705797,   0.03345259,    -0.010329105,
+        -0.025767034,   0.013057034,    -0.07316461,   -0.10145612,
+        0.06358255,     0.18531723,     0.07759293,    0.12006465,
+        0.1305557,      0.058638252,    -0.03393652,   0.09622831,
+        -0.16253184,    -2.4580743e-06, 0.079869635,   -0.070196845,
+        -0.005644518,   0.06857898,     -0.12598175,   -0.035084512,
+        0.03156317,     -0.12794146,    -0.031963028,  0.04692781,
+        0.030070418,    0.0071660685,   -0.095516115,  -0.004643372,
+        0.040170413,    -0.062104587,   -0.0037324072, 0.0554317,
+        0.08184801,     -0.019164372,   0.06791302,    0.034257166,
+        -0.10307039,    0.021943003,    0.046745934,   0.0790918,
+        -0.0265588,     -0.007824208,   0.042546265,   -0.00977924,
+        -0.0002440307,  -0.017384544,   -0.017990116,  0.12252321,
+        -0.014512694,   -0.08251313,    0.08861942,    0.13589665,
+        0.026351685,    0.012641483,    0.07466548,    0.044301085,
+        -0.045414884,   -0.051112458,   0.03444247,    -0.08502782,
+        -0.04106223,    -0.028126027,   0.028473156,   0.10467447};
+
+    recurrent_to_cell_weights_ = {
+        -0.037322544,   0.018592842,   0.0056175636,  -0.06253426,
+        0.055647098,    -0.05713207,   -0.05626563,   0.005559383,
+        0.03375411,     -0.025757805,  -0.088049285,  0.06017052,
+        -0.06570978,    0.007384076,   0.035123326,   -0.07920549,
+        0.053676967,    0.044480428,   -0.07663568,   0.0071805613,
+        0.08089997,     0.05143358,    0.038261272,   0.03339287,
+        -0.027673481,   0.044746667,   0.028349208,   0.020090483,
+        -0.019443132,   -0.030755889,  -0.0040000007, 0.04465846,
+        -0.021585021,   0.0031670958,  0.0053199246,  -0.056117613,
+        -0.10893326,    0.076739706,   -0.08509834,   -0.027997585,
+        0.037871376,    0.01449768,    -0.09002357,   -0.06111149,
+        -0.046195522,   0.0422062,     -0.005683705,  -0.1253618,
+        -0.012925729,   -0.04890792,   0.06985068,    0.037654128,
+        0.03398274,     -0.004781977,  0.007032333,   -0.031787455,
+        0.010868644,    -0.031489216,  0.09525667,    0.013939797,
+        0.0058680447,   0.0167067,     0.02668468,    -0.04797466,
+        -0.048885044,   -0.12722108,   0.035304096,   0.06554885,
+        0.00972396,     -0.039238118,  -0.05159735,   -0.11329045,
+        0.1613692,      -0.03750952,   0.06529313,    -0.071974665,
+        -0.11769596,    0.015524369,   -0.0013754242, -0.12446318,
+        0.02786344,     -0.014179351,  0.005264273,   0.14376344,
+        0.015983658,    0.03406988,    -0.06939408,   0.040699873,
+        0.02111075,     0.09669095,    0.041345075,   -0.08316494,
+        -0.07684199,    -0.045768797,  0.032298047,   -0.041805092,
+        0.0119405,      0.0061010392,  0.12652606,    0.0064572375,
+        -0.024950314,   0.11574242,    0.04508852,    -0.04335324,
+        0.06760663,     -0.027437469,  0.07216407,    0.06977076,
+        -0.05438599,    0.034033038,   -0.028602652,  0.05346137,
+        0.043184172,    -0.037189785,  0.10420091,    0.00882477,
+        -0.054019816,   -0.074273005,  -0.030617684,  -0.0028467078,
+        0.024302477,    -0.0038869337, 0.005332455,   0.0013399826,
+        0.04361412,     -0.007001822,  0.09631092,    -0.06702025,
+        -0.042049985,   -0.035070654,  -0.04103342,   -0.10273396,
+        0.0544271,      0.037184782,   -0.13150354,   -0.0058036847,
+        -0.008264958,   0.042035464,   0.05891794,    0.029673764,
+        0.0063542654,   0.044788733,   0.054816857,   0.062257513,
+        -0.00093483756, 0.048938446,   -0.004952862,  -0.007730018,
+        -0.04043371,    -0.017094059,  0.07229206,    -0.023670016,
+        -0.052195564,   -0.025616996,  -0.01520939,   0.045104615,
+        -0.007376126,   0.003533447,   0.006570588,   0.056037236,
+        0.12436656,     0.051817212,   0.028532185,   -0.08686856,
+        0.11868599,     0.07663395,    -0.07323171,   0.03463402,
+        -0.050708205,   -0.04458982,   -0.11590894,   0.021273347,
+        0.1251325,      -0.15313013,   -0.12224372,   0.17228661,
+        0.023029093,    0.086124025,   0.006445803,   -0.03496501,
+        0.028332196,    0.04449512,    -0.042436164,  -0.026587414,
+        -0.006041347,   -0.09292539,   -0.05678812,   0.03897832,
+        0.09465633,     0.008115513,   -0.02171956,   0.08304309,
+        0.071401566,    0.019622514,   0.032163795,   -0.004167056,
+        0.02295182,     0.030739572,   0.056506045,   0.004612461,
+        0.06524936,     0.059999723,   0.046395954,   -0.0045512207,
+        -0.1335546,     -0.030136576,  0.11584653,    -0.014678886,
+        0.0020118146,   -0.09688814,   -0.0790206,    0.039770417,
+        -0.0329582,     0.07922767,    0.029322514,   0.026405897,
+        0.04207835,     -0.07073373,   0.063781224,   0.0859677,
+        -0.10925287,    -0.07011058,   0.048005477,   0.03438226,
+        -0.09606514,    -0.006669445,  -0.043381985,  0.04240257,
+        -0.06955775,    -0.06769346,   0.043903265,   -0.026784198,
+        -0.017840602,   0.024307009,   -0.040079936,  -0.019946516,
+        0.045318738,    -0.12233574,   0.026170589,   0.0074471775,
+        0.15978073,     0.10185836,    0.10298046,    -0.015476589,
+        -0.039390966,   -0.072174534,  0.0739445,     -0.1211869,
+        -0.0347889,     -0.07943156,   0.014809798,   -0.12412325,
+        -0.0030663363,  0.039695457,   0.0647603,     -0.08291318,
+        -0.018529687,   -0.004423833,  0.0037507233,  0.084633216,
+        -0.01514876,    -0.056505352,  -0.012800942,  -0.06994386,
+        0.012962922,    -0.031234352,  0.07029052,    0.016418684,
+        0.03618972,     0.055686004,   -0.08663945,   -0.017404709,
+        -0.054761406,   0.029065743,   0.052404847,   0.020238016,
+        0.0048197987,   -0.0214882,    0.07078733,    0.013016777,
+        0.06262858,     0.009184685,   0.020785125,   -0.043904778,
+        -0.0270329,     -0.03299152,   -0.060088247,  -0.015162964,
+        -0.001828936,   0.12642565,    -0.056757294,  0.013586685,
+        0.09232601,     -0.035886683,  0.06000002,    0.05229691,
+        -0.052580316,   -0.082029596,  -0.010794592,  0.012947712,
+        -0.036429964,   -0.085508935,  -0.13127148,   -0.017744139,
+        0.031502828,    0.036232427,   -0.031581745,  0.023051167,
+        -0.05325106,    -0.03421577,   0.028793324,   -0.034633752,
+        -0.009881397,   -0.043551125,  -0.018609839,  0.0019097115,
+        -0.008799762,   0.056595087,   0.0022273948,  0.055752404};
+
+    recurrent_to_forget_weights_ = {
+        -0.057784554,  -0.026057621,  -0.068447545,   -0.022581743,
+        0.14811787,    0.10826372,    0.09471067,     0.03987225,
+        -0.0039523416, 0.00030638507, 0.053185795,    0.10572994,
+        0.08414449,    -0.022036452,  -0.00066928595, -0.09203576,
+        0.032950465,   -0.10985798,   -0.023809856,   0.0021431844,
+        -0.02196096,   -0.00326074,   0.00058621005,  -0.074678116,
+        -0.06193199,   0.055729095,   0.03736828,     0.020123724,
+        0.061878487,   -0.04729229,   0.034919553,    -0.07585433,
+        -0.04421272,   -0.044019096,  0.085488975,    0.04058006,
+        -0.06890133,   -0.030951202,  -0.024628663,   -0.07672815,
+        0.034293607,   0.08556707,    -0.05293577,    -0.033561368,
+        -0.04899627,   0.0241671,     0.015736353,    -0.095442444,
+        -0.029564252,  0.016493602,   -0.035026584,   0.022337519,
+        -0.026871363,  0.004780428,   0.0077918363,   -0.03601621,
+        0.016435321,   -0.03263031,   -0.09543275,    -0.047392778,
+        0.013454138,   0.028934088,   0.01685226,     -0.086110644,
+        -0.046250615,  -0.01847454,   0.047608484,    0.07339695,
+        0.034546845,   -0.04881143,   0.009128804,    -0.08802852,
+        0.03761666,    0.008096139,   -0.014454086,   0.014361001,
+        -0.023502491,  -0.0011840804, -0.07607001,    0.001856849,
+        -0.06509276,   -0.006021153,  -0.08570962,    -0.1451793,
+        0.060212336,   0.055259194,   0.06974018,     0.049454916,
+        -0.027794661,  -0.08077226,   -0.016179763,   0.1169753,
+        0.17213494,    -0.0056326236, -0.053934924,   -0.0124349,
+        -0.11520337,   0.05409887,    0.088759385,    0.0019655675,
+        0.0042065294,  0.03881498,    0.019844765,    0.041858196,
+        -0.05695512,   0.047233116,   0.038937137,    -0.06542224,
+        0.014429736,   -0.09719407,   0.13908425,     -0.05379757,
+        0.012321099,   0.082840554,   -0.029899208,   0.044217527,
+        0.059855383,   0.07711018,    -0.045319796,   0.0948846,
+        -0.011724666,  -0.0033288454, -0.033542685,   -0.04764985,
+        -0.13873616,   0.040668588,   0.034832682,    -0.015319203,
+        -0.018715994,  0.046002675,   0.0599172,      -0.043107376,
+        0.0294216,     -0.002314414,  -0.022424703,   0.0030315618,
+        0.0014641669,  0.0029166266,  -0.11878115,    0.013738511,
+        0.12375372,    -0.0006038222, 0.029104086,    0.087442465,
+        0.052958444,   0.07558703,    0.04817258,     0.044462286,
+        -0.015213451,  -0.08783778,   -0.0561384,     -0.003008196,
+        0.047060397,   -0.002058388,  0.03429439,     -0.018839769,
+        0.024734668,   0.024614193,   -0.042046934,   0.09597743,
+        -0.0043254104, 0.04320769,    0.0064070094,   -0.0019131786,
+        -0.02558259,   -0.022822596,  -0.023273505,   -0.02464396,
+        -0.10991725,   -0.006240552,  0.0074488563,   0.024044557,
+        0.04383914,    -0.046476185,  0.028658995,    0.060410924,
+        0.050786525,   0.009452605,   -0.0073054377,  -0.024810238,
+        0.0052906186,  0.0066939713,  -0.0020913032,  0.014515517,
+        0.015898481,   0.021362653,   -0.030262267,   0.016587038,
+        -0.011442813,  0.041154444,   -0.007631438,   -0.03423484,
+        -0.010977775,  0.036152758,   0.0066366293,   0.11915515,
+        0.02318443,    -0.041350313,  0.021485701,    -0.10906167,
+        -0.028218046,  -0.00954771,   0.020531068,    -0.11995105,
+        -0.03672871,   0.024019798,   0.014255957,    -0.05221243,
+        -0.00661567,   -0.04630967,   0.033188973,    0.10107534,
+        -0.014027541,  0.030796422,   -0.10270911,    -0.035999842,
+        0.15443139,    0.07684145,    0.036571592,    -0.035900835,
+        -0.0034699554, 0.06209149,    0.015920248,    -0.031122351,
+        -0.03858649,   0.01849943,    0.13872518,     0.01503974,
+        0.069941424,   -0.06948533,   -0.0088794185,  0.061282158,
+        -0.047401894,  0.03100163,    -0.041533746,   -0.10430945,
+        0.044574402,   -0.01425562,   -0.024290353,   0.034563623,
+        0.05866852,    0.023947537,   -0.09445152,    0.035450947,
+        0.02247216,    -0.0042998926, 0.061146557,    -0.10250651,
+        0.020881841,   -0.06747029,   0.10062043,     -0.0023941975,
+        0.03532124,    -0.016341697,  0.09685456,     -0.016764693,
+        0.051808182,   0.05875331,    -0.04536488,    0.001626336,
+        -0.028892258,  -0.01048663,   -0.009793449,   -0.017093895,
+        0.010987891,   0.02357273,    -0.00010856845, 0.0099760275,
+        -0.001845119,  -0.03551521,   0.0018358806,   0.05763657,
+        -0.01769146,   0.040995963,   0.02235177,     -0.060430344,
+        0.11475477,    -0.023854522,  0.10071741,     0.0686208,
+        -0.014250481,  0.034261297,   0.047418304,    0.08562733,
+        -0.030519066,  0.0060542435,  0.014653856,    -0.038836084,
+        0.04096551,    0.032249358,   -0.08355519,    -0.026823482,
+        0.056386515,   -0.010401743,  -0.028396193,   0.08507674,
+        0.014410365,   0.020995233,   0.17040324,     0.11511526,
+        0.02459721,    0.0066619175,  0.025853224,    -0.023133837,
+        -0.081302024,  0.017264642,   -0.009585969,   0.09491168,
+        -0.051313367,  0.054532815,   -0.014298593,   0.10657464,
+        0.007076659,   0.10964551,    0.0409152,      0.008275321,
+        -0.07283536,   0.07937492,    0.04192024,     -0.1075027};
+
+    recurrent_to_output_weights_ = {
+        0.025825322,   -0.05813119,   0.09495884,     -0.045984812,
+        -0.01255415,   -0.0026479573, -0.08196161,    -0.054914974,
+        -0.0046604523, -0.029587349,  -0.044576716,   -0.07480124,
+        -0.082868785,  0.023254942,   0.027502948,    -0.0039728214,
+        -0.08683098,   -0.08116779,   -0.014675607,   -0.037924774,
+        -0.023314456,  -0.007401714,  -0.09255757,    0.029460307,
+        -0.08829125,   -0.005139627,  -0.08989442,    -0.0555066,
+        0.13596267,    -0.025062224,  -0.048351806,   -0.03850004,
+        0.07266485,    -0.022414139,  0.05940088,     0.075114764,
+        0.09597592,    -0.010211725,  -0.0049794707,  -0.011523867,
+        -0.025980417,  0.072999895,   0.11091378,     -0.081685916,
+        0.014416728,   0.043229222,   0.034178585,    -0.07530371,
+        0.035837382,   -0.085607,     -0.007721233,   -0.03287832,
+        -0.043848954,  -0.06404588,   -0.06632928,    -0.073643476,
+        0.008214239,   -0.045984086,  0.039764922,    0.03474462,
+        0.060612556,   -0.080590084,  0.049127717,    0.04151091,
+        -0.030063879,  0.008801774,   -0.023021035,   -0.019558564,
+        0.05158114,    -0.010947698,  -0.011825728,   0.0075720972,
+        0.0699727,     -0.0039981045, 0.069350146,    0.08799282,
+        0.016156472,   0.035502106,   0.11695009,     0.006217345,
+        0.13392477,    -0.037875112,  0.025745004,    0.08940699,
+        -0.00924166,   0.0046702605,  -0.036598757,   -0.08811812,
+        0.10522024,    -0.032441203,  0.008176899,    -0.04454919,
+        0.07058152,    0.0067963637,  0.039206743,    0.03259838,
+        0.03725492,    -0.09515802,   0.013326398,    -0.052055415,
+        -0.025676316,  0.03198509,    -0.015951829,   -0.058556724,
+        0.036879618,   0.043357447,   0.028362012,    -0.05908629,
+        0.0059240665,  -0.04995891,   -0.019187413,   0.0276265,
+        -0.01628143,   0.0025863599,  0.08800015,     0.035250366,
+        -0.022165963,  -0.07328642,   -0.009415526,   -0.07455109,
+        0.11690406,    0.0363299,     0.07411125,     0.042103454,
+        -0.009660886,  0.019076364,   0.018299393,    -0.046004917,
+        0.08891175,    0.0431396,     -0.026327137,   -0.051502608,
+        0.08979574,    -0.051670972,  0.04940282,     -0.07491107,
+        -0.021240504,  0.022596184,   -0.034280192,   0.060163025,
+        -0.058211457,  -0.051837247,  -0.01349775,    -0.04639988,
+        -0.035936575,  -0.011681591,  0.064818054,    0.0073146066,
+        -0.021745546,  -0.043124277,  -0.06471268,    -0.07053354,
+        -0.029321948,  -0.05330136,   0.016933719,    -0.053782392,
+        0.13747959,    -0.1361751,    -0.11569455,    0.0033329215,
+        0.05693899,    -0.053219706,  0.063698,       0.07977434,
+        -0.07924483,   0.06936997,    0.0034815092,   -0.007305279,
+        -0.037325785,  -0.07251102,   -0.033633437,   -0.08677009,
+        0.091591336,   -0.14165086,   0.021752775,    0.019683983,
+        0.0011612234,  -0.058154266,  0.049996935,    0.0288841,
+        -0.0024567875, -0.14345716,   0.010955264,    -0.10234828,
+        0.1183656,     -0.0010731248, -0.023590032,   -0.072285876,
+        -0.0724771,    -0.026382286,  -0.0014920527,  0.042667855,
+        0.0018776858,  0.02986552,    0.009814309,    0.0733756,
+        0.12289186,    0.018043943,   -0.0458958,     0.049412545,
+        0.033632483,   0.05495232,    0.036686596,    -0.013781798,
+        -0.010036754,  0.02576849,    -0.08307328,    0.010112348,
+        0.042521734,   -0.05869831,   -0.071689695,   0.03876447,
+        -0.13275425,   -0.0352966,    -0.023077697,   0.10285965,
+        0.084736146,   0.15568255,    -0.00040734606, 0.027835453,
+        -0.10292561,   -0.032401145,  0.10053256,     -0.026142767,
+        -0.08271222,   -0.0030240538, -0.016368777,   0.1070414,
+        0.042672627,   0.013456989,   -0.0437609,     -0.022309763,
+        0.11576483,    0.04108048,    0.061026827,    -0.0190714,
+        -0.0869359,    0.037901703,   0.0610107,      0.07202949,
+        0.01675338,    0.086139716,   -0.08795751,    -0.014898893,
+        -0.023771819,  -0.01965048,   0.007955471,    -0.043740474,
+        0.03346837,    -0.10549954,   0.090567775,    0.042013682,
+        -0.03176985,   0.12569028,    -0.02421228,    -0.029526481,
+        0.023851605,   0.031539805,   0.05292009,     -0.02344001,
+        -0.07811758,   -0.08834428,   0.10094801,     0.16594367,
+        -0.06861939,   -0.021256343,  -0.041093912,   -0.06669611,
+        0.035498552,   0.021757556,   -0.09302526,    -0.015403468,
+        -0.06614931,   -0.051798206,  -0.013874718,   0.03630673,
+        0.010412845,   -0.08077351,   0.046185967,    0.0035662893,
+        0.03541868,    -0.094149634,  -0.034814864,   0.003128424,
+        -0.020674974,  -0.03944324,   -0.008110165,   -0.11113267,
+        0.08484226,    0.043586485,   0.040582247,    0.0968012,
+        -0.065249965,  -0.028036479,  0.0050708856,   0.0017462453,
+        0.0326779,     0.041296225,   0.09164146,     -0.047743853,
+        -0.015952192,  -0.034451712,  0.084197424,    -0.05347844,
+        -0.11768019,   0.085926116,   -0.08251791,    -0.045081906,
+        0.0948852,     0.068401024,   0.024856757,    0.06978981,
+        -0.057309967,  -0.012775832,  -0.0032452994,  0.01977615,
+        -0.041040014,  -0.024264973,  0.063464895,    0.05431621,
+    };
+
+    cell_to_input_weights_ = {
+        0.040369894, 0.030746894,  0.24704495,  0.018586371,  -0.037586458,
+        -0.15312155, -0.11812848,  -0.11465643, 0.20259799,   0.11418174,
+        -0.10116027, -0.011334949, 0.12411352,  -0.076769054, -0.052169047,
+        0.21198851,  -0.38871562,  -0.09061183, -0.09683246,  -0.21929175};
+
+    cell_to_forget_weights_ = {
+        -0.01998659,  -0.15568835,  -0.24248174,   -0.012770197, 0.041331276,
+        -0.072311886, -0.052123554, -0.0066330447, -0.043891653, 0.036225766,
+        -0.047248036, 0.021479502,  0.033189066,   0.11952997,   -0.020432774,
+        0.64658105,   -0.06650122,  -0.03467612,   0.095340036,  0.23647355};
+
+    cell_to_output_weights_ = {
+        0.08286371,  -0.08261836, -0.51210177, 0.002913762, 0.17764764,
+        -0.5495371,  -0.08460716, -0.24552552, 0.030037103, 0.04123544,
+        -0.11940523, 0.007358328, 0.1890978,   0.4833202,   -0.34441817,
+        0.36312827,  -0.26375428, 0.1457655,   -0.19724406, 0.15548733};
+
+    projection_weights_ = {
+        -0.009802181, 0.09401916,   0.0717386,     -0.13895074,
+        0.09641832,   0.060420845,  0.08539281,    0.054285463,
+        0.061395317,  0.034448683,  -0.042991187,  0.019801661,
+        -0.16840284,  -0.015726732, -0.23041931,   -0.024478018,
+        -0.10959692,  -0.013875541, 0.18600968,    -0.061274476,
+        0.0138165,    -0.08160894,  -0.07661644,   0.032372914,
+        0.16169067,   0.22465782,   -0.03993472,   -0.004017731,
+        0.08633481,   -0.28869787,  0.08682067,    0.17240396,
+        0.014975425,  0.056431185,  0.031037588,   0.16702051,
+        0.0077946745, 0.15140012,   0.29405436,    0.120285,
+        -0.188994,    -0.027265169, 0.043389652,   -0.022061434,
+        0.014777949,  -0.20203483,  0.094781205,   0.19100232,
+        0.13987629,   -0.036132768, -0.06426278,   -0.05108664,
+        0.13221376,   0.009441198,  -0.16715929,   0.15859416,
+        -0.040437475, 0.050779544,  -0.022187516,  0.012166504,
+        0.027685808,  -0.07675938,  -0.0055694645, -0.09444123,
+        0.0046453946, 0.050794356,  0.10770313,    -0.20790008,
+        -0.07149004,  -0.11425117,  0.008225835,   -0.035802525,
+        0.14374903,   0.15262283,   0.048710253,   0.1847461,
+        -0.007487823, 0.11000021,   -0.09542012,   0.22619456,
+        -0.029149994, 0.08527916,   0.009043713,   0.0042746216,
+        0.016261552,  0.022461696,  0.12689082,    -0.043589946,
+        -0.12035478,  -0.08361797,  -0.050666027,  -0.1248618,
+        -0.1275799,   -0.071875185, 0.07377272,    0.09944291,
+        -0.18897448,  -0.1593054,   -0.06526116,   -0.040107165,
+        -0.004618631, -0.067624845, -0.007576253,  0.10727444,
+        0.041546922,  -0.20424393,  0.06907816,    0.050412357,
+        0.00724631,   0.039827548,  0.12449835,    0.10747581,
+        0.13708383,   0.09134148,   -0.12617786,   -0.06428341,
+        0.09956831,   0.1208086,    -0.14676677,   -0.0727722,
+        0.1126304,    0.010139365,  0.015571211,   -0.038128063,
+        0.022913318,  -0.042050496, 0.16842307,    -0.060597885,
+        0.10531834,   -0.06411776,  -0.07451711,   -0.03410368,
+        -0.13393489,  0.06534304,   0.003620307,   0.04490757,
+        0.05970546,   0.05197996,   0.02839995,    0.10434969,
+        -0.013699693, -0.028353551, -0.07260381,   0.047201227,
+        -0.024575593, -0.036445823, 0.07155557,    0.009672501,
+        -0.02328883,  0.009533515,  -0.03606021,   -0.07421458,
+        -0.028082801, -0.2678904,   -0.13221288,   0.18419984,
+        -0.13012612,  -0.014588381, -0.035059117,  -0.04824723,
+        0.07830115,   -0.056184657, 0.03277091,    0.025466874,
+        0.14494097,   -0.12522776,  -0.098633975,  -0.10766018,
+        -0.08317623,  0.08594209,   0.07749552,    0.039474737,
+        0.1776665,    -0.07409566,  -0.0477268,    0.29323658,
+        0.10801441,   0.1154011,    0.013952499,   0.10739139,
+        0.10708251,   -0.051456142, 0.0074137426,  -0.10430189,
+        0.10034707,   0.045594677,  0.0635285,     -0.0715442,
+        -0.089667566, -0.10811871,  0.00026344223, 0.08298446,
+        -0.009525053, 0.006585689,  -0.24567553,   -0.09450807,
+        0.09648481,   0.026996298,  -0.06419476,   -0.04752702,
+        -0.11063944,  -0.23441927,  -0.17608605,   -0.052156363,
+        0.067035615,  0.19271925,   -0.0032889997, -0.043264326,
+        0.09663576,   -0.057112187, -0.10100678,   0.0628376,
+        0.04447668,   0.017961001,  -0.10094388,   -0.10190601,
+        0.18335468,   0.10494553,   -0.052095775,  -0.0026118709,
+        0.10539724,   -0.04383912,  -0.042349473,  0.08438151,
+        -0.1947263,   0.02251204,   0.11216432,    -0.10307853,
+        0.17351969,   -0.039091777, 0.08066188,    -0.00561982,
+        0.12633002,   0.11335965,   -0.0088127935, -0.019777594,
+        0.06864014,   -0.059751723, 0.016233567,   -0.06894641,
+        -0.28651384,  -0.004228674, 0.019708522,   -0.16305895,
+        -0.07468996,  -0.0855457,   0.099339016,   -0.07580735,
+        -0.13775392,  0.08434318,   0.08330512,    -0.12131499,
+        0.031935584,  0.09180414,   -0.08876437,   -0.08049874,
+        0.008753825,  0.03498998,   0.030215185,   0.03907079,
+        0.089751154,  0.029194152,  -0.03337423,   -0.019092513,
+        0.04331237,   0.04299654,   -0.036394123,  -0.12915532,
+        0.09793732,   0.07512415,   -0.11319543,   -0.032502122,
+        0.15661901,   0.07671967,   -0.005491124,  -0.19379048,
+        -0.218606,    0.21448623,   0.017840758,   0.1416943,
+        -0.07051762,  0.19488361,   0.02664691,    -0.18104725,
+        -0.09334311,  0.15026465,   -0.15493552,   -0.057762887,
+        -0.11604192,  -0.262013,    -0.01391798,   0.012185008,
+        0.11156489,   -0.07483202,  0.06693364,    -0.26151478,
+        0.046425626,  0.036540434,  -0.16435726,   0.17338543,
+        -0.21401681,  -0.11385144,  -0.08283257,   -0.069031075,
+        0.030635102,  0.010969227,  0.11109743,    0.010919218,
+        0.027526086,  0.13519906,   0.01891392,    -0.046839405,
+        -0.040167913, 0.017953383,  -0.09700955,   0.0061885654,
+        -0.07000971,  0.026893595,  -0.038844477,  0.14543656};
+
+    lstm_input_ = {
+        {// Batch0: 4 (input_sequence_size) * 5 (n_input)
+         0.787926, 0.151646, 0.071352, 0.118426, 0.458058,   // step 0
+         0.596268, 0.998386, 0.568695, 0.864524, 0.571277,   // step 1
+         0.073204, 0.296072, 0.743333, 0.069199, 0.045348,   // step 2
+         0.867394, 0.291279, 0.013714, 0.482521, 0.626339},  // step 3
+
+        {// Batch1: 4 (input_sequence_size) * 5 (n_input)
+         0.295743, 0.544053, 0.690064, 0.858138, 0.497181,  // step 0
+         0.642421, 0.524260, 0.134799, 0.003639, 0.162482,  // step 1
+         0.640394, 0.930399, 0.050782, 0.432485, 0.988078,  // step 2
+         0.082922, 0.563329, 0.865614, 0.333232, 0.259916}  // step 3
+    };
+
+    lstm_golden_output_ = {
+        {// Batch0: 4 (input_sequence_size) * 16 (n_output)
+         -0.00396806, 0.029352,     -0.00279226, 0.0159977,   -0.00835576,
+         -0.0211779,  0.0283512,    -0.0114597,  0.00907307,  -0.0244004,
+         -0.0152191,  -0.0259063,   0.00914318,  0.00415118,  0.017147,
+         0.0134203,   -0.0166936,   0.0381209,   0.000889694, 0.0143363,
+         -0.0328911,  -0.0234288,   0.0333051,   -0.012229,   0.0110322,
+         -0.0457725,  -0.000832209, -0.0202817,  0.0327257,   0.0121308,
+         0.0155969,   0.0312091,    -0.0213783,  0.0350169,   0.000324794,
+         0.0276012,   -0.0263374,   -0.0371449,  0.0446149,   -0.0205474,
+         0.0103729,   -0.0576349,   -0.0150052,  -0.0292043,  0.0376827,
+         0.0136115,   0.0243435,    0.0354492,   -0.0189322,  0.0464512,
+         -0.00251373, 0.0225745,    -0.0308346,  -0.0317124,  0.0460407,
+         -0.0189395,  0.0149363,    -0.0530162,  -0.0150767,  -0.0340193,
+         0.0286833,   0.00824207,   0.0264887,   0.0305169},
+        {// Batch1: 4 (input_sequence_size) * 16 (n_output)
+         -0.013869,    0.0287268,   -0.00334693, 0.00733398,  -0.0287926,
+         -0.0186926,   0.0193662,   -0.0115437,  0.00422612,  -0.0345232,
+         0.00223253,   -0.00957321, 0.0210624,   0.013331,    0.0150954,
+         0.02168,      -0.0141913,  0.0322082,   0.00227024,  0.0260507,
+         -0.0188721,   -0.0296489,  0.0399134,   -0.0160509,  0.0116039,
+         -0.0447318,   -0.0150515,  -0.0277406,  0.0316596,   0.0118233,
+         0.0214762,    0.0293641,   -0.0204549,  0.0450315,   -0.00117378,
+         0.0167673,    -0.0375007,  -0.0238314,  0.038784,    -0.0174034,
+         0.0131743,    -0.0506589,  -0.0048447,  -0.0240239,  0.0325789,
+         0.00790065,   0.0220157,   0.0333314,   -0.0264787,  0.0387855,
+         -0.000764675, 0.0217599,   -0.037537,   -0.0335206,  0.0431679,
+         -0.0211424,   0.010203,    -0.062785,   -0.00832363, -0.025181,
+         0.0412031,    0.0118723,   0.0239643,   0.0394009}};
   }
-}
+};
 
-TEST(LSTMOpTest, BlackBoxTestWithPeepholeWithProjectionNoClipping) {
+TEST_F(NoCifgPeepholeProjectionClippingLstmTest, LstmBlackBoxTest) {
   const int n_batch = 2;
   const int n_input = 5;
   const int n_cell = 20;
@@ -489,588 +1338,98 @@ TEST(LSTMOpTest, BlackBoxTestWithPeepholeWithProjectionNoClipping) {
                        {0},                 // projection_bias tensor
                    });
 
-  lstm.SetInputToInputWeights(
-      {0.021393683,  0.06124551,    0.046905167,  -0.014657677,  -0.03149463,
-       0.09171803,   0.14647801,    0.10797193,   -0.0057968358, 0.0019193048,
-       -0.2726754,   0.10154029,    -0.018539885, 0.080349885,   -0.10262385,
-       -0.022599787, -0.09121155,   -0.008675967, -0.045206103,  -0.0821282,
-       -0.008045952, 0.015478081,   0.055217247,  0.038719587,   0.044153627,
-       -0.06453243,  0.05031825,    -0.046935108, -0.008164439,  0.014574226,
-       -0.1671009,   -0.15519552,   -0.16819797,  -0.13971269,   -0.11953059,
-       0.25005487,   -0.22790983,   0.009855087,  -0.028140958,  -0.11200698,
-       0.11295408,   -0.0035217577, 0.054485075,  0.05184695,    0.064711206,
-       0.10989193,   0.11674786,    0.03490607,   0.07727357,    0.11390585,
-       -0.1863375,   -0.1034451,    -0.13945189,  -0.049401227,  -0.18767063,
-       0.042483903,  0.14233552,    0.13832581,   0.18350165,    0.14545603,
-       -0.028545704, 0.024939531,   0.050929718,  0.0076203286,  -0.0029723682,
-       -0.042484224, -0.11827596,   -0.09171104,  -0.10808628,   -0.16327988,
-       -0.2273378,   -0.0993647,    -0.017155107, 0.0023917493,  0.049272764,
-       0.0038534778, 0.054764505,   0.089753784,  0.06947234,    0.08014476,
-       -0.04544234,  -0.0497073,    -0.07135631,  -0.048929106,  -0.004042012,
-       -0.009284026, 0.018042054,   0.0036860977, -0.07427302,   -0.11434604,
-       -0.018995456, 0.031487543,   0.012834908,  0.019977754,   0.044256654,
-       -0.39292613,  -0.18519334,   -0.11651281,  -0.06809892,   0.011373677});
-
-  lstm.SetInputToForgetWeights(
-      {-0.0018401089, -0.004852237,  0.03698424,   0.014181704,   0.028273236,
-       -0.016726194,  -0.05249759,   -0.10204261,  0.00861066,    -0.040979505,
-       -0.009899187,  0.01923892,    -0.028177269, -0.08535103,   -0.14585495,
-       0.10662567,    -0.01909731,   -0.017883534, -0.0047269356, -0.045103323,
-       0.0030784295,  0.076784775,   0.07463696,   0.094531395,   0.0814421,
-       -0.12257899,   -0.033945758,  -0.031303465, 0.045630626,   0.06843887,
-       -0.13492945,   -0.012480007,  -0.0811829,   -0.07224499,   -0.09628791,
-       0.045100946,   0.0012300825,  0.013964662,  0.099372394,   0.02543059,
-       0.06958324,    0.034257296,   0.0482646,    0.06267997,    0.052625068,
-       0.12784666,    0.07077897,    0.025725935,  0.04165009,    0.07241905,
-       0.018668644,   -0.037377294,  -0.06277783,  -0.08833636,   -0.040120605,
-       -0.011405586,  -0.007808335,  -0.010301386, -0.005102167,  0.027717464,
-       0.05483423,    0.11449111,    0.11289652,   0.10939839,    0.13396506,
-       -0.08402166,   -0.01901462,   -0.044678304, -0.07720565,   0.014350063,
-       -0.11757958,   -0.0652038,    -0.08185733,  -0.076754324,  -0.092614375,
-       0.10405491,    0.052960336,   0.035755895,  0.035839386,   -0.012540553,
-       0.036881298,   0.02913376,    0.03420159,   0.05448447,    -0.054523353,
-       0.02582715,    0.02327355,    -0.011857179, -0.0011980024, -0.034641717,
-       -0.026125094,  -0.17582615,   -0.15923657,  -0.27486774,   -0.0006143371,
-       0.0001771948,  -8.470171e-05, 0.02651807,   0.045790765,   0.06956496});
-
-  lstm.SetInputToCellWeights(
-      {-0.04580283,   -0.09549462,   -0.032418985,  -0.06454633,
-       -0.043528453,  0.043018587,   -0.049152344,  -0.12418144,
-       -0.078985475,  -0.07596889,   0.019484362,   -0.11434962,
-       -0.0074034138, -0.06314844,   -0.092981495,  0.0062155537,
-       -0.025034338,  -0.0028890965, 0.048929527,   0.06235075,
-       0.10665918,    -0.032036792,  -0.08505916,   -0.10843358,
-       -0.13002433,   -0.036816437,  -0.02130134,   -0.016518239,
-       0.0047691227,  -0.0025825808, 0.066017866,   0.029991534,
-       -0.10652836,   -0.1037554,    -0.13056071,   -0.03266643,
-       -0.033702414,  -0.006473424,  -0.04611692,   0.014419339,
-       -0.025174323,  0.0396852,     0.081777506,   0.06157468,
-       0.10210095,    -0.009658194,  0.046511717,   0.03603906,
-       0.0069369148,  0.015960095,   -0.06507666,   0.09551598,
-       0.053568836,   0.06408714,    0.12835667,    -0.008714329,
-       -0.20211966,   -0.12093674,   0.029450472,   0.2849013,
-       -0.029227901,  0.1164364,     -0.08560263,   0.09941786,
-       -0.036999565,  -0.028842626,  -0.0033637602, -0.017012902,
-       -0.09720865,   -0.11193351,   -0.029155117,  -0.017936034,
-       -0.009768936,  -0.04223324,   -0.036159635,  0.06505112,
-       -0.021742892,  -0.023377212,  -0.07221364,   -0.06430552,
-       0.05453865,    0.091149814,   0.06387331,    0.007518393,
-       0.055960953,   0.069779344,   0.046411168,   0.10509911,
-       0.07463894,    0.0075130584,  0.012850982,   0.04555431,
-       0.056955688,   0.06555285,    0.050801456,   -0.009862683,
-       0.00826772,    -0.026555609,  -0.0073611983, -0.0014897042});
-
-  lstm.SetInputToOutputWeights(
-      {-0.0998932,   -0.07201956,  -0.052803773,  -0.15629593,  -0.15001918,
-       -0.07650751,  0.02359855,   -0.075155355,  -0.08037709,  -0.15093534,
-       0.029517552,  -0.04751393,  0.010350531,   -0.02664851,  -0.016839722,
-       -0.023121163, 0.0077019283, 0.012851257,   -0.05040649,  -0.0129761,
-       -0.021737747, -0.038305793, -0.06870586,   -0.01481247,  -0.001285394,
-       0.10124236,   0.083122835,  0.053313006,   -0.062235646, -0.075637154,
-       -0.027833903, 0.029774971,  0.1130802,     0.09218906,   0.09506135,
-       -0.086665764, -0.037162706, -0.038880914,  -0.035832845, -0.014481564,
-       -0.09825003,  -0.12048569,  -0.097665586,  -0.05287633,  -0.0964047,
-       -0.11366429,  0.035777505,  0.13568819,    0.052451383,  0.050649304,
-       0.05798951,   -0.021852335, -0.099848844,  0.014740475,  -0.078897946,
-       0.04974699,   0.014160473,  0.06973932,    0.04964942,   0.033364646,
-       0.08190124,   0.025535367,  0.050893165,   0.048514254,  0.06945813,
-       -0.078907564, -0.06707616,  -0.11844508,   -0.09986688,  -0.07509403,
-       0.06263226,   0.14925587,   0.20188436,    0.12098451,   0.14639415,
-       0.0015017595, -0.014267382, -0.03417257,   0.012711468,  0.0028300495,
-       -0.024758482, -0.05098548,  -0.0821182,    0.014225672,  0.021544158,
-       0.08949725,   0.07505268,   -0.0020780868, 0.04908258,   0.06476295,
-       -0.022907063, 0.027562456,  0.040185735,   0.019567577,  -0.015598739,
-       -0.049097303, -0.017121866, -0.083368234,  -0.02332002,  -0.0840956});
-
-  lstm.SetInputGateBias(
-      {0.02234832,  0.14757581,   0.18176508,  0.10380666,  0.053110216,
-       -0.06928846, -0.13942584,  -0.11816189, 0.19483899,  0.03652339,
-       -0.10250295, 0.036714908,  -0.18426876, 0.036065217, 0.21810818,
-       0.02383196,  -0.043370757, 0.08690144,  -0.04444982, 0.00030581196});
-
-  lstm.SetForgetGateBias({0.035185695, -0.042891346, -0.03032477, 0.23027696,
-                          0.11098921,  0.15378423,   0.09263801,  0.09790885,
-                          0.09508917,  0.061199076,  0.07665568,  -0.015443159,
-                          -0.03499149, 0.046190713,  0.08895977,  0.10899629,
-                          0.40694186,  0.06030037,   0.012413437, -0.06108739});
-
-  lstm.SetCellBias({-0.024379363, 0.0055531194, 0.23377132,   0.033463873,
-                    -0.1483596,   -0.10639995,  -0.091433935, 0.058573797,
-                    -0.06809782,  -0.07889636,  -0.043246906, -0.09829136,
-                    -0.4279842,   0.034901652,  0.18797937,   0.0075234566,
-                    0.016178843,  0.1749513,    0.13975595,   0.92058027});
-
-  lstm.SetOutputGateBias(
-      {0.046159424,  -0.0012809046, 0.03563469,   0.12648113, 0.027195795,
-       0.35373217,   -0.018957434,  0.008907322,  -0.0762701, 0.12018895,
-       0.04216877,   0.0022856654,  0.040952638,  0.3147856,  0.08225149,
-       -0.057416286, -0.14995944,   -0.008040261, 0.13208859, 0.029760877});
-
-  lstm.SetRecurrentToInputWeights(
-      {-0.001374326,   -0.078856036,   0.10672688,    0.029162422,
-       -0.11585556,    0.02557986,     -0.13446963,   -0.035785314,
-       -0.01244275,    0.025961924,    -0.02337298,   -0.044228926,
-       -0.055839065,   -0.046598054,   -0.010546039,  -0.06900766,
-       0.027239809,    0.022582639,    -0.013296484,  -0.05459212,
-       0.08981,        -0.045407712,   0.08682226,    -0.06867011,
-       -0.14390695,    -0.02916037,    0.000996957,   0.091420636,
-       0.14283475,     -0.07390571,    -0.06402044,   0.062524505,
-       -0.093129106,   0.04860203,     -0.08364217,   -0.08119002,
-       0.009352075,    0.22920375,     0.0016303885,  0.11583097,
-       -0.13732095,    0.012405723,    -0.07551853,   0.06343048,
-       0.12162708,     -0.031923793,   -0.014335606,  0.01790974,
-       -0.10650317,    -0.0724401,     0.08554849,    -0.05727212,
-       0.06556731,     -0.042729504,   -0.043227166,  0.011683251,
-       -0.013082158,   -0.029302018,   -0.010899579,  -0.062036745,
-       -0.022509435,   -0.00964907,    -0.01567329,   0.04260106,
-       -0.07787477,    -0.11576462,    0.017356863,   0.048673786,
-       -0.017577527,   -0.05527947,    -0.082487635,  -0.040137455,
-       -0.10820036,    -0.04666372,    0.022746278,   -0.07851417,
-       0.01068115,     0.032956902,    0.022433773,   0.0026891115,
-       0.08944216,     -0.0685835,     0.010513544,   0.07228705,
-       0.02032331,     -0.059686817,   -0.0005566496, -0.086984694,
-       0.040414046,    -0.1380399,     0.094208956,   -0.05722982,
-       0.012092817,    -0.04989123,    -0.086576,     -0.003399834,
-       -0.04696032,    -0.045747425,   0.10091314,    0.048676282,
-       -0.029037097,   0.031399418,    -0.0040285117, 0.047237843,
-       0.09504992,     0.041799378,    -0.049185462,  -0.031518843,
-       -0.10516937,    0.026374253,    0.10058866,    -0.0033195973,
-       -0.041975245,   0.0073591834,   0.0033782164,  -0.004325073,
-       -0.10167381,    0.042500053,    -0.01447153,   0.06464186,
-       -0.017142897,   0.03312627,     0.009205989,   0.024138335,
-       -0.011337001,   0.035530265,    -0.010912711,  0.0706555,
-       -0.005894094,   0.051841937,    -0.1401738,    -0.02351249,
-       0.0365468,      0.07590991,     0.08838724,    0.021681072,
-       -0.10086113,    0.019608743,    -0.06195883,   0.077335775,
-       0.023646897,    -0.095322326,   0.02233014,    0.09756986,
-       -0.048691444,   -0.009579111,   0.07595467,    0.11480546,
-       -0.09801813,    0.019894179,    0.08502348,    0.004032281,
-       0.037211012,    0.068537936,    -0.048005626,  -0.091520436,
-       -0.028379958,   -0.01556313,    0.06554592,    -0.045599163,
-       -0.01672207,    -0.020169014,   -0.011877351,  -0.20212261,
-       0.010889619,    0.0047078193,   0.038385306,   0.08540671,
-       -0.017140968,   -0.0035865551,  0.016678626,   0.005633034,
-       0.015963363,    0.00871737,     0.060130805,   0.028611384,
-       0.10109069,     -0.015060172,   -0.07894427,   0.06401885,
-       0.011584063,    -0.024466386,   0.0047652307,  -0.09041358,
-       0.030737216,    -0.0046374933,  0.14215417,    -0.11823516,
-       0.019899689,    0.006106124,    -0.027092824,  0.0786356,
-       0.05052217,     -0.058925,      -0.011402121,  -0.024987547,
-       -0.0013661642,  -0.06832946,    -0.015667673,  -0.1083353,
-       -0.00096863037, -0.06988685,    -0.053350925,  -0.027275559,
-       -0.033664223,   -0.07978348,    -0.025200296,  -0.017207067,
-       -0.058403496,   -0.055697463,   0.005798788,   0.12965427,
-       -0.062582195,   0.0013350133,   -0.10482091,   0.0379771,
-       0.072521195,    -0.0029455067,  -0.13797039,   -0.03628521,
-       0.013806405,    -0.017858358,   -0.01008298,   -0.07700066,
-       -0.017081132,   0.019358726,    0.0027079724,  0.004635139,
-       0.062634714,    -0.02338735,    -0.039547626,  -0.02050681,
-       0.03385117,     -0.083611414,   0.002862572,   -0.09421313,
-       0.058618143,    -0.08598433,    0.00972939,    0.023867095,
-       -0.053934585,   -0.023203006,   0.07452513,    -0.048767887,
-       -0.07314807,    -0.056307215,   -0.10433547,   -0.06440842,
-       0.04328182,     0.04389765,     -0.020006588,  -0.09076438,
-       -0.11652589,    -0.021705797,   0.03345259,    -0.010329105,
-       -0.025767034,   0.013057034,    -0.07316461,   -0.10145612,
-       0.06358255,     0.18531723,     0.07759293,    0.12006465,
-       0.1305557,      0.058638252,    -0.03393652,   0.09622831,
-       -0.16253184,    -2.4580743e-06, 0.079869635,   -0.070196845,
-       -0.005644518,   0.06857898,     -0.12598175,   -0.035084512,
-       0.03156317,     -0.12794146,    -0.031963028,  0.04692781,
-       0.030070418,    0.0071660685,   -0.095516115,  -0.004643372,
-       0.040170413,    -0.062104587,   -0.0037324072, 0.0554317,
-       0.08184801,     -0.019164372,   0.06791302,    0.034257166,
-       -0.10307039,    0.021943003,    0.046745934,   0.0790918,
-       -0.0265588,     -0.007824208,   0.042546265,   -0.00977924,
-       -0.0002440307,  -0.017384544,   -0.017990116,  0.12252321,
-       -0.014512694,   -0.08251313,    0.08861942,    0.13589665,
-       0.026351685,    0.012641483,    0.07466548,    0.044301085,
-       -0.045414884,   -0.051112458,   0.03444247,    -0.08502782,
-       -0.04106223,    -0.028126027,   0.028473156,   0.10467447});
-
-  lstm.SetRecurrentToForgetWeights(
-      {-0.057784554,  -0.026057621,  -0.068447545,   -0.022581743,
-       0.14811787,    0.10826372,    0.09471067,     0.03987225,
-       -0.0039523416, 0.00030638507, 0.053185795,    0.10572994,
-       0.08414449,    -0.022036452,  -0.00066928595, -0.09203576,
-       0.032950465,   -0.10985798,   -0.023809856,   0.0021431844,
-       -0.02196096,   -0.00326074,   0.00058621005,  -0.074678116,
-       -0.06193199,   0.055729095,   0.03736828,     0.020123724,
-       0.061878487,   -0.04729229,   0.034919553,    -0.07585433,
-       -0.04421272,   -0.044019096,  0.085488975,    0.04058006,
-       -0.06890133,   -0.030951202,  -0.024628663,   -0.07672815,
-       0.034293607,   0.08556707,    -0.05293577,    -0.033561368,
-       -0.04899627,   0.0241671,     0.015736353,    -0.095442444,
-       -0.029564252,  0.016493602,   -0.035026584,   0.022337519,
-       -0.026871363,  0.004780428,   0.0077918363,   -0.03601621,
-       0.016435321,   -0.03263031,   -0.09543275,    -0.047392778,
-       0.013454138,   0.028934088,   0.01685226,     -0.086110644,
-       -0.046250615,  -0.01847454,   0.047608484,    0.07339695,
-       0.034546845,   -0.04881143,   0.009128804,    -0.08802852,
-       0.03761666,    0.008096139,   -0.014454086,   0.014361001,
-       -0.023502491,  -0.0011840804, -0.07607001,    0.001856849,
-       -0.06509276,   -0.006021153,  -0.08570962,    -0.1451793,
-       0.060212336,   0.055259194,   0.06974018,     0.049454916,
-       -0.027794661,  -0.08077226,   -0.016179763,   0.1169753,
-       0.17213494,    -0.0056326236, -0.053934924,   -0.0124349,
-       -0.11520337,   0.05409887,    0.088759385,    0.0019655675,
-       0.0042065294,  0.03881498,    0.019844765,    0.041858196,
-       -0.05695512,   0.047233116,   0.038937137,    -0.06542224,
-       0.014429736,   -0.09719407,   0.13908425,     -0.05379757,
-       0.012321099,   0.082840554,   -0.029899208,   0.044217527,
-       0.059855383,   0.07711018,    -0.045319796,   0.0948846,
-       -0.011724666,  -0.0033288454, -0.033542685,   -0.04764985,
-       -0.13873616,   0.040668588,   0.034832682,    -0.015319203,
-       -0.018715994,  0.046002675,   0.0599172,      -0.043107376,
-       0.0294216,     -0.002314414,  -0.022424703,   0.0030315618,
-       0.0014641669,  0.0029166266,  -0.11878115,    0.013738511,
-       0.12375372,    -0.0006038222, 0.029104086,    0.087442465,
-       0.052958444,   0.07558703,    0.04817258,     0.044462286,
-       -0.015213451,  -0.08783778,   -0.0561384,     -0.003008196,
-       0.047060397,   -0.002058388,  0.03429439,     -0.018839769,
-       0.024734668,   0.024614193,   -0.042046934,   0.09597743,
-       -0.0043254104, 0.04320769,    0.0064070094,   -0.0019131786,
-       -0.02558259,   -0.022822596,  -0.023273505,   -0.02464396,
-       -0.10991725,   -0.006240552,  0.0074488563,   0.024044557,
-       0.04383914,    -0.046476185,  0.028658995,    0.060410924,
-       0.050786525,   0.009452605,   -0.0073054377,  -0.024810238,
-       0.0052906186,  0.0066939713,  -0.0020913032,  0.014515517,
-       0.015898481,   0.021362653,   -0.030262267,   0.016587038,
-       -0.011442813,  0.041154444,   -0.007631438,   -0.03423484,
-       -0.010977775,  0.036152758,   0.0066366293,   0.11915515,
-       0.02318443,    -0.041350313,  0.021485701,    -0.10906167,
-       -0.028218046,  -0.00954771,   0.020531068,    -0.11995105,
-       -0.03672871,   0.024019798,   0.014255957,    -0.05221243,
-       -0.00661567,   -0.04630967,   0.033188973,    0.10107534,
-       -0.014027541,  0.030796422,   -0.10270911,    -0.035999842,
-       0.15443139,    0.07684145,    0.036571592,    -0.035900835,
-       -0.0034699554, 0.06209149,    0.015920248,    -0.031122351,
-       -0.03858649,   0.01849943,    0.13872518,     0.01503974,
-       0.069941424,   -0.06948533,   -0.0088794185,  0.061282158,
-       -0.047401894,  0.03100163,    -0.041533746,   -0.10430945,
-       0.044574402,   -0.01425562,   -0.024290353,   0.034563623,
-       0.05866852,    0.023947537,   -0.09445152,    0.035450947,
-       0.02247216,    -0.0042998926, 0.061146557,    -0.10250651,
-       0.020881841,   -0.06747029,   0.10062043,     -0.0023941975,
-       0.03532124,    -0.016341697,  0.09685456,     -0.016764693,
-       0.051808182,   0.05875331,    -0.04536488,    0.001626336,
-       -0.028892258,  -0.01048663,   -0.009793449,   -0.017093895,
-       0.010987891,   0.02357273,    -0.00010856845, 0.0099760275,
-       -0.001845119,  -0.03551521,   0.0018358806,   0.05763657,
-       -0.01769146,   0.040995963,   0.02235177,     -0.060430344,
-       0.11475477,    -0.023854522,  0.10071741,     0.0686208,
-       -0.014250481,  0.034261297,   0.047418304,    0.08562733,
-       -0.030519066,  0.0060542435,  0.014653856,    -0.038836084,
-       0.04096551,    0.032249358,   -0.08355519,    -0.026823482,
-       0.056386515,   -0.010401743,  -0.028396193,   0.08507674,
-       0.014410365,   0.020995233,   0.17040324,     0.11511526,
-       0.02459721,    0.0066619175,  0.025853224,    -0.023133837,
-       -0.081302024,  0.017264642,   -0.009585969,   0.09491168,
-       -0.051313367,  0.054532815,   -0.014298593,   0.10657464,
-       0.007076659,   0.10964551,    0.0409152,      0.008275321,
-       -0.07283536,   0.07937492,    0.04192024,     -0.1075027});
-
-  lstm.SetRecurrentToCellWeights(
-      {-0.037322544,   0.018592842,   0.0056175636,  -0.06253426,
-       0.055647098,    -0.05713207,   -0.05626563,   0.005559383,
-       0.03375411,     -0.025757805,  -0.088049285,  0.06017052,
-       -0.06570978,    0.007384076,   0.035123326,   -0.07920549,
-       0.053676967,    0.044480428,   -0.07663568,   0.0071805613,
-       0.08089997,     0.05143358,    0.038261272,   0.03339287,
-       -0.027673481,   0.044746667,   0.028349208,   0.020090483,
-       -0.019443132,   -0.030755889,  -0.0040000007, 0.04465846,
-       -0.021585021,   0.0031670958,  0.0053199246,  -0.056117613,
-       -0.10893326,    0.076739706,   -0.08509834,   -0.027997585,
-       0.037871376,    0.01449768,    -0.09002357,   -0.06111149,
-       -0.046195522,   0.0422062,     -0.005683705,  -0.1253618,
-       -0.012925729,   -0.04890792,   0.06985068,    0.037654128,
-       0.03398274,     -0.004781977,  0.007032333,   -0.031787455,
-       0.010868644,    -0.031489216,  0.09525667,    0.013939797,
-       0.0058680447,   0.0167067,     0.02668468,    -0.04797466,
-       -0.048885044,   -0.12722108,   0.035304096,   0.06554885,
-       0.00972396,     -0.039238118,  -0.05159735,   -0.11329045,
-       0.1613692,      -0.03750952,   0.06529313,    -0.071974665,
-       -0.11769596,    0.015524369,   -0.0013754242, -0.12446318,
-       0.02786344,     -0.014179351,  0.005264273,   0.14376344,
-       0.015983658,    0.03406988,    -0.06939408,   0.040699873,
-       0.02111075,     0.09669095,    0.041345075,   -0.08316494,
-       -0.07684199,    -0.045768797,  0.032298047,   -0.041805092,
-       0.0119405,      0.0061010392,  0.12652606,    0.0064572375,
-       -0.024950314,   0.11574242,    0.04508852,    -0.04335324,
-       0.06760663,     -0.027437469,  0.07216407,    0.06977076,
-       -0.05438599,    0.034033038,   -0.028602652,  0.05346137,
-       0.043184172,    -0.037189785,  0.10420091,    0.00882477,
-       -0.054019816,   -0.074273005,  -0.030617684,  -0.0028467078,
-       0.024302477,    -0.0038869337, 0.005332455,   0.0013399826,
-       0.04361412,     -0.007001822,  0.09631092,    -0.06702025,
-       -0.042049985,   -0.035070654,  -0.04103342,   -0.10273396,
-       0.0544271,      0.037184782,   -0.13150354,   -0.0058036847,
-       -0.008264958,   0.042035464,   0.05891794,    0.029673764,
-       0.0063542654,   0.044788733,   0.054816857,   0.062257513,
-       -0.00093483756, 0.048938446,   -0.004952862,  -0.007730018,
-       -0.04043371,    -0.017094059,  0.07229206,    -0.023670016,
-       -0.052195564,   -0.025616996,  -0.01520939,   0.045104615,
-       -0.007376126,   0.003533447,   0.006570588,   0.056037236,
-       0.12436656,     0.051817212,   0.028532185,   -0.08686856,
-       0.11868599,     0.07663395,    -0.07323171,   0.03463402,
-       -0.050708205,   -0.04458982,   -0.11590894,   0.021273347,
-       0.1251325,      -0.15313013,   -0.12224372,   0.17228661,
-       0.023029093,    0.086124025,   0.006445803,   -0.03496501,
-       0.028332196,    0.04449512,    -0.042436164,  -0.026587414,
-       -0.006041347,   -0.09292539,   -0.05678812,   0.03897832,
-       0.09465633,     0.008115513,   -0.02171956,   0.08304309,
-       0.071401566,    0.019622514,   0.032163795,   -0.004167056,
-       0.02295182,     0.030739572,   0.056506045,   0.004612461,
-       0.06524936,     0.059999723,   0.046395954,   -0.0045512207,
-       -0.1335546,     -0.030136576,  0.11584653,    -0.014678886,
-       0.0020118146,   -0.09688814,   -0.0790206,    0.039770417,
-       -0.0329582,     0.07922767,    0.029322514,   0.026405897,
-       0.04207835,     -0.07073373,   0.063781224,   0.0859677,
-       -0.10925287,    -0.07011058,   0.048005477,   0.03438226,
-       -0.09606514,    -0.006669445,  -0.043381985,  0.04240257,
-       -0.06955775,    -0.06769346,   0.043903265,   -0.026784198,
-       -0.017840602,   0.024307009,   -0.040079936,  -0.019946516,
-       0.045318738,    -0.12233574,   0.026170589,   0.0074471775,
-       0.15978073,     0.10185836,    0.10298046,    -0.015476589,
-       -0.039390966,   -0.072174534,  0.0739445,     -0.1211869,
-       -0.0347889,     -0.07943156,   0.014809798,   -0.12412325,
-       -0.0030663363,  0.039695457,   0.0647603,     -0.08291318,
-       -0.018529687,   -0.004423833,  0.0037507233,  0.084633216,
-       -0.01514876,    -0.056505352,  -0.012800942,  -0.06994386,
-       0.012962922,    -0.031234352,  0.07029052,    0.016418684,
-       0.03618972,     0.055686004,   -0.08663945,   -0.017404709,
-       -0.054761406,   0.029065743,   0.052404847,   0.020238016,
-       0.0048197987,   -0.0214882,    0.07078733,    0.013016777,
-       0.06262858,     0.009184685,   0.020785125,   -0.043904778,
-       -0.0270329,     -0.03299152,   -0.060088247,  -0.015162964,
-       -0.001828936,   0.12642565,    -0.056757294,  0.013586685,
-       0.09232601,     -0.035886683,  0.06000002,    0.05229691,
-       -0.052580316,   -0.082029596,  -0.010794592,  0.012947712,
-       -0.036429964,   -0.085508935,  -0.13127148,   -0.017744139,
-       0.031502828,    0.036232427,   -0.031581745,  0.023051167,
-       -0.05325106,    -0.03421577,   0.028793324,   -0.034633752,
-       -0.009881397,   -0.043551125,  -0.018609839,  0.0019097115,
-       -0.008799762,   0.056595087,   0.0022273948,  0.055752404});
-
-  lstm.SetRecurrentToOutputWeights({
-      0.025825322,   -0.05813119,  0.09495884,   -0.045984812,   -0.01255415,
-      -0.0026479573, -0.08196161,  -0.054914974, -0.0046604523,  -0.029587349,
-      -0.044576716,  -0.07480124,  -0.082868785, 0.023254942,    0.027502948,
-      -0.0039728214, -0.08683098,  -0.08116779,  -0.014675607,   -0.037924774,
-      -0.023314456,  -0.007401714, -0.09255757,  0.029460307,    -0.08829125,
-      -0.005139627,  -0.08989442,  -0.0555066,   0.13596267,     -0.025062224,
-      -0.048351806,  -0.03850004,  0.07266485,   -0.022414139,   0.05940088,
-      0.075114764,   0.09597592,   -0.010211725, -0.0049794707,  -0.011523867,
-      -0.025980417,  0.072999895,  0.11091378,   -0.081685916,   0.014416728,
-      0.043229222,   0.034178585,  -0.07530371,  0.035837382,    -0.085607,
-      -0.007721233,  -0.03287832,  -0.043848954, -0.06404588,    -0.06632928,
-      -0.073643476,  0.008214239,  -0.045984086, 0.039764922,    0.03474462,
-      0.060612556,   -0.080590084, 0.049127717,  0.04151091,     -0.030063879,
-      0.008801774,   -0.023021035, -0.019558564, 0.05158114,     -0.010947698,
-      -0.011825728,  0.0075720972, 0.0699727,    -0.0039981045,  0.069350146,
-      0.08799282,    0.016156472,  0.035502106,  0.11695009,     0.006217345,
-      0.13392477,    -0.037875112, 0.025745004,  0.08940699,     -0.00924166,
-      0.0046702605,  -0.036598757, -0.08811812,  0.10522024,     -0.032441203,
-      0.008176899,   -0.04454919,  0.07058152,   0.0067963637,   0.039206743,
-      0.03259838,    0.03725492,   -0.09515802,  0.013326398,    -0.052055415,
-      -0.025676316,  0.03198509,   -0.015951829, -0.058556724,   0.036879618,
-      0.043357447,   0.028362012,  -0.05908629,  0.0059240665,   -0.04995891,
-      -0.019187413,  0.0276265,    -0.01628143,  0.0025863599,   0.08800015,
-      0.035250366,   -0.022165963, -0.07328642,  -0.009415526,   -0.07455109,
-      0.11690406,    0.0363299,    0.07411125,   0.042103454,    -0.009660886,
-      0.019076364,   0.018299393,  -0.046004917, 0.08891175,     0.0431396,
-      -0.026327137,  -0.051502608, 0.08979574,   -0.051670972,   0.04940282,
-      -0.07491107,   -0.021240504, 0.022596184,  -0.034280192,   0.060163025,
-      -0.058211457,  -0.051837247, -0.01349775,  -0.04639988,    -0.035936575,
-      -0.011681591,  0.064818054,  0.0073146066, -0.021745546,   -0.043124277,
-      -0.06471268,   -0.07053354,  -0.029321948, -0.05330136,    0.016933719,
-      -0.053782392,  0.13747959,   -0.1361751,   -0.11569455,    0.0033329215,
-      0.05693899,    -0.053219706, 0.063698,     0.07977434,     -0.07924483,
-      0.06936997,    0.0034815092, -0.007305279, -0.037325785,   -0.07251102,
-      -0.033633437,  -0.08677009,  0.091591336,  -0.14165086,    0.021752775,
-      0.019683983,   0.0011612234, -0.058154266, 0.049996935,    0.0288841,
-      -0.0024567875, -0.14345716,  0.010955264,  -0.10234828,    0.1183656,
-      -0.0010731248, -0.023590032, -0.072285876, -0.0724771,     -0.026382286,
-      -0.0014920527, 0.042667855,  0.0018776858, 0.02986552,     0.009814309,
-      0.0733756,     0.12289186,   0.018043943,  -0.0458958,     0.049412545,
-      0.033632483,   0.05495232,   0.036686596,  -0.013781798,   -0.010036754,
-      0.02576849,    -0.08307328,  0.010112348,  0.042521734,    -0.05869831,
-      -0.071689695,  0.03876447,   -0.13275425,  -0.0352966,     -0.023077697,
-      0.10285965,    0.084736146,  0.15568255,   -0.00040734606, 0.027835453,
-      -0.10292561,   -0.032401145, 0.10053256,   -0.026142767,   -0.08271222,
-      -0.0030240538, -0.016368777, 0.1070414,    0.042672627,    0.013456989,
-      -0.0437609,    -0.022309763, 0.11576483,   0.04108048,     0.061026827,
-      -0.0190714,    -0.0869359,   0.037901703,  0.0610107,      0.07202949,
-      0.01675338,    0.086139716,  -0.08795751,  -0.014898893,   -0.023771819,
-      -0.01965048,   0.007955471,  -0.043740474, 0.03346837,     -0.10549954,
-      0.090567775,   0.042013682,  -0.03176985,  0.12569028,     -0.02421228,
-      -0.029526481,  0.023851605,  0.031539805,  0.05292009,     -0.02344001,
-      -0.07811758,   -0.08834428,  0.10094801,   0.16594367,     -0.06861939,
-      -0.021256343,  -0.041093912, -0.06669611,  0.035498552,    0.021757556,
-      -0.09302526,   -0.015403468, -0.06614931,  -0.051798206,   -0.013874718,
-      0.03630673,    0.010412845,  -0.08077351,  0.046185967,    0.0035662893,
-      0.03541868,    -0.094149634, -0.034814864, 0.003128424,    -0.020674974,
-      -0.03944324,   -0.008110165, -0.11113267,  0.08484226,     0.043586485,
-      0.040582247,   0.0968012,    -0.065249965, -0.028036479,   0.0050708856,
-      0.0017462453,  0.0326779,    0.041296225,  0.09164146,     -0.047743853,
-      -0.015952192,  -0.034451712, 0.084197424,  -0.05347844,    -0.11768019,
-      0.085926116,   -0.08251791,  -0.045081906, 0.0948852,      0.068401024,
-      0.024856757,   0.06978981,   -0.057309967, -0.012775832,   -0.0032452994,
-      0.01977615,    -0.041040014, -0.024264973, 0.063464895,    0.05431621,
-  });
-
-  lstm.SetCellToInputWeights(
-      {0.040369894, 0.030746894,  0.24704495,  0.018586371,  -0.037586458,
-       -0.15312155, -0.11812848,  -0.11465643, 0.20259799,   0.11418174,
-       -0.10116027, -0.011334949, 0.12411352,  -0.076769054, -0.052169047,
-       0.21198851,  -0.38871562,  -0.09061183, -0.09683246,  -0.21929175});
-
-  lstm.SetCellToForgetWeights(
-      {-0.01998659,  -0.15568835,  -0.24248174,   -0.012770197, 0.041331276,
-       -0.072311886, -0.052123554, -0.0066330447, -0.043891653, 0.036225766,
-       -0.047248036, 0.021479502,  0.033189066,   0.11952997,   -0.020432774,
-       0.64658105,   -0.06650122,  -0.03467612,   0.095340036,  0.23647355});
-
-  lstm.SetCellToOutputWeights(
-      {0.08286371,  -0.08261836, -0.51210177, 0.002913762, 0.17764764,
-       -0.5495371,  -0.08460716, -0.24552552, 0.030037103, 0.04123544,
-       -0.11940523, 0.007358328, 0.1890978,   0.4833202,   -0.34441817,
-       0.36312827,  -0.26375428, 0.1457655,   -0.19724406, 0.15548733});
-
-  lstm.SetProjectionWeights(
-      {-0.009802181,  0.09401916,    0.0717386,     -0.13895074,  0.09641832,
-       0.060420845,   0.08539281,    0.054285463,   0.061395317,  0.034448683,
-       -0.042991187,  0.019801661,   -0.16840284,   -0.015726732, -0.23041931,
-       -0.024478018,  -0.10959692,   -0.013875541,  0.18600968,   -0.061274476,
-       0.0138165,     -0.08160894,   -0.07661644,   0.032372914,  0.16169067,
-       0.22465782,    -0.03993472,   -0.004017731,  0.08633481,   -0.28869787,
-       0.08682067,    0.17240396,    0.014975425,   0.056431185,  0.031037588,
-       0.16702051,    0.0077946745,  0.15140012,    0.29405436,   0.120285,
-       -0.188994,     -0.027265169,  0.043389652,   -0.022061434, 0.014777949,
-       -0.20203483,   0.094781205,   0.19100232,    0.13987629,   -0.036132768,
-       -0.06426278,   -0.05108664,   0.13221376,    0.009441198,  -0.16715929,
-       0.15859416,    -0.040437475,  0.050779544,   -0.022187516, 0.012166504,
-       0.027685808,   -0.07675938,   -0.0055694645, -0.09444123,  0.0046453946,
-       0.050794356,   0.10770313,    -0.20790008,   -0.07149004,  -0.11425117,
-       0.008225835,   -0.035802525,  0.14374903,    0.15262283,   0.048710253,
-       0.1847461,     -0.007487823,  0.11000021,    -0.09542012,  0.22619456,
-       -0.029149994,  0.08527916,    0.009043713,   0.0042746216, 0.016261552,
-       0.022461696,   0.12689082,    -0.043589946,  -0.12035478,  -0.08361797,
-       -0.050666027,  -0.1248618,    -0.1275799,    -0.071875185, 0.07377272,
-       0.09944291,    -0.18897448,   -0.1593054,    -0.06526116,  -0.040107165,
-       -0.004618631,  -0.067624845,  -0.007576253,  0.10727444,   0.041546922,
-       -0.20424393,   0.06907816,    0.050412357,   0.00724631,   0.039827548,
-       0.12449835,    0.10747581,    0.13708383,    0.09134148,   -0.12617786,
-       -0.06428341,   0.09956831,    0.1208086,     -0.14676677,  -0.0727722,
-       0.1126304,     0.010139365,   0.015571211,   -0.038128063, 0.022913318,
-       -0.042050496,  0.16842307,    -0.060597885,  0.10531834,   -0.06411776,
-       -0.07451711,   -0.03410368,   -0.13393489,   0.06534304,   0.003620307,
-       0.04490757,    0.05970546,    0.05197996,    0.02839995,   0.10434969,
-       -0.013699693,  -0.028353551,  -0.07260381,   0.047201227,  -0.024575593,
-       -0.036445823,  0.07155557,    0.009672501,   -0.02328883,  0.009533515,
-       -0.03606021,   -0.07421458,   -0.028082801,  -0.2678904,   -0.13221288,
-       0.18419984,    -0.13012612,   -0.014588381,  -0.035059117, -0.04824723,
-       0.07830115,    -0.056184657,  0.03277091,    0.025466874,  0.14494097,
-       -0.12522776,   -0.098633975,  -0.10766018,   -0.08317623,  0.08594209,
-       0.07749552,    0.039474737,   0.1776665,     -0.07409566,  -0.0477268,
-       0.29323658,    0.10801441,    0.1154011,     0.013952499,  0.10739139,
-       0.10708251,    -0.051456142,  0.0074137426,  -0.10430189,  0.10034707,
-       0.045594677,   0.0635285,     -0.0715442,    -0.089667566, -0.10811871,
-       0.00026344223, 0.08298446,    -0.009525053,  0.006585689,  -0.24567553,
-       -0.09450807,   0.09648481,    0.026996298,   -0.06419476,  -0.04752702,
-       -0.11063944,   -0.23441927,   -0.17608605,   -0.052156363, 0.067035615,
-       0.19271925,    -0.0032889997, -0.043264326,  0.09663576,   -0.057112187,
-       -0.10100678,   0.0628376,     0.04447668,    0.017961001,  -0.10094388,
-       -0.10190601,   0.18335468,    0.10494553,    -0.052095775, -0.0026118709,
-       0.10539724,    -0.04383912,   -0.042349473,  0.08438151,   -0.1947263,
-       0.02251204,    0.11216432,    -0.10307853,   0.17351969,   -0.039091777,
-       0.08066188,    -0.00561982,   0.12633002,    0.11335965,   -0.0088127935,
-       -0.019777594,  0.06864014,    -0.059751723,  0.016233567,  -0.06894641,
-       -0.28651384,   -0.004228674,  0.019708522,   -0.16305895,  -0.07468996,
-       -0.0855457,    0.099339016,   -0.07580735,   -0.13775392,  0.08434318,
-       0.08330512,    -0.12131499,   0.031935584,   0.09180414,   -0.08876437,
-       -0.08049874,   0.008753825,   0.03498998,    0.030215185,  0.03907079,
-       0.089751154,   0.029194152,   -0.03337423,   -0.019092513, 0.04331237,
-       0.04299654,    -0.036394123,  -0.12915532,   0.09793732,   0.07512415,
-       -0.11319543,   -0.032502122,  0.15661901,    0.07671967,   -0.005491124,
-       -0.19379048,   -0.218606,     0.21448623,    0.017840758,  0.1416943,
-       -0.07051762,   0.19488361,    0.02664691,    -0.18104725,  -0.09334311,
-       0.15026465,    -0.15493552,   -0.057762887,  -0.11604192,  -0.262013,
-       -0.01391798,   0.012185008,   0.11156489,    -0.07483202,  0.06693364,
-       -0.26151478,   0.046425626,   0.036540434,   -0.16435726,  0.17338543,
-       -0.21401681,   -0.11385144,   -0.08283257,   -0.069031075, 0.030635102,
-       0.010969227,   0.11109743,    0.010919218,   0.027526086,  0.13519906,
-       0.01891392,    -0.046839405,  -0.040167913,  0.017953383,  -0.09700955,
-       0.0061885654,  -0.07000971,   0.026893595,   -0.038844477, 0.14543656});
-
-  static float lstm_input[][20] = {
-      {// Batch0: 4 (input_sequence_size) * 5 (n_input)
-       0.787926, 0.151646, 0.071352, 0.118426, 0.458058, 0.596268, 0.998386,
-       0.568695, 0.864524, 0.571277, 0.073204, 0.296072, 0.743333, 0.069199,
-       0.045348, 0.867394, 0.291279, 0.013714, 0.482521, 0.626339},
-
-      {// Batch1: 4 (input_sequence_size) * 5 (n_input)
-       0.295743, 0.544053, 0.690064, 0.858138, 0.497181, 0.642421, 0.524260,
-       0.134799, 0.003639, 0.162482, 0.640394, 0.930399, 0.050782, 0.432485,
-       0.988078, 0.082922, 0.563329, 0.865614, 0.333232, 0.259916}};
-
-  static float lstm_golden_output[][64] = {
-      {// Batch0: 4 (input_sequence_size) * 16 (n_output)
-       -0.00396806, 0.029352,     -0.00279226, 0.0159977,   -0.00835576,
-       -0.0211779,  0.0283512,    -0.0114597,  0.00907307,  -0.0244004,
-       -0.0152191,  -0.0259063,   0.00914318,  0.00415118,  0.017147,
-       0.0134203,   -0.0166936,   0.0381209,   0.000889694, 0.0143363,
-       -0.0328911,  -0.0234288,   0.0333051,   -0.012229,   0.0110322,
-       -0.0457725,  -0.000832209, -0.0202817,  0.0327257,   0.0121308,
-       0.0155969,   0.0312091,    -0.0213783,  0.0350169,   0.000324794,
-       0.0276012,   -0.0263374,   -0.0371449,  0.0446149,   -0.0205474,
-       0.0103729,   -0.0576349,   -0.0150052,  -0.0292043,  0.0376827,
-       0.0136115,   0.0243435,    0.0354492,   -0.0189322,  0.0464512,
-       -0.00251373, 0.0225745,    -0.0308346,  -0.0317124,  0.0460407,
-       -0.0189395,  0.0149363,    -0.0530162,  -0.0150767,  -0.0340193,
-       0.0286833,   0.00824207,   0.0264887,   0.0305169},
-      {// Batch1: 4 (input_sequence_size) * 16 (n_output)
-       -0.013869,    0.0287268,   -0.00334693, 0.00733398,  -0.0287926,
-       -0.0186926,   0.0193662,   -0.0115437,  0.00422612,  -0.0345232,
-       0.00223253,   -0.00957321, 0.0210624,   0.013331,    0.0150954,
-       0.02168,      -0.0141913,  0.0322082,   0.00227024,  0.0260507,
-       -0.0188721,   -0.0296489,  0.0399134,   -0.0160509,  0.0116039,
-       -0.0447318,   -0.0150515,  -0.0277406,  0.0316596,   0.0118233,
-       0.0214762,    0.0293641,   -0.0204549,  0.0450315,   -0.00117378,
-       0.0167673,    -0.0375007,  -0.0238314,  0.038784,    -0.0174034,
-       0.0131743,    -0.0506589,  -0.0048447,  -0.0240239,  0.0325789,
-       0.00790065,   0.0220157,   0.0333314,   -0.0264787,  0.0387855,
-       -0.000764675, 0.0217599,   -0.037537,   -0.0335206,  0.0431679,
-       -0.0211424,   0.010203,    -0.062785,   -0.00832363, -0.025181,
-       0.0412031,    0.0118723,   0.0239643,   0.0394009}};
+  lstm.SetInputToInputWeights(input_to_input_weights_);
+  lstm.SetInputToCellWeights(input_to_cell_weights_);
+  lstm.SetInputToForgetWeights(input_to_forget_weights_);
+  lstm.SetInputToOutputWeights(input_to_output_weights_);
+
+  lstm.SetInputGateBias(input_gate_bias_);
+  lstm.SetCellBias(cell_gate_bias_);
+  lstm.SetForgetGateBias(forget_gate_bias_);
+  lstm.SetOutputGateBias(output_gate_bias_);
+
+  lstm.SetRecurrentToInputWeights(recurrent_to_input_weights_);
+  lstm.SetRecurrentToCellWeights(recurrent_to_cell_weights_);
+  lstm.SetRecurrentToForgetWeights(recurrent_to_forget_weights_);
+  lstm.SetRecurrentToOutputWeights(recurrent_to_output_weights_);
+
+  lstm.SetCellToInputWeights(cell_to_input_weights_);
+  lstm.SetCellToForgetWeights(cell_to_forget_weights_);
+  lstm.SetCellToOutputWeights(cell_to_output_weights_);
+
+  lstm.SetProjectionWeights(projection_weights_);
 
   // Resetting cell_state and output_state
   lstm.ResetCellState();
   lstm.ResetOutputState();
 
-  const int input_sequence_size =
-      sizeof(lstm_input[0]) / sizeof(float) / (lstm.num_inputs());
-  for (int i = 0; i < input_sequence_size; i++) {
-    float* batch0_start = lstm_input[0] + i * lstm.num_inputs();
-    float* batch0_end = batch0_start + lstm.num_inputs();
+  VerifyGoldens(lstm_input_, lstm_golden_output_, &lstm);
+}
 
-    lstm.SetInput(0, batch0_start, batch0_end);
+TEST_F(NoCifgPeepholeProjectionClippingLstmTest, HybridLstmBlackBoxTest) {
+  const int n_batch = 2;
+  const int n_input = 5;
+  const int n_cell = 20;
+  const int n_output = 16;
 
-    float* batch1_start = lstm_input[1] + i * lstm.num_inputs();
-    float* batch1_end = batch1_start + lstm.num_inputs();
-    lstm.SetInput(lstm.num_inputs(), batch1_start, batch1_end);
+  HybridLSTMOpModel lstm(
+      n_batch, n_input, n_cell, n_output,
+      /*use_cifg=*/false, /*use_peephole=*/true,
+      /*use_projection_weights=*/true,
+      /*use_projection_bias=*/false,
+      /*cell_clip=*/0.0, /*proj_clip=*/0.0,
+      {
+          {n_batch, n_input},  // input tensor
+
+          {n_cell, n_input},  // input_to_input_weight tensor
+          {n_cell, n_input},  // input_to_forget_weight tensor
+          {n_cell, n_input},  // input_to_cell_weight tensor
+          {n_cell, n_input},  // input_to_output_weight tensor
+
+          {n_cell, n_output},  // recurrent_to_input_weight tensor
+          {n_cell, n_output},  // recurrent_to_forget_weight tensor
+          {n_cell, n_output},  // recurrent_to_cell_weight tensor
+          {n_cell, n_output},  // recurrent_to_output_weight tensor
+
+          {n_cell},  // cell_to_input_weight tensor
+          {n_cell},  // cell_to_forget_weight tensor
+          {n_cell},  // cell_to_output_weight tensor
+
+          {n_cell},  // input_gate_bias tensor
+          {n_cell},  // forget_gate_bias tensor
+          {n_cell},  // cell_bias tensor
+          {n_cell},  // output_gate_bias tensor
+
+          {n_output, n_cell},  // projection_weight tensor
+          {0},                 // projection_bias tensor
+      });
+
+  lstm.SetInputToInputWeights(input_to_input_weights_);
+  lstm.SetInputToCellWeights(input_to_cell_weights_);
+  lstm.SetInputToForgetWeights(input_to_forget_weights_);
+  lstm.SetInputToOutputWeights(input_to_output_weights_);
+
+  lstm.SetInputGateBias(input_gate_bias_);
+  lstm.SetCellBias(cell_gate_bias_);
+  lstm.SetForgetGateBias(forget_gate_bias_);
+  lstm.SetOutputGateBias(output_gate_bias_);
+
+  lstm.SetRecurrentToInputWeights(recurrent_to_input_weights_);
+  lstm.SetRecurrentToCellWeights(recurrent_to_cell_weights_);
+  lstm.SetRecurrentToForgetWeights(recurrent_to_forget_weights_);
+  lstm.SetRecurrentToOutputWeights(recurrent_to_output_weights_);
+
+  lstm.SetCellToInputWeights(cell_to_input_weights_);
+  lstm.SetCellToForgetWeights(cell_to_forget_weights_);
+  lstm.SetCellToOutputWeights(cell_to_output_weights_);
+
+  lstm.SetProjectionWeights(projection_weights_);
 
-    lstm.Invoke();
+  // Resetting cell_state and output_state
+  lstm.ResetCellState();
+  lstm.ResetOutputState();
 
-    float* golden_start_batch0 = lstm_golden_output[0] + i * lstm.num_outputs();
-    float* golden_end_batch0 = golden_start_batch0 + lstm.num_outputs();
-    float* golden_start_batch1 = lstm_golden_output[1] + i * lstm.num_outputs();
-    float* golden_end_batch1 = golden_start_batch1 + lstm.num_outputs();
-    std::vector<float> expected;
-    expected.insert(expected.end(), golden_start_batch0, golden_end_batch0);
-    expected.insert(expected.end(), golden_start_batch1, golden_end_batch1);
-    EXPECT_THAT(lstm.GetOutput(), ElementsAreArray(ArrayFloatNear(expected)));
-  }
+  VerifyGoldens(lstm_input_, lstm_golden_output_, &lstm, /*tolerance=*/0.00467);
 }
 
 }  // namespace
-- 
GitLab


From 2b5f598fbd822f911ad305ae1e57325aefd50826 Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev <ezhulenev@google.com>
Date: Tue, 5 Jun 2018 12:19:43 -0700
Subject: [PATCH 321/610] Move ReplaceMulWithSquare to a separate optimizer
 stage.

PiperOrigin-RevId: 199338297
---
 .../optimizers/arithmetic_optimizer.cc        | 68 ++++++++++++-------
 .../optimizers/arithmetic_optimizer.h         |  1 +
 .../optimizers/arithmetic_optimizer_test.cc   | 47 +++++++------
 3 files changed, 73 insertions(+), 43 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
index 400af82627..561930f858 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
@@ -2079,6 +2079,49 @@ class FoldMultiplyIntoConv : public ArithmeticOptimizerStage {
   }
 };
 
+// Replace Mul node with identical inputs with a Square.
+class ReplaceMulWithSquare : public ArithmeticOptimizerStage {
+ public:
+  explicit ReplaceMulWithSquare(const GraphOptimizerContext& ctx,
+                                const ArithmeticOptimizerContext& ctx_ext)
+      : ArithmeticOptimizerStage("ReplaceMulWithSquare", ctx, ctx_ext) {}
+  ~ReplaceMulWithSquare() override = default;
+
+  bool IsSupported(const NodeDef* node) const override {
+    return IsMul(*node) && node->input(0) == node->input(1);
+  }
+
+  Status TrySimplify(NodeDef* node, string* simplified_node_name) override {
+    const NodeScopeAndName mul = ParseNodeScopeAndName(node->name());
+    const string optimized_node_name = OptimizedNodeName(mul);
+    if (ctx().node_map->NodeExists(optimized_node_name)) return Status::OK();
+
+    const DataType type = GetDataTypeFromAttr(*node, "T");
+    bool is_complex = (type == DT_COMPLEX64) || (type == DT_COMPLEX128);
+
+    string task;
+    string device;
+    bool is_on_cpu =
+        DeviceNameUtils::SplitDeviceName(node->device(), &task, &device) &&
+        str_util::StrContains(device, DEVICE_CPU);
+
+    if (!is_complex || is_on_cpu) {
+      NodeDef* new_square_node = AddCopyNode(optimized_node_name, node);
+      new_square_node->set_op("Square");
+      for (int i = 1; i < new_square_node->input_size(); ++i) {
+        new_square_node->set_input(i - 1, new_square_node->input(i));
+      }
+      new_square_node->mutable_input()->RemoveLast();
+      for (const string& input : new_square_node->input()) {
+        ctx().node_map->AddOutput(NodeName(input), new_square_node->name());
+      }
+      *simplified_node_name = new_square_node->name();
+    }
+
+    return Status::OK();
+  }
+};
+
 }  // namespace
 
 class UniqueNodes {
@@ -2331,29 +2374,6 @@ void ArithmeticOptimizer::ForwardControlDependencies(
 // ArithmeticOptimizerStage
 string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
     const NodeDef* node, SetVector<NodeDef*>* nodes_to_simplify) {
-  if (node->op() == "Mul" && node->input(0) == node->input(1) &&
-      !OptimizedNodeExists(*node, "square")) {
-    const DataType type = GetDataTypeFromAttr(*node, "T");
-    bool is_complex = (type == DT_COMPLEX64) || (type == DT_COMPLEX128);
-    string dontcare;
-    string device;
-    bool is_on_cpu =
-        DeviceNameUtils::SplitDeviceName(node->device(), &dontcare, &device) &&
-        str_util::StrContains(device, DEVICE_CPU);
-    if (!is_complex || is_on_cpu) {
-      NodeDef* new_square_node = AddNode(*node, "square", /*copy_node=*/true);
-      new_square_node->set_op("Square");
-      for (int i = 1; i < new_square_node->input_size(); ++i) {
-        new_square_node->set_input(i - 1, new_square_node->input(i));
-      }
-      new_square_node->mutable_input()->RemoveLast();
-      for (const string& input : new_square_node->input()) {
-        node_map_->AddOutput(NodeName(input), new_square_node->name());
-      }
-      return new_square_node->name();
-    }
-  }
-
   if (IsAggregate(*node) && NumNonControlInputs(*node) > 0) {
     // Discard aggregate nodes with a single input and no control dependencies.
     if (node->input_size() == 1) {
@@ -2528,6 +2548,8 @@ Status ArithmeticOptimizer::SimplifyArithmeticOps(bool can_use_shapes) {
     pipeline.AddStage<RemoveRedundantReshape>(ctx, ctx_ext);
   if (options_.remove_negation)
     pipeline.AddStage<RemoveNegationStage>(ctx, ctx_ext);
+  if (options_.replace_mul_with_square)
+    pipeline.AddStage<ReplaceMulWithSquare>(ctx, ctx_ext);
   if (options_.remove_logical_not)
     pipeline.AddStage<RemoveLogicalNotStage>(ctx, ctx_ext);
   if (options_.reorder_cast_and_transpose)
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
index e6fc311929..8e00b83a70 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
@@ -74,6 +74,7 @@ class ArithmeticOptimizer : public GraphOptimizer {
     bool remove_redundant_cast = true;
     bool remove_redundant_reshape = true;
     bool reorder_cast_and_transpose = true;
+    bool replace_mul_with_square = true;
 
     // Choose which arithmetic optimizer stages will be enabled for a given
     // optimization level by default.
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
index b9fec0f860..f15cbfe407 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
@@ -139,6 +139,7 @@ class ArithmeticOptimizerTest : public GrapplerTest {
     options.remove_negation = false;
     options.remove_logical_not = false;
     options.reorder_cast_and_transpose = false;
+    options.replace_mul_with_square = false;
     optimizer->options_ = options;
   }
 
@@ -201,6 +202,11 @@ class ArithmeticOptimizerTest : public GrapplerTest {
     optimizer->options_.reorder_cast_and_transpose = true;
   }
 
+  void EnableOnlyReplaceMulWithSquare(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.replace_mul_with_square = true;
+  }
+
   void EnableOnlyHoistCWiseUnaryChains(ArithmeticOptimizer* optimizer) {
     DisableAllStages(optimizer);
     optimizer->options_.hoist_cwise_unary_chains = true;
@@ -345,33 +351,36 @@ TEST_F(ArithmeticOptimizerTest, OpDedupCommutative) {
   test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
 }
 
-TEST_F(ArithmeticOptimizerTest, MulToSquare) {
+TEST_F(ArithmeticOptimizerTest, ReplaceMulWithSquare) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   Output c = ops::Const(s.WithOpName("c"), {1.0f, 2.0f}, {1, 2});
   Output d = ops::Const(s.WithOpName("d"), {3.0f, 4.0f}, {1, 2});
   Output mul = ops::Mul(s.WithControlDependencies(d).WithOpName("mul"), c, c);
   Output id = ops::Identity(s.WithOpName("id"), mul);
+
   GrapplerItem item;
+  item.fetch = {"id"};
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
-  std::vector<string> fetch = {"id"};
-  auto tensors_expected = EvaluateNodes(item.graph, fetch);
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
   EXPECT_EQ(1, tensors_expected.size());
 
-  ArithmeticOptimizer optimizer;
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
+  ArithmeticOptimizer optimizer;
+  EnableOnlyReplaceMulWithSquare(&optimizer);
+  OptimizeAndPrune(&optimizer, &item, &output);
 
-  EXPECT_EQ(5, output.node_size());
-  EXPECT_EQ("id", output.node(3).name());
-  EXPECT_EQ(OptimizedName("mul_square"), output.node(3).input(0));
-  EXPECT_EQ("Square", output.node(4).op());
-  EXPECT_EQ(OptimizedName("mul_square"), output.node(4).name());
-  EXPECT_EQ(2, output.node(4).input_size());
-  EXPECT_EQ("c", output.node(4).input(0));
-  EXPECT_EQ("^d", output.node(4).input(1));
+  EXPECT_EQ(4, output.node_size());
 
-  auto tensors = EvaluateNodes(output, fetch);
+  NodeMap node_map(&output);
+  const string p = "ArithmeticOptimizer/ReplaceMulWithSquare";
+  const NodeDef* square_node = node_map.GetNode(strings::StrCat(p, "_", "mul"));
+
+  ASSERT_NE(square_node, nullptr);
+  EXPECT_EQ("Square", square_node->op());
+  EXPECT_EQ("c", square_node->input(0));
+  EXPECT_EQ("^d", square_node->input(1));
+
+  auto tensors = EvaluateNodes(output, item.fetch);
   EXPECT_EQ(1, tensors.size());
   test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
 }
@@ -386,12 +395,10 @@ TEST_F(ArithmeticOptimizerTest, RemoveInvolution_AdjacentNodes) {
   auto recip2 = ops::Reciprocal(s.WithOpName("recip2"), recip1);
   auto id = ops::Identity(s.WithOpName("id"), recip2);
 
-  std::vector<string> fetch = {"id"};
-
   GrapplerItem item;
-  item.fetch = fetch;
+  item.fetch = {"id"};
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
-  auto tensors_expected = EvaluateNodes(item.graph, fetch);
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
   EXPECT_EQ(1, tensors_expected.size());
 
   GraphDef output;
@@ -404,7 +411,7 @@ TEST_F(ArithmeticOptimizerTest, RemoveInvolution_AdjacentNodes) {
   EXPECT_EQ("id", output.node(1).name());
   EXPECT_EQ("c", output.node(1).input(0));
 
-  auto tensors = EvaluateNodes(output, fetch);
+  auto tensors = EvaluateNodes(output, item.fetch);
   EXPECT_EQ(1, tensors.size());
   test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
 }
-- 
GitLab


From a1e258706972fb8c686434163b4f939010deab34 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 5 Jun 2018 12:32:18 -0700
Subject: [PATCH 322/610] Fixing typo in Subtract Kernel.

PiperOrigin-RevId: 199340127
---
 tensorflow/contrib/lite/kernels/sub.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/lite/kernels/sub.cc b/tensorflow/contrib/lite/kernels/sub.cc
index d788159a8d..bdcaab8e2f 100644
--- a/tensorflow/contrib/lite/kernels/sub.cc
+++ b/tensorflow/contrib/lite/kernels/sub.cc
@@ -175,7 +175,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
                                output);
   } else {
     context->ReportError(
-        context, "output type %d is not support, requires float|uint8 types.",
+        context, "output type %d is not supported, requires float|uint8 types.",
         output->type);
     return kTfLiteError;
   }
-- 
GitLab


From 397f04acb1faeff451691d7fdc0f754eeb547cc1 Mon Sep 17 00:00:00 2001
From: Pete Warden <pete@petewarden.com>
Date: Tue, 5 Jun 2018 12:41:22 -0700
Subject: [PATCH 323/610] Fix for Raspberry Pi build breakage (#19782)

---
 tensorflow/contrib/lite/toco/toco_port.cc | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/tensorflow/contrib/lite/toco/toco_port.cc b/tensorflow/contrib/lite/toco/toco_port.cc
index 49a3302caf..3a5911c28d 100644
--- a/tensorflow/contrib/lite/toco/toco_port.cc
+++ b/tensorflow/contrib/lite/toco/toco_port.cc
@@ -18,12 +18,10 @@ limitations under the License.
 #include "tensorflow/contrib/lite/toco/toco_types.h"
 #include "tensorflow/core/platform/logging.h"
 
-#ifdef __ARM_ARCH_7A__
+#if defined(__ANDROID__) && defined(__ARM_ARCH_7A__)
 namespace std {
-double round(double x) {
-  return ::round(x);
-}
-}
+double round(double x) { return ::round(x); }
+}  // namespace std
 #endif
 
 namespace toco {
-- 
GitLab


From b7928ac78d3cd688967bcf4e5253e384b355070f Mon Sep 17 00:00:00 2001
From: Jianwei Xie <xiejw@google.com>
Date: Tue, 5 Jun 2018 12:42:44 -0700
Subject: [PATCH 324/610] Clarifies how to pass training hooks to TPUEstimator
 in the docstring for TPUEstimator.

PiperOrigin-RevId: 199341721
---
 .../contrib/tpu/python/tpu/tpu_estimator.py   | 83 ++++++++++++++-----
 1 file changed, 64 insertions(+), 19 deletions(-)

diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
index f63e9e8bda..64ae35dfc5 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
@@ -122,6 +122,33 @@ def _create_global_step(graph):
 
 
 def _create_or_get_iterations_per_loop():
+  """Creates or gets the iterations_per_loop variable.
+
+  In TPUEstimator, the user provided computation, the model_fn, is wrapped
+  inside a tf.while_loop for peak performance. The iterations of the loop are
+  specified by this variable, which adjusts its value on the CPU after each TPU
+  program execution and before the next TPU execution.
+
+  The purpose of using a variable, rather than a constant, is to allow
+  TPUEstimator to adapt the TPU training iterations according to the final steps
+  specified by users. For example, if the user sets the iterations_per_loop as 4
+  in TPUConfig and steps as 10 in TPUEstimator.train(), the iterations_per_loop
+  variable will have the following value before each TPU training.
+
+      - 1st TPU execution: iterations_per_loop = 4
+      - 2nd TPU execution: iterations_per_loop = 4
+      - 3rd TPU execution: iterations_per_loop = 2
+
+  As model_fn increases the global step once per train_op invocation, the global
+  step is 10 after all TPU executions, matching the steps=10 inputs passed in by
+  users.
+
+  Returns:
+    A TF non-trainable resource variable.
+
+  Raises:
+    RuntimeError: If multiple iterations_per_loop variables were found.
+  """
   graph = ops.get_default_graph()
   collection_name = '{}_{}'.format(_TPU_ESTIMATOR, _ITERATIONS_PER_LOOP_VAR)
   iter_vars = graph.get_collection(collection_name)
@@ -388,20 +415,21 @@ class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook):
       return
 
     def _cancel_session():
-      # Close the session to avoid the main thread from hanging. If input
-      # pipeline triggers any error, the infeed thread dies but the main thread
-      # for TPU computation waits for the infeed enqueue forever. Close the
-      # Session to cancel the main thread Session.run execution.
-      #
-      # We sleep for a few seconds before closing to give some time
-      # for the TPU compilation error, if any, propagating, from TPU to CPU
-      # host. Compilation errors should be reported by the main thread so that
-      # the program can be interrupted and users can take action.  Due to a race
-      # condition, the infeed thread might see an error first.  Closing the
-      # session here immediately would result in a session cancellation
-      # exception in the main thread, instead of the expected compile error.
-      # User code that depends on having the proper exception type will
-      # therefore be confused.
+      """Close the session to prevent the main thread from hanging.
+
+      If input pipeline triggers any error, the infeed thread dies but the main
+      thread for TPU computation waits for the infeed enqueue forever. Close the
+      Session to cancel the main thread Session.run execution.
+
+      We sleep for a few seconds before closing to give a TPU compilation
+      error, if any, time to propagate from the TPU to the CPU host. Compilation
+      errors should be reported by the main thread so that the program can be
+      interrupted and users can take action.  Due to a race condition, the
+      infeed thread might see an error first.  Closing the session here
+      immediately would result in a session cancellation exception in the main
+      thread, instead of the expected compile error.  User code that depends on
+      having the proper exception type will therefore be confused.
+      """
       time.sleep(5)
 
       # If the main session is still running, the infeed/outfeed errors are
@@ -721,6 +749,15 @@ def generate_per_host_enqueue_ops_fn_for_host(
     tpu_ordinal_function = None
 
   def enqueue_ops_fn():
+    """A Fn returning the TPU infeed enqueue ops.
+
+    By providing as a Fn, it can be invoked inside the tf.while_loop such that
+    the input pipeline for multiple iterations can be executed by one
+    Session.run call.
+
+    Returns:
+      list of dict of ops.
+    """
     with ops.device(device):
       num_of_replicas_per_host = ctx.num_of_replicas_per_host
       # Convert user input to features and labels.  If the user returns a
@@ -1095,10 +1132,16 @@ class _InputPipeline(object):
     return enqueue_ops, all_hooks, run_infeed_loop_on_coordinator
 
   def _validate_input_pipeline(self):
-    # Perform some sanity checks to log user friendly information. We should
-    # error out to give users better error message. But, if
-    # _WRAP_INPUT_FN_INTO_WHILE_LOOP is False (legacy behavior), we cannot break
-    # user code, so, log a warning.
+    """Validates the input pipeline.
+
+    Perform some sanity checks to log user-friendly information. We should
+    error out to give users a better error message. But, if
+    _WRAP_INPUT_FN_INTO_WHILE_LOOP is False (legacy behavior), we cannot break
+    user code, so, log a warning.
+
+    Raises:
+      RuntimeError: If the validation failed.
+    """
     if ops.get_default_graph().get_collection(ops.GraphKeys.QUEUE_RUNNERS):
       err_msg = ('Input pipeline contains one or more QueueRunners. '
                  'It could be slow and not scalable. Please consider '
@@ -1837,7 +1880,8 @@ class TPUEstimator(estimator_lib.Estimator):
     Args:
       model_fn: Model function as required by `Estimator`. For training, the
         returned `EstimatorSpec` cannot have hooks as it is not supported in
-        `TPUEstimator`.
+        `TPUEstimator`. Instead, the user can pass the training hooks as
+        an argument to `TPUEstimator.train()`.
       model_dir: Directory to save model parameters, graph and etc. This can
         also be used to load checkpoints from the directory into a estimator to
         continue training a previously saved model. If `None`, the model_dir in
@@ -2898,6 +2942,7 @@ class _StopSignals(object):
 
   @staticmethod
   def should_stop(scalar_stopping_signal):
+    """Detects whether scalar_stopping_signal indicates stopping."""
     if isinstance(scalar_stopping_signal, ops.Tensor):
       # STOPPING_SIGNAL is a constant True. Here, the logical_and is just the TF
       # way to express the bool check whether scalar_stopping_signal is True.
-- 
GitLab


From c681be04ec15cdfc225bc61132420781bf23d298 Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev <ezhulenev@google.com>
Date: Tue, 5 Jun 2018 13:12:02 -0700
Subject: [PATCH 325/610] Move SimplifyAggregation to separate aggregation
 stage.

PiperOrigin-RevId: 199346067
---
 .../optimizers/arithmetic_optimizer.cc        | 171 +++++++++++-------
 .../optimizers/arithmetic_optimizer.h         |   1 +
 .../optimizers/arithmetic_optimizer_test.cc   |  68 +++++--
 3 files changed, 154 insertions(+), 86 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
index 561930f858..2408652c87 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
@@ -2122,6 +2122,109 @@ class ReplaceMulWithSquare : public ArithmeticOptimizerStage {
   }
 };
 
+// Simplify aggregation (e.g. AddN) nodes:
+//
+// 1. Discard aggregate nodes with a single input and no control dependencies.
+//
+// 2. Try to rewrite aggregations of N >= 2 identical terms (possibly due to
+//    deduping or other rewrites) so we can get rid of the sum entirely.
+//
+//    The expression (using AddN as an example of an aggregate op):
+//      AddN(x, x, x, ... ,x)
+//           <-- N terms -->
+//    can be rewritten to:
+//      Mul(Const(N), x)
+//
+class SimplifyAggregation : public ArithmeticOptimizerStage {
+ public:
+  explicit SimplifyAggregation(const GraphOptimizerContext& ctx,
+                               const ArithmeticOptimizerContext& ctx_ext)
+      : ArithmeticOptimizerStage("SimplifyAggregation", ctx, ctx_ext) {}
+  ~SimplifyAggregation() override = default;
+
+  bool IsSupported(const NodeDef* node) const override {
+    return IsAggregate(*node) && NumNonControlInputs(*node) > 0;
+  }
+
+  Status TrySimplify(NodeDef* node, string* simplified_node_name) override {
+    // 1. Discard aggregate nodes with a single input and no control deps.
+    if (node->input_size() == 1) {
+      *simplified_node_name = node->input(0);
+      return Status::OK();
+    }
+
+    // 2. Rewrite aggregations of N >= 2 identical terms.
+
+    // All non-control inputs must be identical.
+    bool all_equal = true;
+    int num_inputs = 1;
+    for (int i = 1; i < node->input_size(); ++i) {
+      if (IsControlInput(node->input(i))) break;
+      ++num_inputs;
+      if (node->input(i) != node->input(0)) {
+        all_equal = false;
+        break;
+      }
+    }
+    if (!all_equal) return Status::OK();
+
+    // The node must not have been optimized already.
+    const NodeScopeAndName node_scope_and_name =
+        ParseNodeScopeAndName(node->name());
+    const string optimized_const_name =
+        OptimizedNodeName(node_scope_and_name, "Const");
+    const string optimized_mul_name =
+        OptimizedNodeName(node_scope_and_name, "Mul");
+
+    bool is_already_optimized =
+        ctx().node_map->NodeExists(optimized_const_name) ||
+        ctx().node_map->NodeExists(optimized_mul_name);
+
+    if (is_already_optimized) return Status::OK();
+
+    // All preconditions are met at this point, so the rewrite is safe.
+    VLOG(3) << "Simplify aggregation with identical inputs: node="
+            << node->name() << " num_inputs=" << num_inputs;
+
+    // 1. Create constant node with value N.
+    const auto type = GetDataTypeFromAttr(*node, "T");
+    Tensor t(type, TensorShape({}));
+    Status status = SetTensorValue(type, num_inputs, &t);
+    if (!status.ok()) {
+      return errors::Internal("Failed to create const node: ",
+                              status.error_message());
+    }
+
+    TensorValue value(&t);
+    NodeDef* new_const_node = AddEmptyNode(optimized_const_name);
+    status = ConstantFolding::CreateNodeDef(new_const_node->name(), value,
+                                            new_const_node);
+    if (!status.ok()) {
+      return errors::Internal("Failed to create const node: ",
+                              status.error_message());
+    }
+    new_const_node->set_device(node->device());
+    MaybeAddControlInput(NodeName(node->input(0)), new_const_node,
+                         ctx().optimized_graph, ctx().node_map);
+    AddToOptimizationQueue(new_const_node);
+
+    // 2. Replace the aggregate node with Mul(Const(N), x).
+    NodeDef* new_mul_node = AddEmptyNode(optimized_mul_name);
+    new_mul_node->set_op("Mul");
+    new_mul_node->set_device(node->device());
+    SetDataTypeToAttr(type, "T", new_mul_node);
+    new_mul_node->add_input(new_const_node->name());
+    ctx().node_map->AddOutput(new_const_node->name(), new_mul_node->name());
+    new_mul_node->add_input(node->input(0));
+    ctx().node_map->AddOutput(node->input(0), new_mul_node->name());
+
+    ForwardControlDependencies(new_mul_node, {node});
+    *simplified_node_name = new_mul_node->name();
+
+    return Status::OK();
+  }
+};
+
 }  // namespace
 
 class UniqueNodes {
@@ -2374,72 +2477,6 @@ void ArithmeticOptimizer::ForwardControlDependencies(
 // ArithmeticOptimizerStage
 string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
     const NodeDef* node, SetVector<NodeDef*>* nodes_to_simplify) {
-  if (IsAggregate(*node) && NumNonControlInputs(*node) > 0) {
-    // Discard aggregate nodes with a single input and no control dependencies.
-    if (node->input_size() == 1) {
-      return node->input(0);
-    }
-
-    // Try to rewrite aggregations of N >= 2 identical terms (possibly due
-    // to deduping or other rewrites) so we can get rid of the sum entirely.
-    // The expression (using AddN as an example of an aggregate op):
-    //   AddN(x, x, x, ... ,x)
-    //        <-- N terms -->
-    // can be rewritten to
-    //   Mul(Const(N), x))
-    //
-    bool all_equal = true;
-    int num_inputs = 1;
-    for (int i = 1; i < node->input_size(); ++i) {
-      if (IsControlInput(node->input(i))) {
-        break;
-      }
-      ++num_inputs;
-      if (node->input(i) != node->input(0)) {
-        all_equal = false;
-        break;
-      }
-    }
-    if (all_equal && !OptimizedNodeExists(*node, "const") &&
-        !OptimizedNodeExists(*node, "mul")) {
-      // 1. Create constant node with value N.
-      const auto type = GetDataTypeFromAttr(*node, "T");
-      Tensor t(type, TensorShape({}));
-      Status status = SetTensorValue(type, num_inputs, &t);
-      if (!status.ok()) {
-        LOG(WARNING) << "Failed to create const node: "
-                     << status.error_message();
-        return "";
-      }
-      TensorValue value(&t);
-      NodeDef* new_const_node = AddNode(*node, "const", /*copy_node=*/false);
-      status = ConstantFolding::CreateNodeDef(new_const_node->name(), value,
-                                              new_const_node);
-      if (!status.ok()) {
-        LOG(WARNING) << "Failed to create const node: "
-                     << status.error_message();
-        return "";
-      }
-      new_const_node->set_device(node->device());
-      MaybeAddControlInput(NodeName(node->input(0)), new_const_node,
-                           optimized_graph_, node_map_.get());
-      nodes_to_simplify->PushBack(new_const_node);
-
-      // 2. Replace the aggregate node with Mul(Const(N), x).
-      NodeDef* new_mul_node = AddNode(*node, "mul", /*copy_node=*/false);
-      new_mul_node->set_op("Mul");
-      new_mul_node->set_device(node->device());
-      SetDataTypeToAttr(type, "T", new_mul_node);
-      new_mul_node->add_input(new_const_node->name());
-      node_map_->AddOutput(new_const_node->name(), new_mul_node->name());
-      new_mul_node->add_input(node->input(0));
-      node_map_->AddOutput(node->input(0), new_mul_node->name());
-
-      ForwardControlDependencies(new_mul_node, {node});
-      return new_mul_node->name();
-    }
-  }
-
   // Fold Transpose into matrix multiplication.
   if ((node->op() == "MatMul" || node->op() == "SparseMatMul" ||
        node->op() == "BatchMatMul") &&
@@ -2554,6 +2591,8 @@ Status ArithmeticOptimizer::SimplifyArithmeticOps(bool can_use_shapes) {
     pipeline.AddStage<RemoveLogicalNotStage>(ctx, ctx_ext);
   if (options_.reorder_cast_and_transpose)
     pipeline.AddStage<ReorderCastAndTranspose>(ctx, ctx_ext);
+  if (options_.simplify_aggregation)
+    pipeline.AddStage<SimplifyAggregation>(ctx, ctx_ext);
   if (options_.hoist_cwise_unary_chains)
     pipeline.AddStage<HoistCWiseUnaryChainsStage>(ctx, ctx_ext);
   if (options_.convert_sqrt_div_to_rsqrt_mul)
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
index 8e00b83a70..549ea3fde5 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
@@ -75,6 +75,7 @@ class ArithmeticOptimizer : public GraphOptimizer {
     bool remove_redundant_reshape = true;
     bool reorder_cast_and_transpose = true;
     bool replace_mul_with_square = true;
+    bool simplify_aggregation = true;
 
     // Choose which arithmetic optimizer stages will be enabled for a given
     // optimization level by default.
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
index f15cbfe407..f79347cde6 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
@@ -40,21 +40,37 @@ constexpr char kHoistFactorOptimizerMul[] =
 constexpr char kHoistFactorOptimizerAdd[] =
     "ArithmeticOptimizer/HoistCommonFactor_Add_";
 
-// Optimized name of outer Mul node by HoistCommonFactorOutOfAggregation
+constexpr char kSimplifyAggregationConst[] =
+    "ArithmeticOptimizer/SimplifyAggregation_Const_";
+
+constexpr char kSimplifyAggregationMul[] =
+    "ArithmeticOptimizer/SimplifyAggregation_Mul_";
+
+// Optimized name of outer Mul node by HoistCommonFactorOutOfAggregation.
 string HoistMulName(const string& name) {
   return AddPrefixToNodeName(name, kHoistFactorOptimizerMul, "");
 }
 
-// Optimized name of outer Div node by HoistCommonFactorOutOfAggregation
+// Optimized name of outer Div node by HoistCommonFactorOutOfAggregation.
 string HoistDivName(const string& name) {
   return AddPrefixToNodeName(name, kHoistFactorOptimizerDiv, "");
 }
 
-// Optimized name of inner Add node by HoistCommonFactorOutOfAggregation
+// Optimized name of inner Add node by HoistCommonFactorOutOfAggregation.
 string HoistAddName(const string& name) {
   return AddPrefixToNodeName(name, kHoistFactorOptimizerAdd, "");
 }
 
+// Optimized name of Const node by SimplifyAggregation.
+string AggregationConstName(const string& name) {
+  return AddPrefixToNodeName(name, kSimplifyAggregationConst, "");
+}
+
+// Optimized name of Mul node by SimplifyAggregation.
+string AggregationMulName(const string& name) {
+  return AddPrefixToNodeName(name, kSimplifyAggregationMul, "");
+}
+
 string OptimizedName(const string& name) {
   return AddPrefixToNodeName(name, kArithmeticOptimizer);
 }
@@ -140,6 +156,7 @@ class ArithmeticOptimizerTest : public GrapplerTest {
     options.remove_logical_not = false;
     options.reorder_cast_and_transpose = false;
     options.replace_mul_with_square = false;
+    options.simplify_aggregation = false;
     optimizer->options_ = options;
   }
 
@@ -226,6 +243,11 @@ class ArithmeticOptimizerTest : public GrapplerTest {
     DisableAllStages(optimizer);
     optimizer->options_.remove_logical_not = true;
   }
+
+  void EnableOnlySimplifyAggregation(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.simplify_aggregation = true;
+  }
 };
 
 TEST_F(ArithmeticOptimizerTest, NoOp) {
@@ -500,10 +522,10 @@ TEST_F(ArithmeticOptimizerTest, TrivialSumsSimple) {
   Output id = ops::Identity(s.WithOpName("id"), add);
 
   GrapplerItem item;
+  item.fetch = {"id"};
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
-  std::vector<string> fetch = {"id"};
-  auto tensors_expected = EvaluateNodes(item.graph, fetch);
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
   EXPECT_EQ(1, tensors_expected.size());
 
   ArithmeticOptimizer optimizer;
@@ -513,22 +535,25 @@ TEST_F(ArithmeticOptimizerTest, TrivialSumsSimple) {
 
   EXPECT_EQ(5, output.node_size());
 
-  const NodeDef* new_const = node_map.GetNode(OptimizedName("add_const"));
+  const string optimized_const_name = AggregationConstName("add");
+  const string optimized_mul_name = AggregationMulName("add");
+
+  const NodeDef* new_const = node_map.GetNode(optimized_const_name);
   ASSERT_NE(new_const, nullptr);
   EXPECT_EQ("^x", new_const->input(0));
   EXPECT_EQ(std::string("\0\0\0@", 4),
             new_const->attr().at("value").tensor().tensor_content());
 
-  const NodeDef* new_mul = node_map.GetNode(OptimizedName("add_mul"));
+  const NodeDef* new_mul = node_map.GetNode(optimized_mul_name);
   ASSERT_NE(new_mul, nullptr);
-  EXPECT_EQ(OptimizedName("add_const"), new_mul->input(0));
+  EXPECT_EQ(optimized_const_name, new_mul->input(0));
   EXPECT_EQ("x", new_mul->input(1));
 
   const NodeDef* new_id = node_map.GetNode("id");
   ASSERT_NE(new_id, nullptr);
-  EXPECT_EQ(OptimizedName("add_mul"), new_id->input(0));
+  EXPECT_EQ(optimized_mul_name, new_id->input(0));
 
-  auto tensors = EvaluateNodes(output, fetch);
+  auto tensors = EvaluateNodes(output, item.fetch);
   EXPECT_EQ(1, tensors.size());
   test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
 }
@@ -554,21 +579,24 @@ TEST_F(ArithmeticOptimizerTest, TrivialSumsSimpleWithControlDep) {
 
   EXPECT_EQ(6, output.node_size());
 
-  const NodeDef* new_const = node_map.GetNode(OptimizedName("add_const"));
+  const string optimized_const_name = AggregationConstName("add");
+  const string optimized_mul_name = AggregationMulName("add");
+
+  const NodeDef* new_const = node_map.GetNode(optimized_const_name);
   ASSERT_NE(new_const, nullptr);
   EXPECT_EQ("^x", new_const->input(0));
   EXPECT_EQ(std::string("\0\0\0@", 4),
             new_const->attr().at("value").tensor().tensor_content());
 
-  const NodeDef* new_mul = node_map.GetNode(OptimizedName("add_mul"));
+  const NodeDef* new_mul = node_map.GetNode(optimized_mul_name);
   ASSERT_NE(new_mul, nullptr);
-  EXPECT_EQ(OptimizedName("add_const"), new_mul->input(0));
+  EXPECT_EQ(optimized_const_name, new_mul->input(0));
   EXPECT_EQ("x", new_mul->input(1));
   EXPECT_EQ("^y", new_mul->input(2));
 
   const NodeDef* new_id = node_map.GetNode("id");
   ASSERT_NE(new_id, nullptr);
-  EXPECT_EQ(OptimizedName("add_mul"), new_id->input(0));
+  EXPECT_EQ(optimized_mul_name, new_id->input(0));
 
   auto tensors = EvaluateNodes(output, fetch);
   EXPECT_EQ(1, tensors.size());
@@ -633,24 +661,24 @@ TEST_F(ArithmeticOptimizerTest, TrivialSumsRepeatedAdd) {
   ASSERT_NE(add_4_node, nullptr);
   EXPECT_EQ("Add", add_4_node->op());
   EXPECT_EQ(2, add_4_node->input_size());
-  EXPECT_EQ(OptimizedName("Add_const"), add_4_node->input(0));
-  EXPECT_EQ(OptimizedName("Add_1_const"), add_4_node->input(1));
+  EXPECT_EQ(AggregationConstName("Add"), add_4_node->input(0));
+  EXPECT_EQ(AggregationConstName("Add_1"), add_4_node->input(1));
 
   const NodeDef* add_5_node = node_map.GetNode(HoistAddName("Add_5"));
   ASSERT_NE(add_5_node, nullptr);
   EXPECT_EQ("Add", add_5_node->op());
   EXPECT_EQ(2, add_5_node->input_size());
-  EXPECT_EQ(OptimizedName("Add_const"), add_5_node->input(0));
-  EXPECT_EQ(OptimizedName("Add_1_const"), add_5_node->input(1));
+  EXPECT_EQ(AggregationConstName("Add"), add_5_node->input(0));
+  EXPECT_EQ(AggregationConstName("Add_1"), add_5_node->input(1));
 
-  const NodeDef* add_const_node = node_map.GetNode(OptimizedName("Add_const"));
+  const NodeDef* add_const_node = node_map.GetNode(AggregationConstName("Add"));
   ASSERT_NE(add_const_node, nullptr);
   EXPECT_EQ("Const", add_const_node->op());
   EXPECT_EQ(1, add_const_node->input_size());
   EXPECT_EQ("^Placeholder", add_const_node->input(0));
 
   const NodeDef* add_1_const_node =
-      node_map.GetNode(OptimizedName("Add_1_const"));
+      node_map.GetNode(AggregationConstName("Add_1"));
   ASSERT_NE(add_1_const_node, nullptr);
   EXPECT_EQ("Const", add_1_const_node->op());
   EXPECT_EQ(1, add_1_const_node->input_size());
-- 
GitLab


From 1bac6186e19353d9881584ce8ec51bf35d627842 Mon Sep 17 00:00:00 2001
From: Skye Wanderman-Milne <skyewm@google.com>
Date: Tue, 5 Jun 2018 13:16:57 -0700
Subject: [PATCH 326/610] Introduce tf.contrib.control_flow.new_cond.

new_cond is a new implementation of tf.cond. Instead of emitting
control flow ops (i.e. Switch and Merge nodes), new_cond emits a
single If op, which represents the conditional branches as TF
functions.

With this change, users can use new_cond and take its gradient.

The idea is for new_cond to eventually replace tf.cond. There are
several functional and performance gaps that must be addressed first,
including:
* Gradients won't work on imported graphs
* Misc. limitations of TF functions (lack of collections, device scopes, etc.)
PiperOrigin-RevId: 199346735
---
 tensorflow/contrib/BUILD                      |   5 +-
 tensorflow/contrib/__init__.py                |   1 +
 tensorflow/contrib/cmake/python_modules.txt   |   2 +
 tensorflow/contrib/control_flow/BUILD         |  48 +++
 tensorflow/contrib/control_flow/__init__.py   |  31 ++
 .../contrib/control_flow/python/cond_v2.py    | 394 ++++++++++++++++++
 .../control_flow/python/cond_v2_test.py       | 113 +++++
 .../api_def/base_api/api_def_FakeParam.pbtxt  |  24 ++
 .../python_api/api_def_FakeParam.pbtxt        |   4 +
 tensorflow/core/kernels/functional_ops.cc     |  19 +
 tensorflow/core/ops/functional_ops.cc         |  17 +
 tensorflow/python/BUILD                       |   5 +-
 12 files changed, 660 insertions(+), 3 deletions(-)
 create mode 100644 tensorflow/contrib/control_flow/BUILD
 create mode 100644 tensorflow/contrib/control_flow/__init__.py
 create mode 100644 tensorflow/contrib/control_flow/python/cond_v2.py
 create mode 100644 tensorflow/contrib/control_flow/python/cond_v2_test.py
 create mode 100644 tensorflow/core/api_def/base_api/api_def_FakeParam.pbtxt
 create mode 100644 tensorflow/core/api_def/python_api/api_def_FakeParam.pbtxt

diff --git a/tensorflow/contrib/BUILD b/tensorflow/contrib/BUILD
index 0f9c80404a..50b1ae5cc3 100644
--- a/tensorflow/contrib/BUILD
+++ b/tensorflow/contrib/BUILD
@@ -31,13 +31,15 @@ py_library(
         "//tensorflow/contrib/cluster_resolver:cluster_resolver_py",
         "//tensorflow/contrib/coder:coder_py",
         "//tensorflow/contrib/compiler:compiler_py",
+        "//tensorflow/contrib/autograph",
         "//tensorflow/contrib/constrained_optimization",
+        "//tensorflow/contrib/control_flow",
         "//tensorflow/contrib/copy_graph:copy_graph_py",
         "//tensorflow/contrib/crf:crf_py",
         "//tensorflow/contrib/cudnn_rnn:cudnn_rnn_py",
         "//tensorflow/contrib/data",
-        "//tensorflow/contrib/distribute:distribute",
         "//tensorflow/contrib/deprecated:deprecated_py",
+        "//tensorflow/contrib/distribute:distribute",
         "//tensorflow/contrib/distributions:distributions_py",
         "//tensorflow/contrib/eager/python:tfe",
         "//tensorflow/contrib/estimator:estimator_py",
@@ -83,7 +85,6 @@ py_library(
         "//tensorflow/contrib/proto",
         "//tensorflow/contrib/quantization:quantization_py",
         "//tensorflow/contrib/quantize:quantize_graph",
-        "//tensorflow/contrib/autograph",
         "//tensorflow/contrib/receptive_field:receptive_field_py",
         "//tensorflow/contrib/recurrent:recurrent_py",
         "//tensorflow/contrib/reduce_slice_ops:reduce_slice_ops_py",
diff --git a/tensorflow/contrib/__init__.py b/tensorflow/contrib/__init__.py
index 9aad772f0a..ad8c40395c 100644
--- a/tensorflow/contrib/__init__.py
+++ b/tensorflow/contrib/__init__.py
@@ -30,6 +30,7 @@ from tensorflow.contrib import cluster_resolver
 from tensorflow.contrib import coder
 from tensorflow.contrib import compiler
 from tensorflow.contrib import constrained_optimization
+from tensorflow.contrib import control_flow
 from tensorflow.contrib import copy_graph
 from tensorflow.contrib import crf
 from tensorflow.contrib import cudnn_rnn
diff --git a/tensorflow/contrib/cmake/python_modules.txt b/tensorflow/contrib/cmake/python_modules.txt
index fece56c412..015cb73bbd 100644
--- a/tensorflow/contrib/cmake/python_modules.txt
+++ b/tensorflow/contrib/cmake/python_modules.txt
@@ -115,6 +115,8 @@ tensorflow/contrib/coder/python/ops
 tensorflow/contrib/compiler
 tensorflow/contrib/constrained_optimization
 tensorflow/contrib/constrained_optimization/python
+tensorflow/contrib/control_flow
+tensorflow/contrib/control_flow/python
 tensorflow/contrib/copy_graph
 tensorflow/contrib/copy_graph/python
 tensorflow/contrib/copy_graph/python/util
diff --git a/tensorflow/contrib/control_flow/BUILD b/tensorflow/contrib/control_flow/BUILD
new file mode 100644
index 0000000000..746b5b5b5e
--- /dev/null
+++ b/tensorflow/contrib/control_flow/BUILD
@@ -0,0 +1,48 @@
+# New implementations of control flow ops
+
+licenses(["notice"])  # Apache 2.0
+
+package(default_visibility = ["//visibility:public"])
+
+load("//tensorflow:tensorflow.bzl", "tf_py_test")
+
+py_library(
+    name = "control_flow",
+    srcs = ["__init__.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":cond_v2",
+    ],
+)
+
+py_library(
+    name = "cond_v2",
+    srcs = ["python/cond_v2.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:c_api_util",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:function",
+        "//tensorflow/python:functional_ops_gen",
+        "//tensorflow/python:gradients",
+        "//tensorflow/python:pywrap_tensorflow",
+    ],
+)
+
+tf_py_test(
+    name = "cond_v2_test",
+    size = "small",
+    srcs = ["python/cond_v2_test.py"],
+    additional_deps = [
+        ":cond_v2",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:gradients",
+    ],
+    grpc_enabled = True,
+)
diff --git a/tensorflow/contrib/control_flow/__init__.py b/tensorflow/contrib/control_flow/__init__.py
new file mode 100644
index 0000000000..582af2cf10
--- /dev/null
+++ b/tensorflow/contrib/control_flow/__init__.py
@@ -0,0 +1,31 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""New implementations of TF control flow ops.
+
+@@cond_v2
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# pylint: disable=unused-import
+from tensorflow.contrib.control_flow.python.cond_v2 import cond_v2
+# pylint: enable=unused-import
+
+from tensorflow.python.util.all_util import remove_undocumented
+
+remove_undocumented(__name__)
diff --git a/tensorflow/contrib/control_flow/python/cond_v2.py b/tensorflow/contrib/control_flow/python/cond_v2.py
new file mode 100644
index 0000000000..90c678d0f6
--- /dev/null
+++ b/tensorflow/contrib/control_flow/python/cond_v2.py
@@ -0,0 +1,394 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+"""cond_v2 and gradient.
+
+This is a version of cond that emits a single If op, as well as the gradient
+function for If ops produced by cond_v2. This will eventually replace the
+current tf.cond implementation once it reaches feature and performance parity.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python import pywrap_tensorflow as c_api
+from tensorflow.python.framework import c_api_util
+from tensorflow.python.framework import function
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_functional_ops
+from tensorflow.python.ops import gradients_impl
+
+
+# NOTE(skyewm): TensorFlow uses protected class methods and fields to signify
+# that they aren't part of the official public API. These protected members
+# often need to be used by implementation code however. Rather than litter the
+# code with pylint comments, we ignore protected access violations for
+# readability.
+# pylint: disable=protected-access
+
+
+def cond_v2(pred, true_fn, false_fn, name="cond"):
+  """Like tf.cond, except emits a single If op."""
+  with ops.name_scope(name) as scope:
+    true_graph = function.func_graph_from_py_func(true_fn, [], [],
+                                                  name="%s_true" % scope)
+    false_graph = function.func_graph_from_py_func(false_fn, [], [],
+                                                   name="%s_false" % scope)
+    _check_same_outputs(true_graph, false_graph)
+
+    # Add inputs to true_graph and false_graph to make them match. Note that
+    # this modifies true_graph and false_graph.
+    cond_inputs = _make_inputs_match(true_graph, false_graph,
+                                     true_graph.extra_inputs,
+                                     false_graph.extra_inputs)
+
+    # Add all intermediate tensors as function outputs so they're available for
+    # the gradient computation.
+
+    true_intermediates = _get_intermediates(true_graph)
+    false_intermediates = _get_intermediates(false_graph)
+
+    # Save the original number of outputs to return to the caller.
+    num_cond_outputs = len(true_graph.outputs)
+
+    # Make the number/type of new intermediate outputs match.
+    extra_true_outputs, extra_false_outputs = _pad_params(
+        true_graph, false_graph, true_intermediates, false_intermediates)
+
+    true_graph.outputs.extend(extra_true_outputs)
+    false_graph.outputs.extend(extra_false_outputs)
+
+    # Create the If op.
+    tensors = gen_functional_ops._if(
+        pred, cond_inputs, [t.dtype for t in true_graph.outputs],
+        _create_new_tf_function(true_graph),
+        _create_new_tf_function(false_graph),
+        name=scope)
+
+    # TODO(b/79883549): if we could make Graphs from FunctionDefs, we wouldn't
+    # need this extra state. Requiring extra state also prevents the ability to
+    # take the gradient of deserialized If ops.
+    tensors[0].op._true_graph = true_graph
+    tensors[0].op._false_graph = false_graph
+
+    return tensors[:num_cond_outputs]
+
+
+@ops.RegisterGradient("If")
+def _IfGrad(op, *grads):  # pylint: disable=invalid-name
+  """The gradient of an If op produced by cond_v2."""
+  true_graph = op._true_graph
+  false_graph = op._false_graph
+
+  # Create grad functions that compute the gradient of the true/false forward
+  # graphs. These functions will capture tensors from the forward pass
+  # functions.
+  true_grad_graph = _create_grad_func(
+      true_graph, grads, "%sgrad" % true_graph.name)
+  false_grad_graph = _create_grad_func(
+      false_graph, grads, "%sgrad" % false_graph.name)
+
+  assert ([t.dtype for t in true_grad_graph.outputs] ==
+          [t.dtype for t in false_grad_graph.outputs])
+
+  # Match up the captured grad function inputs with outputs of 'op' and other
+  # external tensors.
+  true_grad_inputs = _get_grad_inputs(op, true_graph, true_grad_graph)
+  false_grad_inputs = _get_grad_inputs(op, false_graph, false_grad_graph)
+
+  # Make the inputs to true_grad_graph and false_grad_graph match. Note that
+  # this modifies true_grad_graph and false_grad_graph.
+  grad_inputs = _make_inputs_match(true_grad_graph, false_grad_graph,
+                                   true_grad_inputs, false_grad_inputs)
+
+  # Add all intermediate tensors as function outputs so they're available for
+  # higher-order gradient computations.
+
+  true_grad_intermediates = _get_intermediates(true_grad_graph)
+  false_grad_intermediates = _get_intermediates(false_grad_graph)
+
+  # Save the original number of gradient outputs to return.
+  num_grad_outputs = len(true_grad_graph.outputs)
+
+  # Make the number/type of new intermediate outputs match.
+  extra_true_grad_outputs, extra_false_grad_outputs = _pad_params(
+      true_grad_graph, false_grad_graph,
+      true_grad_intermediates, false_grad_intermediates)
+
+  true_grad_graph.outputs.extend(extra_true_grad_outputs)
+  false_grad_graph.outputs.extend(extra_false_grad_outputs)
+
+  # Create the gradient If op.
+  tensors = gen_functional_ops._if(
+      op.inputs[0], grad_inputs, [t.dtype for t in true_grad_graph.outputs],
+      _create_new_tf_function(true_grad_graph),
+      _create_new_tf_function(false_grad_graph))
+  tensors[0].op._true_graph = true_grad_graph
+  tensors[0].op._false_graph = false_grad_graph
+
+  # The predicate has no gradient.
+  return [None] + tensors[:num_grad_outputs]
+
+
+def _grad_fn(func_graph, grads):
+  """The gradient function for each conditional branch.
+
+  This function builds the gradient graph of the corresponding forward-pass
+  conditional branch in `func_graph`. This is done by differentiating
+  func_graph's outputs w.r.t. its inputs.
+
+  Args:
+    func_graph: function._FuncGraph. The corresponding forward-pass function.
+    grads: The list of input gradient Tensors.
+
+  Returns:
+    The output gradient Tensors.
+  """
+  # Filter out untrainable function outputs.
+  # NOTE(skyewm): If we don't do this, the untrainable tensors can sometimes
+  # cause _GradientsHelper to raise an exception (e.g. the implementation
+  # doesn't expect 'ys' to contain boolean tensors).
+  assert len(func_graph.outputs) == len(grads)
+  ys = []
+  grad_ys = []
+  for y, grad_y in zip(func_graph.outputs, grads):
+    if not gradients_impl._IsTrainable(y):
+      continue
+    ys.append(y)
+    grad_ys.append(grad_y)
+
+  # Build the gradient graph. Note that this builds the gradient computation of
+  # func_graph in the current graph, which requires capturing tensors from
+  # func_graph. The captured func_graph tensors are resolved to external tensors
+  # in _get_grad_inputs.
+  result = gradients_impl._GradientsHelper(
+      ys, func_graph.inputs, grad_ys=grad_ys,
+      src_graph=func_graph)
+
+  # Functions can't return None; replace Nones with zero tensors.
+  # TODO(b/80444525): don't return anything here and make _IfGrad return None if
+  # both branches have zero gradient.
+  for i in range(len(result)):
+    if result[i] is None:
+      result[i] = array_ops.zeros_like(func_graph.inputs[i])
+
+  return result
+
+
+def _create_grad_func(func_graph, grads, name):
+  """Returns the _FuncGraph representation of _grad_fn."""
+  return function.func_graph_from_py_func(lambda: _grad_fn(func_graph, grads),
+                                          [], [], name)
+
+
+def _get_grad_inputs(if_op, cond_graph, grad_graph):
+  """Returns the tensors we should pass to grad_graph.
+
+  This method handles tensors captured from cond_graph in grad_graph. It
+  converts these to suitable input tensors from the outer graph.
+
+  Args:
+    if_op: Operation. The forward-pass If op that uses cond_graph.
+    cond_graph: function._FuncGraph. The forward-pass function.
+    grad_graph: function._FuncGraph. The gradients function.
+
+  Returns:
+    A list of inputs tensors to be passed to grad_graph.
+  """
+  inputs = []
+
+  # Maps placeholders in cond_graph -> input tensor in outer graph.
+  forward_input_map = {v: k for k, v in cond_graph._captured.items()}
+
+  for t in grad_graph.extra_inputs:
+    if t.graph == ops.get_default_graph():
+      # t is in the outer graph (e.g. one of the input gradients).
+      inputs.append(t)
+    elif t in forward_input_map:
+      # t is an input placeholder in cond_graph. Get the corresponding input
+      # tensor in the outer graph.
+      assert t.graph == cond_graph
+      assert forward_input_map[t].graph == ops.get_default_graph()
+      inputs.append(forward_input_map[t])
+    else:
+      # t is an intermediate value in cond_graph. Get the corresponding output
+      # of 'if_op' (note that all intermediate values are outputs).
+      assert t.graph == cond_graph
+      output_idx = cond_graph.outputs.index(t)
+      inputs.append(if_op.outputs[output_idx])
+
+  return inputs
+
+
+def _create_new_tf_function(func_graph):
+  """Converts func_graph to a TF_Function and adds it to the current graph.
+
+  Args:
+    func_graph: function._FuncGraph
+
+  Returns:
+    The name of the new TF_Function.
+  """
+  func_graph.name = "%s_" % func_graph.name
+  c_func = c_api.TF_GraphToFunction_wrapper(
+      func_graph._c_graph,
+      func_graph.name,
+      False,  # append_hash_to_fn_name
+      None,  # opers
+      [t._as_tf_output() for t in func_graph.inputs],
+      [t._as_tf_output() for t in func_graph.outputs],
+      [],
+      None,  # opts
+      None)  # description
+  c_func = c_api_util.ScopedTFFunction(c_func)
+  c_api.TF_GraphCopyFunction(
+      ops.get_default_graph()._c_graph, c_func.func, None)
+  return func_graph.name
+
+
+def _get_intermediates(func_graph):
+  """Returns all tensors in `func_graph` that aren't inputs or outputs."""
+  intermediates = []
+  for op in func_graph.get_operations():
+    for t in op.outputs:
+      if t in func_graph.inputs: continue
+      if t in func_graph.outputs: continue
+      intermediates.append(t)
+  return intermediates
+
+
+def _separate_unique_inputs(true_inputs, false_inputs):
+  """Separates tensors appearing only in true_inputs or false_inputs, or both.
+
+  Args:
+    true_inputs: list of Tensors
+    false_inputs: list of Tensors
+
+  Returns:
+    Three lists of Tensors:
+      1. The tensors that appear in both true_inputs and false_inputs
+      2. The tensors that only appear in true_inputs
+      3. The tensors that only appear in false_inputs
+  """
+  true_inputs = set(true_inputs)
+  false_inputs = set(false_inputs)
+
+  shared_inputs = true_inputs.intersection(false_inputs)
+  true_only_inputs = true_inputs - false_inputs
+  false_only_inputs = false_inputs - true_inputs
+
+  return list(shared_inputs), list(true_only_inputs), list(false_only_inputs)
+
+
+def _pad_params(true_graph, false_graph, true_params, false_params):
+  """Returns new param lists that have matching signatures.
+
+  This is done by mirroring each param list in the other using dummy params.
+  There is no merging of params.
+
+  Args:
+    true_graph: function._FuncGraph
+    false_graph: function._FuncGraph
+    true_params: a list of Tensors from true_graph
+    false_params: a list of Tensors from false_graph
+
+  Returns:
+    A new list of Tensors in true_graph and a new list of Tensors in
+    false_graph. The two lists have the same number of Tensors, with matching
+    types and shapes across the lists.
+  """
+  new_true_params = (true_params +
+                     _create_dummy_params(true_graph, false_params))
+  new_false_inputs = (_create_dummy_params(false_graph, true_params)
+                      + false_params)
+  return new_true_params, new_false_inputs
+
+
+def _make_inputs_match(true_graph, false_graph, true_inputs, false_inputs):
+  """Modifies true_graph and false_graph so they have the same input signature.
+
+  This method reorders and/or adds parameters to true_graph and false_graph so
+  they have the same input signature, and updates the 'inputs', 'extra_inputs',
+  and '_captured' fields of both graphs accordingly. It uses the input tensors
+  from the outer graph to avoid duplicating shared arguments.
+
+  Args:
+    true_graph: function._FuncGraph
+    false_graph: function._FuncGraph
+    true_inputs: a list of Tensors in the outer graph. The inputs for
+      true_graph.
+    false_inputs: a list of Tensors in the outer graph. The inputs for
+      false_graph.
+
+  Returns:
+    A new list of Tensors from the outer graph that are the new inputs for both
+    true_graph and false_graph. This is a deduped version of true_inputs +
+    false_inputs.
+  """
+  shared_inputs, true_only_inputs, false_only_inputs = _separate_unique_inputs(
+      true_inputs, false_inputs)
+
+  new_inputs = shared_inputs + true_only_inputs + false_only_inputs
+
+  true_input_to_param = dict(zip(true_inputs, true_graph.inputs))
+  false_input_to_param = dict(zip(false_inputs, false_graph.inputs))
+
+  true_graph.inputs = (
+      [true_input_to_param[t] for t in shared_inputs] +
+      [true_input_to_param[t] for t in true_only_inputs] +
+      _create_dummy_params(true_graph, false_only_inputs))
+
+  false_graph.inputs = (
+      [false_input_to_param[t] for t in shared_inputs] +
+      _create_dummy_params(false_graph, true_only_inputs) +
+      [false_input_to_param[t] for t in false_only_inputs])
+
+  # Rewrite the _FuncGraphs' state to reflect the new inputs.
+  true_graph.extra_inputs = new_inputs
+  false_graph.extra_inputs = new_inputs
+
+  true_graph._captured = dict(zip(new_inputs, true_graph.inputs))
+  false_graph._captured = dict(zip(new_inputs, false_graph.inputs))
+
+  return new_inputs
+
+
+def _create_dummy_params(func_graph, template_tensors):
+  """Creates tensors in func_graph to represent template_tensors.
+
+  Args:
+    func_graph: function._FuncGraph.
+    template_tensors: a list of tensors in the outer graph.
+
+  Returns:
+    A list of tensors in func_graph.
+  """
+  with func_graph.as_default():
+    return [gen_functional_ops.fake_param(dtype=t.dtype, shape=t.shape)
+            for t in template_tensors]
+
+
+def _check_same_outputs(true_graph, false_graph):
+  """Raises an error if true_graph and false_graph have different outputs."""
+  true_output_types = [t.dtype for t in true_graph.outputs]
+  false_output_types = [t.dtype for t in false_graph.outputs]
+  if (len(true_graph.outputs) != len(false_graph.outputs) or
+      true_output_types != false_output_types):
+    raise ValueError(
+        "true_fn() and false_fn() must return the same number and type of "
+        "arguments, got:\n"
+        "  true_fn: %s\n"
+        "  false_fn: %s" % (true_output_types, false_output_types))
diff --git a/tensorflow/contrib/control_flow/python/cond_v2_test.py b/tensorflow/contrib/control_flow/python/cond_v2_test.py
new file mode 100644
index 0000000000..c94f3a6584
--- /dev/null
+++ b/tensorflow/contrib/control_flow/python/cond_v2_test.py
@@ -0,0 +1,113 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Tests for cond_v2."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.control_flow.python import cond_v2
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+
+class NewCondTest(test.TestCase):
+
+  def _testCond(self, true_fn, false_fn, train_vals):
+    pred = array_ops.placeholder(dtypes.bool, name="pred")
+
+    expected = control_flow_ops.cond(pred, true_fn, false_fn, name="expected")
+    actual = cond_v2.cond_v2(pred, true_fn, false_fn, name="actual")
+
+    expected_grad = gradients_impl.gradients(expected, train_vals)
+    actual_grad = gradients_impl.gradients(actual, train_vals)
+
+    with self.test_session() as sess:
+      expected_val, actual_val, expected_grad_val, actual_grad_val = sess.run(
+          (expected, actual, expected_grad, actual_grad), {pred: True})
+      self.assertEqual(expected_val, actual_val)
+      self.assertEqual(expected_grad_val, actual_grad_val)
+
+      expected_val, actual_val, expected_grad_val, actual_grad_val = sess.run(
+          (expected, actual, expected_grad, actual_grad), {pred: False})
+      self.assertEqual(expected_val, actual_val)
+      self.assertEqual(expected_grad_val, actual_grad_val)
+
+  def testBasic(self):
+    x = constant_op.constant(1.0, name="x")
+    y = constant_op.constant(2.0, name="y")
+
+    def true_fn():
+      return x * 2.0
+
+    def false_fn():
+      return y * 3.0
+
+    self._testCond(true_fn, false_fn, [x])
+    self._testCond(true_fn, false_fn, [x, y])
+    self._testCond(true_fn, false_fn, [y])
+
+  def testBasic2(self):
+    x = constant_op.constant(1.0, name="x")
+    y = constant_op.constant(2.0, name="y")
+
+    def true_fn():
+      return x * y * 2.0
+
+    def false_fn():
+      return 2.0
+
+    self._testCond(true_fn, false_fn, [x])
+    self._testCond(true_fn, false_fn, [x, y])
+    self._testCond(true_fn, false_fn, [y])
+
+  def testSecondDerivative(self):
+    pred = array_ops.placeholder(dtypes.bool, name="pred")
+    x = constant_op.constant(3.0, name="x")
+
+    def true_fn():
+      return math_ops.pow(x, 3)
+
+    def false_fn():
+      return x
+
+    cond = cond_v2.cond_v2(pred, true_fn, false_fn, name="cond")
+    cond_grad = gradients_impl.gradients(cond, [x])
+    cond_grad_grad = gradients_impl.gradients(cond_grad, [x])
+
+    with self.test_session() as sess:
+      # d[x^3]/dx = 3x^2
+      true_val = sess.run(cond_grad, {pred: True})
+      self.assertEqual(true_val, [27.0])
+      # d[x]/dx = 1
+      false_val = sess.run(cond_grad, {pred: False})
+      self.assertEqual(false_val, [1.0])
+
+      true_val = sess.run(cond_grad_grad, {pred: True})
+      # d2[x^3]/dx2 = 6x
+      self.assertEqual(true_val, [18.0])
+      false_val = sess.run(cond_grad_grad, {pred: False})
+      # d2[x]/dx2 = 0
+      self.assertEqual(false_val, [0.0])
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/core/api_def/base_api/api_def_FakeParam.pbtxt b/tensorflow/core/api_def/base_api/api_def_FakeParam.pbtxt
new file mode 100644
index 0000000000..d110aba42b
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_FakeParam.pbtxt
@@ -0,0 +1,24 @@
+op {
+  graph_op_name: "FakeParam"
+  visibility: SKIP
+  out_arg {
+    name: "output"
+    description: <<END
+    \"Fake\" output value. This should not be consumed by another op.
+END
+  }
+  attr { name: "dtype"  description: "The type of the output." }
+  attr {
+    name: "shape"
+    description: <<END
+    The purported shape of the output. This is only used for shape inference;
+    the output will not necessarily have this shape. Can be a partial shape.
+END
+  }
+  summary: <<END
+  This op is used as a placeholder in If branch functions. It doesn't provide a
+  valid output when run, so must either be removed (e.g. replaced with a
+  function input) or guaranteed not to be used (e.g. if mirroring an
+  intermediate output needed for the gradient computation of the other branch).
+END
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_FakeParam.pbtxt b/tensorflow/core/api_def/python_api/api_def_FakeParam.pbtxt
new file mode 100644
index 0000000000..57fa8ff5b9
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_FakeParam.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "FakeParam"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/kernels/functional_ops.cc b/tensorflow/core/kernels/functional_ops.cc
index 9ae04a1062..e0d594fa25 100644
--- a/tensorflow/core/kernels/functional_ops.cc
+++ b/tensorflow/core/kernels/functional_ops.cc
@@ -518,5 +518,24 @@ REGISTER_KERNEL_BUILDER(Name("For")
                             .HostMemory("delta"),
                         ForOp);
 
+class FakeParamOp : public OpKernel {
+ public:
+  explicit FakeParamOp(OpKernelConstruction* context) : OpKernel(context) {
+    OP_REQUIRES_OK(context, context->GetAttr("dtype", &dtype_));
+  }
+
+  void Compute(OpKernelContext* context) override {
+    // We must produce something (only Switch and Recvs are allowed to output
+    // dead tensors). This output is not expected to be consumed by anything.
+    Tensor output_tensor(dtype_, TensorShape({}));
+    context->set_output(0, output_tensor);
+  }
+
+ private:
+  DataType dtype_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("FakeParam").Device(DEVICE_CPU), FakeParamOp);
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/ops/functional_ops.cc b/tensorflow/core/ops/functional_ops.cc
index 4d4a370478..a6cc4b60e5 100644
--- a/tensorflow/core/ops/functional_ops.cc
+++ b/tensorflow/core/ops/functional_ops.cc
@@ -154,4 +154,21 @@ REGISTER_OP("PartitionedCall")
     .Attr("f: func")
     .SetShapeFn(shape_inference::UnknownShape);
 
+// This op is used as a placeholder in If branch functions. It doesn't provide a
+// valid output when run, so must either be removed (e.g. replaced with a
+// function input) or guaranteed not to be used (e.g. if mirroring an
+// intermediate output needed for the gradient computation of the other branch).
+REGISTER_OP("FakeParam")
+    .Output("output: dtype")
+    .Attr("dtype: type")
+    .Attr("shape: shape")
+    .SetShapeFn([](InferenceContext* c) {
+      PartialTensorShape shape;
+      TF_RETURN_IF_ERROR(c->GetAttr("shape", &shape));
+      shape_inference::ShapeHandle out;
+      TF_RETURN_IF_ERROR(c->MakeShapeFromPartialTensorShape(shape, &out));
+      c->set_output(0, out);
+      return Status::OK();
+    });
+
 }  // end namespace tensorflow
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index a8a514d166..c2f7794c3b 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -1050,7 +1050,10 @@ py_test(
 
 tf_gen_op_wrapper_private_py(
     name = "functional_ops_gen",
-    visibility = ["//learning/brain/python/ops:__pkg__"],
+    visibility = [
+        "//learning/brain/python/ops:__pkg__",
+        "//tensorflow/contrib/control_flow:__pkg__",
+    ],
 )
 
 py_library(
-- 
GitLab


From 1e92632c5d22c7815943343c8e634805f3152707 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 5 Jun 2018 13:20:56 -0700
Subject: [PATCH 327/610] Update ops-related pbtxt files.

PiperOrigin-RevId: 199347316
---
 tensorflow/core/ops/compat/ops_history.v1.pbtxt | 15 +++++++++++++++
 tensorflow/core/ops/ops.pbtxt                   | 15 +++++++++++++++
 2 files changed, 30 insertions(+)

diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index 61cc3f7c2e..16e9b2e02e 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -21496,6 +21496,21 @@ op {
     type: DT_STRING
   }
 }
+op {
+  name: "FakeParam"
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+}
 op {
   name: "FakeQuantWithMinMaxArgs"
   input_arg {
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index e73e034340..7df43663c9 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -10003,6 +10003,21 @@ op {
     type: DT_STRING
   }
 }
+op {
+  name: "FakeParam"
+  output_arg {
+    name: "output"
+    type_attr: "dtype"
+  }
+  attr {
+    name: "dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape"
+    type: "shape"
+  }
+}
 op {
   name: "FakeQuantWithMinMaxArgs"
   input_arg {
-- 
GitLab


From 70a96b53aa5328b3616e7e4fc33cb9f714522e8e Mon Sep 17 00:00:00 2001
From: Dan Moldovan <mdan@google.com>
Date: Tue, 5 Jun 2018 13:35:59 -0700
Subject: [PATCH 328/610] Allow calling getanno with a default value. Failure
 is still the default behavior.

PiperOrigin-RevId: 199349592
---
 tensorflow/contrib/autograph/pyct/anno.py     | 19 +++++++++++++++----
 .../contrib/autograph/pyct/anno_test.py       |  1 +
 2 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/tensorflow/contrib/autograph/pyct/anno.py b/tensorflow/contrib/autograph/pyct/anno.py
index cc4a7edf02..81d5b93da1 100644
--- a/tensorflow/contrib/autograph/pyct/anno.py
+++ b/tensorflow/contrib/autograph/pyct/anno.py
@@ -46,8 +46,15 @@ class Basic(NoValue):
       '`name_map` allows renaming symbols.')
 
 
-def getanno(node, key, field_name='___pyct_anno'):
-  return getattr(node, field_name)[key]
+FAIL = object()
+
+
+def getanno(node, key, default=FAIL, field_name='___pyct_anno'):
+  # Raise on a missing annotation only when no default was supplied.
+  if default is FAIL or key in getattr(node, field_name, {}):
+    return getattr(node, field_name)[key]
+  else:
+    return default
 
 
 def hasanno(node, key, field_name='___pyct_anno'):
@@ -73,5 +80,9 @@ def delanno(node, key, field_name='___pyct_anno'):
 
 
 def copyanno(from_node, to_node, key, field_name='___pyct_anno'):
-  if hasanno(from_node, key, field_name):
-    setanno(to_node, key, getanno(from_node, key, field_name), field_name)
+  if hasanno(from_node, key, field_name=field_name):
+    setanno(
+        to_node,
+        key,
+        getanno(from_node, key, field_name=field_name),
+        field_name=field_name)
diff --git a/tensorflow/contrib/autograph/pyct/anno_test.py b/tensorflow/contrib/autograph/pyct/anno_test.py
index 1d4d9d119e..d4caa3dd11 100644
--- a/tensorflow/contrib/autograph/pyct/anno_test.py
+++ b/tensorflow/contrib/autograph/pyct/anno_test.py
@@ -44,6 +44,7 @@ class AnnoTest(test.TestCase):
     self.assertFalse(anno.hasanno(node, 'foo'))
     with self.assertRaises(AttributeError):
       anno.getanno(node, 'foo')
+    self.assertIsNone(anno.getanno(node, 'foo', default=None))
 
   def test_copyanno(self):
     node_1 = ast.Name()
-- 
GitLab


From 92ceec1c2729d162e891ac91c28e4b1222e65ebe Mon Sep 17 00:00:00 2001
From: Bixia Zheng <bixia@google.com>
Date: Tue, 5 Jun 2018 13:43:20 -0700
Subject: [PATCH 329/610] Fix test MultiOutputReduceFusionScalar to use an
 identity value as reduction init_value.

PiperOrigin-RevId: 199350818
---
 tensorflow/compiler/xla/tests/multioutput_fusion_test.cc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tensorflow/compiler/xla/tests/multioutput_fusion_test.cc b/tensorflow/compiler/xla/tests/multioutput_fusion_test.cc
index 3cbb2452fb..7bfc8eb546 100644
--- a/tensorflow/compiler/xla/tests/multioutput_fusion_test.cc
+++ b/tensorflow/compiler/xla/tests/multioutput_fusion_test.cc
@@ -357,9 +357,9 @@ XLA_TEST_F(MultiOutputFusionTest,
       c0 = f32[] constant(0)
       r1 = f32[2]{0} reduce(p0, c0), dimensions={0,2}, to_apply=Add
       mul = f32[2,2,2]{2,1,0} multiply(p0, p0)
-      c1 = f32[] constant(5)
+      c1 = f32[] constant(1.17549e-38)
       r2 = f32[2]{0} reduce(mul, c1), dimensions={0,2}, to_apply=Max
-      r3 = f32[2]{0} reduce(mul, c1), dimensions={0,2}, to_apply=Add
+      r3 = f32[2]{0} reduce(mul, c0), dimensions={0,2}, to_apply=Add
       ROOT tuple = (f32[2]{0}, f32[2]{0}, f32[2]{0}) tuple(r1, r2, r3)
     }
 
@@ -377,7 +377,7 @@ XLA_TEST_F(MultiOutputFusionTest,
   EXPECT_TRUE(LiteralTestUtil::Equal(
       *result, *Literal::MakeTupleOwned(Literal::CreateR1<float>({14, 22}),
                                         Literal::CreateR1<float>({36, 64}),
-                                        Literal::CreateR1<float>({391, 463}))));
+                                        Literal::CreateR1<float>({66, 138}))));
 }
 
 }  // namespace
-- 
GitLab


From c03d2c43b988a3cd8161b203cd41cc7f234daa31 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 5 Jun 2018 13:48:40 -0700
Subject: [PATCH 330/610] Go: Update generated wrapper functions for TensorFlow
 ops. PiperOrigin-RevId: 199351707

---
 tensorflow/go/op/wrappers.go | 120 +++++++++++++++++------------------
 1 file changed, 60 insertions(+), 60 deletions(-)

diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go
index e4f22692d8..550ef8944d 100644
--- a/tensorflow/go/op/wrappers.go
+++ b/tensorflow/go/op/wrappers.go
@@ -21947,46 +21947,6 @@ func MatrixExponential(scope *Scope, input tf.Output) (output tf.Output) {
 	return op.Output(0)
 }
 
-// Computes the matrix logarithm of one or more square matrices:
-//
-//
-// log(exp(A)) = A
-//
-// This op is only defined for complex matrices. If A is positive-definite and
-// real, then casting to a complex matrix, taking the logarithm and casting back
-// to a real matrix will give the correct result.
-//
-// This function computes the matrix logarithm using the Schur-Parlett algorithm.
-// Details of the algorithm can be found in Section 11.6.2 of:
-// Nicholas J. Higham, Functions of Matrices: Theory and Computation, SIAM 2008.
-// ISBN 978-0-898716-46-7.
-//
-// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
-// form square matrices. The output is a tensor of the same shape as the input
-// containing the exponential for all input submatrices `[..., :, :]`.
-//
-// Arguments:
-//	input: Shape is `[..., M, M]`.
-//
-// Returns Shape is `[..., M, M]`.
-//
-// @compatibility(scipy)
-// Equivalent to scipy.linalg.logm
-// @end_compatibility
-func MatrixLogarithm(scope *Scope, input tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "MatrixLogarithm",
-		Input: []tf.Input{
-			input,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // QueueDequeueUpToV2Attr is an optional argument to QueueDequeueUpToV2.
 type QueueDequeueUpToV2Attr func(optionalAttr)
 
@@ -24398,6 +24358,46 @@ func NonMaxSuppressionV2(scope *Scope, boxes tf.Output, scores tf.Output, max_ou
 	return op.Output(0)
 }
 
+// Computes the matrix logarithm of one or more square matrices:
+//
+//
+// log(exp(A)) = A
+//
+// This op is only defined for complex matrices. If A is positive-definite and
+// real, then casting to a complex matrix, taking the logarithm and casting back
+// to a real matrix will give the correct result.
+//
+// This function computes the matrix logarithm using the Schur-Parlett algorithm.
+// Details of the algorithm can be found in Section 11.6.2 of:
+// Nicholas J. Higham, Functions of Matrices: Theory and Computation, SIAM 2008.
+// ISBN 978-0-898716-46-7.
+//
+// The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+// form square matrices. The output is a tensor of the same shape as the input
+// containing the exponential for all input submatrices `[..., :, :]`.
+//
+// Arguments:
+//	input: Shape is `[..., M, M]`.
+//
+// Returns Shape is `[..., M, M]`.
+//
+// @compatibility(scipy)
+// Equivalent to scipy.linalg.logm
+// @end_compatibility
+func MatrixLogarithm(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "MatrixLogarithm",
+		Input: []tf.Input{
+			input,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // EncodeProtoAttr is an optional argument to EncodeProto.
 type EncodeProtoAttr func(optionalAttr)
 
@@ -29425,6 +29425,26 @@ func Snapshot(scope *Scope, input tf.Output) (output tf.Output) {
 	return op.Output(0)
 }
 
+// Returns a tensor of zeros with the same shape and type as x.
+//
+// Arguments:
+//	x: a tensor of type T.
+//
+// Returns a tensor of the same shape and type as x but filled with zeros.
+func ZerosLike(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ZerosLike",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // AbortAttr is an optional argument to Abort.
 type AbortAttr func(optionalAttr)
 
@@ -30690,23 +30710,3 @@ func GuaranteeConst(scope *Scope, input tf.Output) (output tf.Output) {
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
-
-// Returns a tensor of zeros with the same shape and type as x.
-//
-// Arguments:
-//	x: a tensor of type T.
-//
-// Returns a tensor of the same shape and type as x but filled with zeros.
-func ZerosLike(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ZerosLike",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-- 
GitLab


From 12b20a53542a2037346432e8573e02a828ab9bc2 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 5 Jun 2018 14:03:16 -0700
Subject: [PATCH 331/610] No longer assume that the default job is "localhost"
 in graph mode DistributionStrategy, since it depends on the session. Drop
 "job:localhost" when canonicalizing in graph mode.

PiperOrigin-RevId: 199354215
---
 tensorflow/python/training/device_util.py      | 10 +++++++---
 tensorflow/python/training/device_util_test.py |  2 +-
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/tensorflow/python/training/device_util.py b/tensorflow/python/training/device_util.py
index e31fa02d60..70e1ca4b5d 100644
--- a/tensorflow/python/training/device_util.py
+++ b/tensorflow/python/training/device_util.py
@@ -27,13 +27,15 @@ def canonicalize(d, default=None):
   """Canonicalize device string.
 
   If d has missing components, the rest would be deduced from the `default`
-  argument or from '/job:localhost/replica:0/task:0/device:CPU:0'. For example:
+  argument or from '/replica:0/task:0/device:CPU:0'. For example:
     If d = '/cpu:0', default='/job:worker/task:1', it returns
       '/job:worker/replica:0/task:1/device:CPU:0'.
     If d = '/cpu:0', default='/job:worker', it returns
       '/job:worker/replica:0/task:0/device:CPU:0'.
     If d = '/gpu:0', default=None, it returns
-      '/job:localhost/replica:0/task:0/device:GPU:0'.
+      '/replica:0/task:0/device:GPU:0'.
+
+  Note: This uses "job:localhost" as the default if executing eagerly.
 
   Args:
     d: a device string.
@@ -47,7 +49,9 @@ def canonicalize(d, default=None):
       "Device type '%s' must be all-caps." % (d.device_type,))
   # Fill in missing device fields using defaults.
   result = tf_device.DeviceSpec(
-      job="localhost", replica=0, task=0, device_type="CPU", device_index=0)
+      replica=0, task=0, device_type="CPU", device_index=0)
+  if context.executing_eagerly():
+    result.job = "localhost"
   if default:
     result.merge_from(tf_device.DeviceSpec.from_string(default))
   result.merge_from(d)
diff --git a/tensorflow/python/training/device_util_test.py b/tensorflow/python/training/device_util_test.py
index 61525e21f5..cdbb08229d 100644
--- a/tensorflow/python/training/device_util_test.py
+++ b/tensorflow/python/training/device_util_test.py
@@ -52,7 +52,7 @@ class DeviceUtilTest(test.TestCase):
   def testCanonicalizeWithoutDefaultDevice(self):
     self.assertEqual(
         device_util.canonicalize("/cpu:0"),
-        "/job:localhost/replica:0/task:0/device:CPU:0")
+        "/replica:0/task:0/device:CPU:0")
     self.assertEqual(
         device_util.canonicalize("/job:worker/cpu:0"),
         "/job:worker/replica:0/task:0/device:CPU:0")
-- 
GitLab


From 755513739a531af7a1d0ee4f3faa1a129ba030b7 Mon Sep 17 00:00:00 2001
From: Asim Shankar <ashankar@google.com>
Date: Tue, 5 Jun 2018 14:28:20 -0700
Subject: [PATCH 332/610] contrib/eager/python/datasets: Resource naming
 workaround.

tensorflow/contrib/eager/python/datasets_test.py was failing on GPU
because two tests - testTensorsPlacedOnDevice() and
testTensorsExplicitPrefetchToDevice() - were creating
FunctionBufferResources with the same shared_name, leading to
unintentional interference.

This change will make the tests pass and allow the use of
tf.contrib.eager.Iterator and
tf.data.Dataset.apply(prefetching_ops.prefetch_to_device)
in the same process without interference.

However, a more appropriate fix would probably be to use
anonymous function buffering resources (similar to
AnonymousIteratorHandle) when eager execution is enabled,
doing away with sharing by name.
---
 tensorflow/contrib/eager/python/datasets.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/eager/python/datasets.py b/tensorflow/contrib/eager/python/datasets.py
index d7909dd5a2..adf92c27ea 100644
--- a/tensorflow/contrib/eager/python/datasets.py
+++ b/tensorflow/contrib/eager/python/datasets.py
@@ -106,7 +106,8 @@ class Iterator(iterator_ops.EagerIterator, checkpointable.CheckpointableBase):
             target_device=target,
             buffer_size=10,
             container="",
-            shared_name=_generate_shared_name("function_buffer_resource"))
+            shared_name=_generate_shared_name(
+                "contrib_eager_iterator_function_buffer_resource"))
         self._buffer_resource_deleter = resource_variable_ops.EagerResourceDeleter(  # pylint: disable=line-too-long
             handle=self._buffer_resource_handle,
             handle_device=self._device)
-- 
GitLab


From d935dd9d992e9632bd2e3234fd5151a3f541f4df Mon Sep 17 00:00:00 2001
From: Nupur Garg <nupurgarg@google.com>
Date: Tue, 5 Jun 2018 14:45:45 -0700
Subject: [PATCH 333/610] Update TOCO Python command line flags.

PiperOrigin-RevId: 199361276
---
 tensorflow/contrib/lite/python/lite.py        |  8 +++++++
 .../contrib/lite/python/tflite_convert.py     | 24 ++++++++++++-------
 2 files changed, 24 insertions(+), 8 deletions(-)

diff --git a/tensorflow/contrib/lite/python/lite.py b/tensorflow/contrib/lite/python/lite.py
index 2cb06e2559..0ccd6675db 100644
--- a/tensorflow/contrib/lite/python/lite.py
+++ b/tensorflow/contrib/lite/python/lite.py
@@ -305,6 +305,14 @@ class TocoConverter(object):
         allow_custom_ops=self.allow_custom_ops)
     return result
 
+  def get_input_arrays(self):
+    """Returns a list of the names of the input tensors.
+
+    Returns:
+      List of strings.
+    """
+    return [tensor_name(tensor) for tensor in self._input_tensors]
+
   def _set_batch_size(self, batch_size):
     """Sets the first dimension of the input tensor to `batch_size`.
 
diff --git a/tensorflow/contrib/lite/python/tflite_convert.py b/tensorflow/contrib/lite/python/tflite_convert.py
index 337f05785e..d0879daed2 100644
--- a/tensorflow/contrib/lite/python/tflite_convert.py
+++ b/tensorflow/contrib/lite/python/tflite_convert.py
@@ -86,6 +86,9 @@ def _convert_model(flags):
 
   Args:
     flags: argparse.Namespace object.
+
+  Raises:
+    ValueError: Invalid flags.
   """
   # Create converter.
   converter = _get_toco_converter(flags)
@@ -99,10 +102,19 @@ def _convert_model(flags):
         flags.output_format)
 
   if flags.mean_values and flags.std_dev_values:
-    input_arrays = _parse_array(flags.input_arrays)
+    input_arrays = converter.get_input_arrays()
     std_dev_values = _parse_int_array(flags.std_dev_values)
     mean_values = _parse_int_array(flags.mean_values)
     quant_stats = zip(mean_values, std_dev_values)
+    if ((not flags.input_arrays and len(input_arrays) > 1) or
+        (len(input_arrays) != len(quant_stats))):
+      raise ValueError("Mismatching --input_arrays, --std_dev_values, and "
+                       "--mean_values. The flags must have the same number of "
+                       "items. The current input arrays are '{0}'. "
+                       "--input_arrays must be present when specifying "
+                       "--std_dev_values and --mean_values with multiple input "
+                       "tensors in order to map between names and "
+                       "values".format(",".join(input_arrays)))
     converter.quantized_input_stats = dict(zip(input_arrays, quant_stats))
   if flags.default_ranges_min and flags.default_ranges_max:
     converter.default_ranges_stats = (flags.default_ranges_min,
@@ -168,13 +180,9 @@ def _check_flags(flags, unparsed):
     if bool(flags.std_dev_values) != bool(flags.mean_values):
       raise ValueError("--std_dev_values and --mean_values must be used "
                        "together")
-    if not flags.input_arrays:
-      raise ValueError("--std_dev_values and --mean_values must be used with "
-                       "--input_arrays")
-    if (flags.std_dev_values.count(",") != flags.mean_values.count(",") or
-        flags.std_dev_values.count(",") != flags.input_arrays.count(",")):
-      raise ValueError("--std_dev_values, --mean_values, and --input_arrays "
-                       "must have the same number of items")
+    if flags.std_dev_values.count(",") != flags.mean_values.count(","):
+      raise ValueError("--std_dev_values, --mean_values must have the same "
+                       "number of items")
 
   if bool(flags.default_ranges_min) != bool(flags.default_ranges_max):
     raise ValueError("--default_ranges_min and --default_ranges_max must be "
-- 
GitLab


From f0230735d1225f914d50824208cd7f84492a6dd3 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 5 Jun 2018 14:46:22 -0700
Subject: [PATCH 334/610] [XLA] Redesign: delete SessionModule.

PiperOrigin-RevId: 199361402
---
 tensorflow/compiler/xla/BUILD                 |  1 -
 tensorflow/compiler/xla/client/BUILD          |  1 +
 .../compiler/xla/client/local_client.cc       | 22 ++---
 tensorflow/compiler/xla/client/local_client.h |  6 +-
 tensorflow/compiler/xla/service/BUILD         | 10 ---
 .../compiler/xla/service/channel_tracker.h    |  1 -
 tensorflow/compiler/xla/service/executable.cc | 34 --------
 tensorflow/compiler/xla/service/executable.h  | 16 ----
 tensorflow/compiler/xla/service/service.cc    | 28 ------
 tensorflow/compiler/xla/service/service.h     |  1 -
 tensorflow/compiler/xla/service/session.proto | 85 -------------------
 tensorflow/compiler/xla/tools/BUILD           |  2 +-
 .../compiler/xla/tools/convert_computation.cc |  4 +-
 tensorflow/compiler/xla/xla.proto             |  9 --
 14 files changed, 18 insertions(+), 202 deletions(-)
 delete mode 100644 tensorflow/compiler/xla/service/session.proto

diff --git a/tensorflow/compiler/xla/BUILD b/tensorflow/compiler/xla/BUILD
index c6deb959a5..1b8e516770 100644
--- a/tensorflow/compiler/xla/BUILD
+++ b/tensorflow/compiler/xla/BUILD
@@ -53,7 +53,6 @@ xla_proto_library(
     deps = [
         ":xla_data_proto",
         "//tensorflow/compiler/xla/service:hlo_proto",
-        "//tensorflow/compiler/xla/service:session_proto",
     ],
 )
 
diff --git a/tensorflow/compiler/xla/client/BUILD b/tensorflow/compiler/xla/client/BUILD
index c4f0c4468f..8f08d3b2e0 100644
--- a/tensorflow/compiler/xla/client/BUILD
+++ b/tensorflow/compiler/xla/client/BUILD
@@ -110,6 +110,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:compiler",
         "//tensorflow/compiler/xla/service:device_memory_allocator",
         "//tensorflow/compiler/xla/service:executable",
+        "//tensorflow/compiler/xla/service:hlo_proto",
         "//tensorflow/compiler/xla/service:local_service",
         "//tensorflow/compiler/xla/service:shaped_buffer",
         "//tensorflow/compiler/xla/service:source_map_util",
diff --git a/tensorflow/compiler/xla/client/local_client.cc b/tensorflow/compiler/xla/client/local_client.cc
index f9003373a6..ae0308020d 100644
--- a/tensorflow/compiler/xla/client/local_client.cc
+++ b/tensorflow/compiler/xla/client/local_client.cc
@@ -185,7 +185,7 @@ StatusOr<ScopedShapedBuffer> LocalExecutable::Run(
       run_options, backend_->StreamBorrower(),
       backend_->eigen_intra_op_thread_pool());
 
-  if (executable_->dumping()) {
+  if (executable_->dumping_snapshot()) {
     return ExecuteAndDump(&service_options, arguments);
   }
   return executable_->ExecuteOnStreamWrapper(
@@ -195,36 +195,36 @@ StatusOr<ScopedShapedBuffer> LocalExecutable::Run(
 StatusOr<ScopedShapedBuffer> LocalExecutable::ExecuteAndDump(
     const ServiceExecutableRunOptions* run_options,
     const tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments) {
-  executable_->session_module()->set_execution_platform(
+  executable_->hlo_snapshot()->set_execution_platform(
       backend_->platform()->Name());
-  TF_RETURN_IF_ERROR(RecordArguments(arguments, executable_->session_module()));
+  TF_RETURN_IF_ERROR(RecordArguments(arguments, executable_->hlo_snapshot()));
   TF_ASSIGN_OR_RETURN(
       ScopedShapedBuffer result,
       executable_->ExecuteOnStream(run_options, arguments,
                                    /*hlo_execution_profile=*/nullptr));
-  TF_RETURN_IF_ERROR(RecordResult(&result, executable_->session_module()));
-  TF_RETURN_IF_ERROR(executable_->DumpSessionModule());
+  TF_RETURN_IF_ERROR(RecordResult(&result, executable_->hlo_snapshot()));
+  TF_RETURN_IF_ERROR(executable_->DumpHloSnapshot());
   return std::move(result);
 }
 
 Status LocalExecutable::RecordArguments(
     const tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
-    SessionModule* session_module) {
-  session_module->clear_arguments();
+    HloSnapshot* hlo_snapshot) {
+  hlo_snapshot->clear_arguments();
   for (const ShapedBuffer* argument : arguments) {
     TF_ASSIGN_OR_RETURN(std::unique_ptr<Literal> literal,
                         LiteralFromShapedBuffer(*argument));
-    *session_module->add_arguments() = literal->ToProto();
+    *hlo_snapshot->add_arguments() = literal->ToProto();
   }
   return Status::OK();
 }
 
 Status LocalExecutable::RecordResult(const ShapedBuffer* result,
-                                     SessionModule* session_module) {
-  session_module->clear_result();
+                                     HloSnapshot* hlo_snapshot) {
+  hlo_snapshot->clear_result();
   TF_ASSIGN_OR_RETURN(std::unique_ptr<Literal> literal,
                       LiteralFromShapedBuffer(*result));
-  *session_module->mutable_result() = literal->ToProto();
+  *hlo_snapshot->mutable_result() = literal->ToProto();
   return Status::OK();
 }
 
diff --git a/tensorflow/compiler/xla/client/local_client.h b/tensorflow/compiler/xla/client/local_client.h
index 5b408cc6b2..4d9e0d7cd9 100644
--- a/tensorflow/compiler/xla/client/local_client.h
+++ b/tensorflow/compiler/xla/client/local_client.h
@@ -25,6 +25,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/compiler.h"
 #include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/service/executable.h"
+#include "tensorflow/compiler/xla/service/hlo.pb.h"
 #include "tensorflow/compiler/xla/service/local_service.h"
 #include "tensorflow/compiler/xla/service/shaped_buffer.h"
 #include "tensorflow/compiler/xla/statusor.h"
@@ -78,11 +79,10 @@ class LocalExecutable {
   // proto.
   Status RecordArguments(
       const tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
-      SessionModule* session_module);
+      HloSnapshot* hlo_snapshot);
 
   // Records the result of the computation in a SessionModule proto.
-  Status RecordResult(const ShapedBuffer* result,
-                      SessionModule* session_module);
+  Status RecordResult(const ShapedBuffer* result, HloSnapshot* hlo_snapshot);
 
   // Returns a literal containing the contents of the given ShapedBuffer.
   StatusOr<std::unique_ptr<Literal>> LiteralFromShapedBuffer(
diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index 75961d49a5..345f5ddeb2 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -21,13 +21,6 @@ load(
     "tf_proto_library_py",
 )
 
-xla_proto_library(
-    name = "session_proto",
-    srcs = ["session.proto"],
-    visibility = ["//visibility:public"],
-    deps = ["//tensorflow/compiler/xla:xla_data_proto"],
-)
-
 xla_proto_library(
     name = "hlo_proto",
     srcs = ["hlo.proto"],
@@ -608,7 +601,6 @@ cc_library(
         ":hlo_module_config",
         ":hlo_proto_util",
         ":platform_util",
-        ":session_proto",
         ":source_map_util",
         ":transfer_manager",
         ":versioned_computation_handle",
@@ -766,7 +758,6 @@ cc_library(
         ":hlo_graph_dumper",
         ":hlo_proto",
         ":pool",
-        ":session_proto",
         ":shaped_buffer",
         ":versioned_computation_handle",
         "//tensorflow/compiler/xla:executable_run_options",
@@ -870,7 +861,6 @@ cc_library(
     hdrs = ["channel_tracker.h"],
     deps = [
         ":hlo",
-        ":session_proto",
         ":versioned_computation_handle",
         "//tensorflow/compiler/xla:status",
         "//tensorflow/compiler/xla:status_macros",
diff --git a/tensorflow/compiler/xla/service/channel_tracker.h b/tensorflow/compiler/xla/service/channel_tracker.h
index e415fb27e6..52f33a1318 100644
--- a/tensorflow/compiler/xla/service/channel_tracker.h
+++ b/tensorflow/compiler/xla/service/channel_tracker.h
@@ -19,7 +19,6 @@ limitations under the License.
 #include <map>
 
 #include "tensorflow/compiler/xla/service/hlo_module.h"
-#include "tensorflow/compiler/xla/service/session.pb.h"
 #include "tensorflow/compiler/xla/service/versioned_computation_handle.h"
 #include "tensorflow/compiler/xla/status.h"
 #include "tensorflow/compiler/xla/statusor.h"
diff --git a/tensorflow/compiler/xla/service/executable.cc b/tensorflow/compiler/xla/service/executable.cc
index 8119478ce9..6df172db8e 100644
--- a/tensorflow/compiler/xla/service/executable.cc
+++ b/tensorflow/compiler/xla/service/executable.cc
@@ -129,20 +129,6 @@ StatusOr<ScopedShapedBuffer> Executable::ExecuteOnStreamWrapper(
   return return_value;
 }
 
-Status Executable::DumpSessionModule() {
-  TF_RET_CHECK(dumping());
-  const string& directory_path =
-      module_config().debug_options().xla_dump_executions_to();
-  VersionedComputationHandle versioned_handle = entry_computation_handle();
-  // This filename does not include the version number because the computation
-  // is only ever executed at one version.
-  string filename = tensorflow::strings::Printf(
-      "computation_%lld__%s__execution_%lld", versioned_handle.handle.handle(),
-      session_module_->entry().name().c_str(), ++execution_count_);
-  return Executable::DumpToDirectory(directory_path, filename,
-                                     *session_module_);
-}
-
 Status Executable::DumpHloSnapshot() {
   TF_RET_CHECK(dumping_snapshot());
   TF_RET_CHECK(hlo_snapshot_->has_hlo() &&
@@ -156,26 +142,6 @@ Status Executable::DumpHloSnapshot() {
   return Executable::DumpToDirectory(directory_path, filename, *hlo_snapshot_);
 }
 
-/* static */ Status Executable::DumpToDirectory(
-    const string& directory_path, string filename,
-    const SessionModule& session_module) {
-  tensorflow::Env* env = tensorflow::Env::Default();
-  if (!env->IsDirectory(directory_path).ok()) {
-    // NB! CreateDir does not work reliably with multiple XLA threads -- two
-    // threads can race to observe the absence of the dump directory and
-    // simultaneously try to create it, causing the "losing" thread to get a
-    // "directory already exists" error.
-    TF_RETURN_IF_ERROR(env->RecursivelyCreateDir(directory_path));
-  }
-  filename = SanitizeFileName(std::move(filename));
-  string file_path = tensorflow::io::JoinPath(directory_path, filename);
-  string result;
-  TF_RET_CHECK(
-      tensorflow::SerializeToStringDeterministic(session_module, &result));
-  return tensorflow::WriteStringToFile(tensorflow::Env::Default(), file_path,
-                                       result);
-}
-
 /* static */ Status Executable::DumpToDirectory(
     const string& directory_path, string filename,
     const HloSnapshot& hlo_session) {
diff --git a/tensorflow/compiler/xla/service/executable.h b/tensorflow/compiler/xla/service/executable.h
index 4f0466c544..087bd14329 100644
--- a/tensorflow/compiler/xla/service/executable.h
+++ b/tensorflow/compiler/xla/service/executable.h
@@ -27,7 +27,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_graph_dumper.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/service_executable_run_options.h"
-#include "tensorflow/compiler/xla/service/session.pb.h"
 #include "tensorflow/compiler/xla/service/shaped_buffer.h"
 #include "tensorflow/compiler/xla/service/versioned_computation_handle.h"
 #include "tensorflow/compiler/xla/statusor.h"
@@ -144,14 +143,6 @@ class Executable {
     return hlo_module_->config().host_entry_computation_layout().result_shape();
   }
 
-  // TODO(b/74197823): Delete the session module dumping helpers.
-  void set_session_module(std::unique_ptr<xla::SessionModule> session_module) {
-    session_module_ = std::move(session_module);
-  }
-  bool dumping() const { return session_module_ != nullptr; }
-  SessionModule* session_module() const { return session_module_.get(); }
-  Status DumpSessionModule();
-
   // Dumping helpers.
   void set_hlo_snapshot(std::unique_ptr<xla::HloSnapshot> hlo_snapshot) {
     hlo_snapshot_ = std::move(hlo_snapshot);
@@ -160,10 +151,6 @@ class Executable {
   HloSnapshot* hlo_snapshot() const { return hlo_snapshot_.get(); }
   Status DumpHloSnapshot();
 
-  // Dump session_module to directory_path/filename.
-  static Status DumpToDirectory(const string& directory_path, string filename,
-                                const SessionModule& session_module);
-
   // Dump hlo snapshot to directory_path/filename.
   static Status DumpToDirectory(const string& directory_path, string filename,
                                 const HloSnapshot& hlo_session);
@@ -179,9 +166,6 @@ class Executable {
   // around.
   const std::unique_ptr<const HloModule> hlo_module_;
 
-  // SessionModule this was compiled from. Null if not dumping executions.
-  std::unique_ptr<SessionModule> session_module_;
-
   // HloSnapshot this was compiled from. Null if not dumping executions.
   std::unique_ptr<HloSnapshot> hlo_snapshot_;
 
diff --git a/tensorflow/compiler/xla/service/service.cc b/tensorflow/compiler/xla/service/service.cc
index 82be6bcf4f..d01c35b992 100644
--- a/tensorflow/compiler/xla/service/service.cc
+++ b/tensorflow/compiler/xla/service/service.cc
@@ -36,7 +36,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_module_config.h"
 #include "tensorflow/compiler/xla/service/hlo_proto_util.h"
 #include "tensorflow/compiler/xla/service/platform_util.h"
-#include "tensorflow/compiler/xla/service/session.pb.h"
 #include "tensorflow/compiler/xla/service/source_map_util.h"
 #include "tensorflow/compiler/xla/service/transfer_manager.h"
 #include "tensorflow/compiler/xla/shape_layout.h"
@@ -62,33 +61,6 @@ namespace xla {
 
 namespace {
 
-// Records the arguments used to invoke a computation in a SessionModule
-// proto.
-Status RecordArguments(
-    const tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
-    se::StreamExecutor* executor, TransferManager* transfer_manager,
-    SessionModule* module) {
-  module->clear_arguments();
-  for (const ShapedBuffer* argument : arguments) {
-    TF_ASSIGN_OR_RETURN(
-        std::unique_ptr<Literal> literal,
-        transfer_manager->TransferLiteralFromDevice(executor, *argument));
-    *module->add_arguments() = literal->ToProto();
-  }
-  return Status::OK();
-}
-
-// Records the result of a computation in a SessionModule proto.
-Status RecordResult(const ShapedBuffer& result, se::StreamExecutor* executor,
-                    TransferManager* transfer_manager, SessionModule* module) {
-  module->clear_result();
-  TF_ASSIGN_OR_RETURN(
-      std::unique_ptr<Literal> literal,
-      transfer_manager->TransferLiteralFromDevice(executor, result));
-  *module->mutable_result() = literal->ToProto();
-  return Status::OK();
-}
-
 // Records the arguments used to invoke a computation in an HloSnapshot proto.
 Status RecordArguments(
     const tensorflow::gtl::ArraySlice<const ShapedBuffer*> arguments,
diff --git a/tensorflow/compiler/xla/service/service.h b/tensorflow/compiler/xla/service/service.h
index 422bb95657..d64b2b4d0a 100644
--- a/tensorflow/compiler/xla/service/service.h
+++ b/tensorflow/compiler/xla/service/service.h
@@ -33,7 +33,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_execution_profile.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_module_config.h"
-#include "tensorflow/compiler/xla/service/session.pb.h"
 #include "tensorflow/compiler/xla/service/versioned_computation_handle.h"
 #include "tensorflow/compiler/xla/service_interface.h"
 #include "tensorflow/compiler/xla/statusor.h"
diff --git a/tensorflow/compiler/xla/service/session.proto b/tensorflow/compiler/xla/service/session.proto
deleted file mode 100644
index bb8d1cd2a1..0000000000
--- a/tensorflow/compiler/xla/service/session.proto
+++ /dev/null
@@ -1,85 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// This proto file defines messages which store the state of XLA
-// computations within the XLA service. A computation is stored as a record
-// of the operation requests used to build it.
-syntax = "proto3";
-
-import "tensorflow/compiler/xla/xla_data.proto";
-
-package xla;
-
-// Describes a single operation request.
-message OperationRequest {
-  ComputationDataHandle output_handle = 1;
-  Shape output_shape = 2;
-
-  // For operations which call embedded computations such as "Map", these are
-  // the version(s) that the embedded computation should be called at. A version
-  // value of a computation is the ComputationDataHandle of the root of the
-  // computation at the point in time.
-  //
-  // "Call", "Map", "Reduce", and "ReduceWindow" operations take a single
-  // embedded computation so this field will have a single value for those
-  // operations.
-  //
-  // "While" operation takes two; index 0 is the "condition" version and index 1
-  // is the "body" version.
-  repeated int64 embedded_computation_versions = 3;
-
-  // The actual request, which in itself is a tagged union of all possible
-  // operation request types.
-  OpRequest request = 4;
-}
-
-// Describes a sequence of operation requests which define an XLA
-// computation.
-message SessionComputation {
-  string name = 1;
-
-  // The ComputationHandle used to refer to this computation in the XLA
-  // service.
-  ComputationHandle computation_handle = 2;
-
-  // Map from ComputationDataHandle value to operation request. The highest
-  // ComputationDataHandle value corresponds to the root of the computation.
-  map<int64, OperationRequest> requests = 3;
-}
-
-// Describes a group of SessionComputations with an "entry point" computation
-// that may refer to the other non-entry (AKA embedded) computations.
-//
-// This message is used to serialize a computation that has been built via the
-// XLA service API, along with its dependencies, for purposes such as
-// analysis/replay/file-storage.
-message SessionModule {
-  // The entry computation, which was requested for serialization. This may have
-  // referred to embedded computations, which are reflected below.
-  SessionComputation entry = 1;
-
-  // Embedded computations that are transitively referred to by the entry
-  // computation.
-  repeated SessionComputation embedded_computations = 2;
-
-  // The arguments passed to the computation.
-  repeated LiteralProto arguments = 3;
-
-  // The result of the computation.
-  LiteralProto result = 4;
-
-  // The name of the platform used to run the computation.
-  string execution_platform = 5;
-}
diff --git a/tensorflow/compiler/xla/tools/BUILD b/tensorflow/compiler/xla/tools/BUILD
index d73bcdaf82..ff5340ee3f 100644
--- a/tensorflow/compiler/xla/tools/BUILD
+++ b/tensorflow/compiler/xla/tools/BUILD
@@ -135,7 +135,7 @@ tf_cc_binary(
     deps = [
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:types",
-        "//tensorflow/compiler/xla/service:session_proto",
+        "//tensorflow/compiler/xla/service:hlo_proto",
         "//tensorflow/core:lib",
     ],
 )
diff --git a/tensorflow/compiler/xla/tools/convert_computation.cc b/tensorflow/compiler/xla/tools/convert_computation.cc
index fe03a6e7bd..14d01b5bfb 100644
--- a/tensorflow/compiler/xla/tools/convert_computation.cc
+++ b/tensorflow/compiler/xla/tools/convert_computation.cc
@@ -21,7 +21,7 @@ limitations under the License.
 #include <unistd.h>
 #include <string>
 
-#include "tensorflow/compiler/xla/service/session.pb.h"
+#include "tensorflow/compiler/xla/service/hlo.pb.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/platform/env.h"
@@ -33,7 +33,7 @@ namespace xla {
 namespace tools {
 
 void RealMain(const string& mode, const string& path) {
-  SessionModule module;
+  HloSnapshot module;
   tensorflow::Env* env = tensorflow::Env::Default();
   if (mode == "txt2bin") {
     TF_CHECK_OK(tensorflow::ReadTextProto(env, path, &module));
diff --git a/tensorflow/compiler/xla/xla.proto b/tensorflow/compiler/xla/xla.proto
index f619b8dc24..53ba120d21 100644
--- a/tensorflow/compiler/xla/xla.proto
+++ b/tensorflow/compiler/xla/xla.proto
@@ -17,7 +17,6 @@ syntax = "proto3";
 
 import "tensorflow/compiler/xla/xla_data.proto";
 import "tensorflow/compiler/xla/service/hlo.proto";
-import "tensorflow/compiler/xla/service/session.proto";
 
 package xla;
 
@@ -230,14 +229,6 @@ message SnapshotComputationRequest {
   ComputationHandle computation = 1;
 }
 
-message SnapshotComputationResponse {
-  SessionModule module = 1;
-}
-
-message LoadComputationSnapshotRequest {
-  SessionModule module = 1;
-}
-
 message LoadComputationSnapshotResponse {
   ComputationHandle computation = 1;
 }
-- 
GitLab


From e0c9871e2a8dbe5e07f59c8788b0914d5079b04f Mon Sep 17 00:00:00 2001
From: Taylor Robie <taylorrobie@google.com>
Date: Tue, 5 Jun 2018 14:55:14 -0700
Subject: [PATCH 335/610] Typo fix in suggested pip message for tpu cluster
 resolver.

PiperOrigin-RevId: 199362908
---
 .../cluster_resolver/python/training/tpu_cluster_resolver.py    | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py
index d44e23aadc..a5a9630a4a 100644
--- a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py
+++ b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py
@@ -173,7 +173,7 @@ class TPUClusterResolver(ClusterResolver):
         raise ImportError('googleapiclient and oauth2client must be installed '
                           'before using the TPU cluster resolver. Execute: '
                           '`pip install --upgrade google-api-python-client` '
-                          'and `pip install --upgrade oauth2lclient` to '
+                          'and `pip install --upgrade oauth2client` to '
                           'install with pip.')
 
       final_discovery_url = self._discoveryUrl() or discovery_url
-- 
GitLab


From 7638924989e42105000048af2af0b6cb8bc4956c Mon Sep 17 00:00:00 2001
From: Dan Moldovan <mdan@google.com>
Date: Tue, 5 Jun 2018 15:19:24 -0700
Subject: [PATCH 336/610] Correctly implement the checks for getanno.

PiperOrigin-RevId: 199366963
---
 tensorflow/contrib/autograph/pyct/anno.py      | 2 +-
 tensorflow/contrib/autograph/pyct/anno_test.py | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/autograph/pyct/anno.py b/tensorflow/contrib/autograph/pyct/anno.py
index 81d5b93da1..ae861627fd 100644
--- a/tensorflow/contrib/autograph/pyct/anno.py
+++ b/tensorflow/contrib/autograph/pyct/anno.py
@@ -51,7 +51,7 @@ FAIL = object()
 
 def getanno(node, key, default=FAIL, field_name='___pyct_anno'):
   if (default is FAIL or
-      (hasattr(node, field_name) and getattr(node, field_name)[key])):
+      (hasattr(node, field_name) and (key in getattr(node, field_name)))):
     return getattr(node, field_name)[key]
   else:
     return default
diff --git a/tensorflow/contrib/autograph/pyct/anno_test.py b/tensorflow/contrib/autograph/pyct/anno_test.py
index d4caa3dd11..f2c0c8cf05 100644
--- a/tensorflow/contrib/autograph/pyct/anno_test.py
+++ b/tensorflow/contrib/autograph/pyct/anno_test.py
@@ -38,7 +38,8 @@ class AnnoTest(test.TestCase):
 
     anno.setanno(node, 'foo', 3)
     self.assertTrue(anno.hasanno(node, 'foo'))
-    self.assertEqual(3, anno.getanno(node, 'foo'))
+    self.assertEqual(anno.getanno(node, 'foo'), 3)
+    self.assertEqual(anno.getanno(node, 'bar', default=7), 7)
 
     anno.delanno(node, 'foo')
     self.assertFalse(anno.hasanno(node, 'foo'))
-- 
GitLab


From 0349be6b6f0af28b3446ab66ed578f691f8b054f Mon Sep 17 00:00:00 2001
From: Asim Shankar <ashankar@google.com>
Date: Tue, 5 Jun 2018 14:28:20 -0700
Subject: [PATCH 337/610] contrib/eager/python/datasets: Resource naming
 workaround.

tensorflow/contrib/eager/python/datasets_test.py was failing on GPU
because two tests - testTensorsPlacedOnDevice() and
testTensorsExplicitPrefetchToDevice() were creating
FunctionBufferResources with the same shared_name, leading to
unintentional interference.

This change will make the tests pass and allow the use of
tf.contrib.eager.Iterator and
tf.data.Dataset.apply(prefetching_ops.prefetch_to_device)
in the same process without interference.

However, a more appropriate fix would probably be to use
anonymous function buffering resources (similar to
AnonymousIteratorHandle) when eager execution is enabled,
doing away with sharing by name.
---
 tensorflow/contrib/eager/python/datasets.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/eager/python/datasets.py b/tensorflow/contrib/eager/python/datasets.py
index d7909dd5a2..adf92c27ea 100644
--- a/tensorflow/contrib/eager/python/datasets.py
+++ b/tensorflow/contrib/eager/python/datasets.py
@@ -106,7 +106,8 @@ class Iterator(iterator_ops.EagerIterator, checkpointable.CheckpointableBase):
             target_device=target,
             buffer_size=10,
             container="",
-            shared_name=_generate_shared_name("function_buffer_resource"))
+            shared_name=_generate_shared_name(
+                "contrib_eager_iterator_function_buffer_resource"))
         self._buffer_resource_deleter = resource_variable_ops.EagerResourceDeleter(  # pylint: disable=line-too-long
             handle=self._buffer_resource_handle,
             handle_device=self._device)
-- 
GitLab


From 5f8da6dd1e90e2c369f088f80c79c87b6dc8c0da Mon Sep 17 00:00:00 2001
From: Amit Patankar <amitpatankar@google.com>
Date: Tue, 5 Jun 2018 11:11:16 -0700
Subject: [PATCH 338/610] Fixing the adamax_test rtol to be more lenient.

---
 tensorflow/contrib/opt/python/training/adamax_test.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/opt/python/training/adamax_test.py b/tensorflow/contrib/opt/python/training/adamax_test.py
index 21bf3f5313..a059aae130 100644
--- a/tensorflow/contrib/opt/python/training/adamax_test.py
+++ b/tensorflow/contrib/opt/python/training/adamax_test.py
@@ -224,8 +224,8 @@ class AdaMaxOptimizerTest(test.TestCase):
           var1_np, m1, v1 = adamax_update_numpy(var1_np, grads1_np, t, m1, v1)
 
           # Validate updated params
-          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
-          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0), rtol=1e-2)
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1), rtol=1e-2)
           if use_resource:
             self.assertEqual("var0_%d/AdaMax:0" % (i,),
                              opt.get_slot(var=var0, name="m").name)
-- 
GitLab


From dbe7fd6840d77364485064b2e23664133c7063c6 Mon Sep 17 00:00:00 2001
From: Amit Patankar <amitpatankar@google.com>
Date: Tue, 5 Jun 2018 11:31:55 -0700
Subject: [PATCH 339/610] Fixing line too long.

---
 tensorflow/contrib/opt/python/training/adamax_test.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/opt/python/training/adamax_test.py b/tensorflow/contrib/opt/python/training/adamax_test.py
index a059aae130..915e6504e1 100644
--- a/tensorflow/contrib/opt/python/training/adamax_test.py
+++ b/tensorflow/contrib/opt/python/training/adamax_test.py
@@ -224,8 +224,10 @@ class AdaMaxOptimizerTest(test.TestCase):
           var1_np, m1, v1 = adamax_update_numpy(var1_np, grads1_np, t, m1, v1)
 
           # Validate updated params
-          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0), rtol=1e-2)
-          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1), rtol=1e-2)
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0),
+                                             rtol=1e-2)
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1),
+                                             rtol=1e-2)
           if use_resource:
             self.assertEqual("var0_%d/AdaMax:0" % (i,),
                              opt.get_slot(var=var0, name="m").name)
-- 
GitLab


From 3edabec18a47e41f2cfc71d4e3a4280b77881f83 Mon Sep 17 00:00:00 2001
From: Amit Patankar <amitpatankar@google.com>
Date: Tue, 5 Jun 2018 10:28:38 -0700
Subject: [PATCH 340/610] Change order of installations.

---
 tensorflow/tools/ci_build/install/install_pip_packages.sh  | 7 ++++---
 .../ci_build/install/install_python3.5_pip_packages.sh     | 4 +++-
 .../ci_build/install/install_python3.6_pip_packages.sh     | 4 +++-
 3 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/tensorflow/tools/ci_build/install/install_pip_packages.sh b/tensorflow/tools/ci_build/install/install_pip_packages.sh
index bd6c50bce9..dba2dfc490 100755
--- a/tensorflow/tools/ci_build/install/install_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_pip_packages.sh
@@ -21,9 +21,6 @@ set -e
 easy_install -U pip==9.0.3
 easy_install3 -U pip==9.0.3
 
-pip2 install --upgrade setuptools==39.1.0
-pip3 install --upgrade setuptools==39.1.0
-
 # Install pip packages from whl files to avoid the time-consuming process of
 # building from source.
 
@@ -57,6 +54,10 @@ pip3 install --upgrade markdown==2.6.8
 pip2 install --upgrade protobuf==3.3.0
 pip3 install --upgrade protobuf==3.3.0
 
+# Install last working version of setuptools.
+pip2 install --upgrade setuptools==39.1.0
+pip3 install --upgrade setuptools==39.1.0
+
 # Remove obsolete version of six, which can sometimes confuse virtualenv.
 rm -rf /usr/lib/python3/dist-packages/six*
 
diff --git a/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
index 0844c48980..e1978cd7d8 100755
--- a/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
@@ -39,7 +39,6 @@ if [[ -z $pip35_version ]]; then
 fi
 
 set -e
-pip3.5 install --upgrade setuptools==39.1.0
 pip3.5 install --upgrade pip
 
 pip3.5 install --upgrade virtualenv
@@ -51,6 +50,9 @@ pip3.5 install --upgrade six==1.10.0
 # Install protobuf.
 pip3.5 install --upgrade protobuf==3.3.0
 
+# Install last working version of setuptools.
+pip3.5 install --upgrade setuptools==39.1.0
+
 # Remove obsolete version of six, which can sometimes confuse virtualenv.
 rm -rf /usr/lib/python3/dist-packages/six*
 
diff --git a/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh b/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh
index fb183b0e4f..0ffb8e67a4 100755
--- a/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh
@@ -49,7 +49,6 @@ cd Python-3.6.1
 make altinstall
 ln -s /usr/local/bin/pip3.6 /usr/local/bin/pip3
 
-pip3 install --upgrade setuptools==39.1.0
 pip3 install --upgrade pip
 
 pip3 install --upgrade virtualenv
@@ -63,6 +62,9 @@ pip3 install --upgrade six==1.10.0
 # Install protobuf.
 pip3 install --upgrade protobuf==3.3.0
 
+# Install last working version of setuptools.
+pip3 install --upgrade setuptools==39.1.0
+
 # Remove obsolete version of six, which can sometimes confuse virtualenv.
 rm -rf /usr/lib/python3/dist-packages/six*
 
-- 
GitLab


From 2080782ad2323a496847e526056b7d32153881a1 Mon Sep 17 00:00:00 2001
From: Amit Patankar <amitpatankar@google.com>
Date: Tue, 5 Jun 2018 10:31:47 -0700
Subject: [PATCH 341/610] Making setuptools the last install to ensure it's
 accurate.

---
 tensorflow/tools/ci_build/install/install_pip_packages.sh | 8 ++++----
 .../ci_build/install/install_python3.5_pip_packages.sh    | 6 +++---
 .../ci_build/install/install_python3.6_pip_packages.sh    | 6 +++---
 3 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/tensorflow/tools/ci_build/install/install_pip_packages.sh b/tensorflow/tools/ci_build/install/install_pip_packages.sh
index dba2dfc490..b3d3f23ec8 100755
--- a/tensorflow/tools/ci_build/install/install_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_pip_packages.sh
@@ -54,10 +54,6 @@ pip3 install --upgrade markdown==2.6.8
 pip2 install --upgrade protobuf==3.3.0
 pip3 install --upgrade protobuf==3.3.0
 
-# Install last working version of setuptools.
-pip2 install --upgrade setuptools==39.1.0
-pip3 install --upgrade setuptools==39.1.0
-
 # Remove obsolete version of six, which can sometimes confuse virtualenv.
 rm -rf /usr/lib/python3/dist-packages/six*
 
@@ -113,3 +109,7 @@ pip2 install --upgrade gast
 pip3 install --upgrade gast
 pip2 install --upgrade termcolor
 pip3 install --upgrade termcolor
+
+# Install last working version of setuptools.
+pip2 install --upgrade setuptools==39.1.0
+pip3 install --upgrade setuptools==39.1.0
diff --git a/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
index e1978cd7d8..61d34c7304 100755
--- a/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh
@@ -50,9 +50,6 @@ pip3.5 install --upgrade six==1.10.0
 # Install protobuf.
 pip3.5 install --upgrade protobuf==3.3.0
 
-# Install last working version of setuptools.
-pip3.5 install --upgrade setuptools==39.1.0
-
 # Remove obsolete version of six, which can sometimes confuse virtualenv.
 rm -rf /usr/lib/python3/dist-packages/six*
 
@@ -84,4 +81,7 @@ pip3.5 install --upgrade astor
 pip3.5 install --upgrade gast
 pip3.5 install --upgrade termcolor
 
+# Install last working version of setuptools.
+pip3.5 install --upgrade setuptools==39.1.0
+
 # LINT.ThenChange(//tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh)
diff --git a/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh b/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh
index 0ffb8e67a4..fe2d2cf11c 100755
--- a/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh
+++ b/tensorflow/tools/ci_build/install/install_python3.6_pip_packages.sh
@@ -62,9 +62,6 @@ pip3 install --upgrade six==1.10.0
 # Install protobuf.
 pip3 install --upgrade protobuf==3.3.0
 
-# Install last working version of setuptools.
-pip3 install --upgrade setuptools==39.1.0
-
 # Remove obsolete version of six, which can sometimes confuse virtualenv.
 rm -rf /usr/lib/python3/dist-packages/six*
 
@@ -100,4 +97,7 @@ pip3 install --upgrade astor
 pip3 install --upgrade gast
 pip3 install --upgrade termcolor
 
+# Install last working version of setuptools.
+pip3 install --upgrade setuptools==39.1.0
+
 # LINT.ThenChange(//tensorflow/tools/ci_build/install/install_python3.5_pip_packages.sh)
-- 
GitLab


From 5c1c4fc8384595e663c970de29fa2374366eb15d Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev <ezhulenev@google.com>
Date: Tue, 5 Jun 2018 15:47:59 -0700
Subject: [PATCH 342/610] Move fold-transpose and fold-conjugate optimizations
 into stages.

PiperOrigin-RevId: 199371452
---
 .../optimizers/arithmetic_optimizer.cc        | 307 +++++++++---------
 .../optimizers/arithmetic_optimizer.h         |  21 +-
 .../optimizers/arithmetic_optimizer_test.cc   | 120 ++++---
 3 files changed, 239 insertions(+), 209 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
index 2408652c87..44a14ef7eb 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
@@ -101,38 +101,6 @@ bool ValuesFromConstNode(const NodeDef& node, std::vector<T>* values) {
   return false;
 }
 
-template <typename T>
-bool IsInnerMatrixTranspose(const std::vector<T>& perm) {
-  const T n = perm.size();
-  if (n < 2) {
-    return false;
-  }
-  for (T i = 0; i < n - 2; ++i) {
-    if (perm[i] != i) {
-      return false;
-    }
-  }
-  return perm[n - 1] == n - 2 && perm[n - 2] == n - 1;
-}
-
-bool IsInnerMatrixTransposeNode(const NodeDef& transpose_node,
-                                const NodeMap* node_map) {
-  if (transpose_node.op() != "Transpose" &&
-      transpose_node.op() != "ConjugateTranspose") {
-    return false;
-  }
-  const NodeDef* perm_node = node_map->GetNode(transpose_node.input(1));
-  std::vector<int> perm32;
-  if (ValuesFromConstNode(*perm_node, &perm32)) {
-    return IsInnerMatrixTranspose(perm32);
-  }
-  std::vector<int64> perm64;
-  if (ValuesFromConstNode(*perm_node, &perm64)) {
-    return IsInnerMatrixTranspose(perm64);
-  }
-  return false;
-}
-
 bool MaybeAddControlInput(const string& new_input, NodeDef* node,
                           GraphDef* graph, NodeMap* node_map) {
   bool already_exists = false;
@@ -155,12 +123,6 @@ void SetDataTypeToAttr(DataType dtype, const string& attr_name, NodeDef* node) {
   (*node->mutable_attr())[attr_name].set_type(dtype);
 }
 
-void FlipBooleanAttr(const string& attr_name, NodeDef* node) {
-  const bool old_value =
-      !node->attr().count(attr_name) ? false : node->attr().at(attr_name).b();
-  (*node->mutable_attr())[attr_name].set_b(!old_value);
-}
-
 string SourceDataTypeAttrName(const NodeDef& node) {
   if (node.op() == "Bitcast") {
     return "T";
@@ -2079,6 +2041,153 @@ class FoldMultiplyIntoConv : public ArithmeticOptimizerStage {
   }
 };
 
+// Fold Transpose into matrix multiplication.
+class FoldTransposeIntoMatMul : public ArithmeticOptimizerStage {
+ public:
+  explicit FoldTransposeIntoMatMul(const GraphOptimizerContext& ctx,
+                                   const ArithmeticOptimizerContext& ctx_ext)
+      : ArithmeticOptimizerStage("FoldTransposeIntoMatMul", ctx, ctx_ext) {}
+  ~FoldTransposeIntoMatMul() override = default;
+
+  bool IsSupported(const NodeDef* node) const override {
+    return IsMatMul(*node);
+  }
+
+  Status TrySimplify(NodeDef* node, string* simplified_node_name) override {
+    const NodeScopeAndName matmul = ParseNodeScopeAndName(node->name());
+    const string optimized_node_name = OptimizedNodeName(matmul);
+    if (ctx().node_map->NodeExists(optimized_node_name)) return Status::OK();
+
+    NodeDef* a;
+    NodeDef* b;
+    TF_RETURN_IF_ERROR(GetInputNode(node->input(0), &a));
+    TF_RETURN_IF_ERROR(GetInputNode(node->input(1), &b));
+
+    bool is_complex = false;
+    if (node->op() != "SparseMatMul") {
+      const DataType type = GetDataTypeFromAttr(*node, "T");
+      is_complex = (type == DT_COMPLEX64) || (type == DT_COMPLEX128);
+    }
+
+    const std::set<string> foldable_transpose_ops =
+        !is_complex ? std::set<string>{"ConjugateTranspose", "Transpose"}
+                    : (node->op() == "BatchMatMul"
+                           ? std::set<string>{"ConjugateTranspose"}
+                           : std::set<string>{"Transpose"});
+
+    const bool a_is_foldable = foldable_transpose_ops.count(a->op()) > 0 &&
+                               IsInnerMatrixTransposeNode(*a, ctx().node_map);
+    const bool b_is_foldable = foldable_transpose_ops.count(b->op()) > 0 &&
+                               IsInnerMatrixTransposeNode(*b, ctx().node_map);
+    if (!a_is_foldable && !b_is_foldable) return Status::OK();
+
+    NodeDef* new_op = AddCopyNode(optimized_node_name, node);
+
+    if (a_is_foldable) {
+      const string attr_a =
+          node->op() == "BatchMatMul" ? "adj_x" : "transpose_a";
+      FlipBooleanAttr(attr_a, new_op);
+      new_op->set_input(0, a->input(0));
+      ctx().node_map->UpdateInput(new_op->name(), a->name(), a->input(0));
+    }
+
+    if (b_is_foldable) {
+      const string attr_b =
+          node->op() == "BatchMatMul" ? "adj_y" : "transpose_b";
+      FlipBooleanAttr(attr_b, new_op);
+      new_op->set_input(1, b->input(0));
+      ctx().node_map->UpdateInput(new_op->name(), b->name(), b->input(0));
+    }
+
+    std::vector<const NodeDef*> deps_to_forward = {node};
+    if (a_is_foldable) deps_to_forward.push_back(a);
+    if (b_is_foldable) deps_to_forward.push_back(b);
+    ForwardControlDependencies(new_op, deps_to_forward);
+
+    return Status::OK();
+  }
+
+ private:
+  void FlipBooleanAttr(const string& attr_name, NodeDef* node) {
+    const bool old_value =
+        !node->attr().count(attr_name) ? false : node->attr().at(attr_name).b();
+    (*node->mutable_attr())[attr_name].set_b(!old_value);
+  }
+
+  template <typename T>
+  bool IsInnerMatrixTranspose(const std::vector<T>& perm) {
+    const T n = perm.size();
+    if (n < 2) {
+      return false;
+    }
+    for (T i = 0; i < n - 2; ++i) {
+      if (perm[i] != i) {
+        return false;
+      }
+    }
+    return perm[n - 1] == n - 2 && perm[n - 2] == n - 1;
+  }
+
+  bool IsInnerMatrixTransposeNode(const NodeDef& transpose_node,
+                                  const NodeMap* node_map) {
+    if (transpose_node.op() != "Transpose" &&
+        transpose_node.op() != "ConjugateTranspose") {
+      return false;
+    }
+    const NodeDef* perm_node = node_map->GetNode(transpose_node.input(1));
+    std::vector<int> perm32;
+    if (ValuesFromConstNode(*perm_node, &perm32)) {
+      return IsInnerMatrixTranspose(perm32);
+    }
+    std::vector<int64> perm64;
+    if (ValuesFromConstNode(*perm_node, &perm64)) {
+      return IsInnerMatrixTranspose(perm64);
+    }
+    return false;
+  }
+};
+
+// Fold Transpose into matrix multiplication.
+class FoldConjugateIntoTranspose : public ArithmeticOptimizerStage {
+ public:
+  explicit FoldConjugateIntoTranspose(const GraphOptimizerContext& ctx,
+                                      const ArithmeticOptimizerContext& ctx_ext)
+      : ArithmeticOptimizerStage("FoldConjugateIntoTranspose", ctx, ctx_ext) {}
+  ~FoldConjugateIntoTranspose() override = default;
+
+  bool IsSupported(const NodeDef* node) const override {
+    return IsConj(*node) || IsTranspose(*node) || IsConjugateTranspose(*node);
+  }
+
+  Status TrySimplify(NodeDef* node, string* simplified_node_name) override {
+    const NodeScopeAndName matmul = ParseNodeScopeAndName(node->name());
+    const string optimized_node_name = OptimizedNodeName(matmul);
+    if (ctx().node_map->NodeExists(optimized_node_name)) return Status::OK();
+
+    NodeDef* input;
+    TF_RETURN_IF_ERROR(GetInputNode(node->input(0), &input));
+
+    const NodeDef* transpose_op = node->op() == "Conj" ? input : node;
+    const NodeDef* conj_op = node->op() == "Conj" ? node : input;
+
+    if ((IsTranspose(*transpose_op) || IsConjugateTranspose(*transpose_op)) &&
+        IsConj(*conj_op)) {
+      NodeDef* new_op = AddCopyNode(optimized_node_name, transpose_op);
+
+      // Flip the type of transpose op to absorb the conjugation.
+      new_op->set_op(transpose_op->op() == "Transpose" ? "ConjugateTranspose"
+                                                       : "Transpose");
+      new_op->set_input(0, input->input(0));
+      ctx().node_map->UpdateInput(new_op->name(), node->name(),
+                                  input->input(0));
+      ForwardControlDependencies(new_op, {node, input});
+      *simplified_node_name = new_op->name();
+    }
+
+    return Status::OK();
+  }
+};
+
 // Replace Mul node with identical inputs with a Square.
 class ReplaceMulWithSquare : public ArithmeticOptimizerStage {
  public:
@@ -2323,33 +2432,6 @@ bool UniqueNodes::SameNode(const NodeDef& node1, const NodeDef& node2) const {
   return true;
 }
 
-NodeDef* ArithmeticOptimizer::AddNode(const NodeDef& node, StringPiece suffix,
-                                      bool copy_node) {
-  return AddNode(OptimizedNodeName(node, suffix), copy_node ? &node : nullptr);
-}
-
-NodeDef* ArithmeticOptimizer::AddNode(const string& name,
-                                      const NodeDef* node_to_copy) {
-  NodeDef* new_node = optimized_graph_->add_node();
-  node_map_->AddNode(NodeName(name), new_node);
-  if (node_to_copy != nullptr) {
-    *new_node = *node_to_copy;
-  }
-  new_node->set_name(name);
-  return new_node;
-}
-
-string ArithmeticOptimizer::OptimizedNodeName(const NodeDef& node,
-                                              StringPiece suffix) const {
-  return AddPrefixToNodeName(strings::StrCat(node.name(), "_", suffix),
-                             kArithmeticOptimizer);
-}
-
-bool ArithmeticOptimizer::OptimizedNodeExists(const NodeDef& node,
-                                              StringPiece suffix) const {
-  return node_map_->NodeExists(OptimizedNodeName(node, suffix));
-}
-
 namespace {
 
 bool FeedsInPlaceOp(const SimpleGraphView& graph_view, const NodeDef& node) {
@@ -2473,83 +2555,6 @@ void ArithmeticOptimizer::ForwardControlDependencies(
   DedupControlInputs(target_node);
 }
 
-// TODO(ezhulenev): extract each individual simplify rewrite into separate
-// ArithmeticOptimizerStage
-string ArithmeticOptimizer::TrySimplifyAndReplaceUses(
-    const NodeDef* node, SetVector<NodeDef*>* nodes_to_simplify) {
-  // Fold Transpose into matrix multiplication.
-  if ((node->op() == "MatMul" || node->op() == "SparseMatMul" ||
-       node->op() == "BatchMatMul") &&
-      !OptimizedNodeExists(*node, "fused")) {
-    const NodeDef* a = node_map_->GetNode(node->input(0));
-    const NodeDef* b = node_map_->GetNode(node->input(1));
-    bool is_complex = false;
-    if (node->op() != "SparseMatMul") {
-      const DataType type = GetDataTypeFromAttr(*node, "T");
-      is_complex = (type == DT_COMPLEX64) || (type == DT_COMPLEX128);
-    }
-    const std::set<string> foldable_transpose_ops =
-        !is_complex ? std::set<string>{"ConjugateTranspose", "Transpose"}
-                    : (node->op() == "BatchMatMul"
-                           ? std::set<string>{"ConjugateTranspose"}
-                           : std::set<string>{"Transpose"});
-    const bool a_is_foldable = foldable_transpose_ops.count(a->op()) > 0 &&
-                               IsInnerMatrixTransposeNode(*a, node_map_.get());
-    const bool b_is_foldable = foldable_transpose_ops.count(b->op()) > 0 &&
-                               IsInnerMatrixTransposeNode(*b, node_map_.get());
-    if (a_is_foldable || b_is_foldable) {
-      NodeDef* new_op = AddNode(*node, "fused", /*copy_node=*/true);
-      if (a_is_foldable) {
-        const string attr_a =
-            node->op() == "BatchMatMul" ? "adj_x" : "transpose_a";
-        FlipBooleanAttr(attr_a, new_op);
-        new_op->set_input(0, a->input(0));
-        node_map_->UpdateInput(new_op->name(), a->name(), a->input(0));
-      }
-      if (b_is_foldable) {
-        const string attr_b =
-            node->op() == "BatchMatMul" ? "adj_y" : "transpose_b";
-        FlipBooleanAttr(attr_b, new_op);
-        new_op->set_input(1, b->input(0));
-        node_map_->UpdateInput(new_op->name(), b->name(), b->input(0));
-      }
-      std::vector<const NodeDef*> deps_to_forward({node});
-      if (a_is_foldable) {
-        deps_to_forward.push_back(a);
-      }
-      if (b_is_foldable) {
-        deps_to_forward.push_back(b);
-      }
-      ForwardControlDependencies(new_op, deps_to_forward);
-    }
-  }
-
-  // Fold Conj into Transpose or ConjugateTranspose.
-  if ((node->op() == "Conj" || node->op() == "Transpose" ||
-       node->op() == "ConjugateTranspose") &&
-      !OptimizedNodeExists(*node, "fused")) {
-    const NodeDef* input = node_map_->GetNode(node->input(0));
-    const NodeDef* transpose_op = node->op() == "Conj" ? input : node;
-    const NodeDef* conj_op = node->op() == "Conj" ? node : input;
-
-    if ((transpose_op->op() == "Transpose" ||
-         transpose_op->op() == "ConjugateTranspose") &&
-        conj_op->op() == "Conj") {
-      NodeDef* new_op =
-          AddNode(OptimizedNodeName(*node, "fused"), transpose_op);
-      // Flip the type of transpose op to absorb the conjugation.
-      new_op->set_op(transpose_op->op() == "Transpose" ? "ConjugateTranspose"
-                                                       : "Transpose");
-      new_op->set_input(0, input->input(0));
-      node_map_->UpdateInput(new_op->name(), node->name(), input->input(0));
-      ForwardControlDependencies(new_op, {node, input});
-      return new_op->name();
-    }
-  }
-
-  return "";
-}
-
 Status ArithmeticOptimizer::SimplifyArithmeticOps(bool can_use_shapes) {
   SetVector<NodeDef*> nodes_to_simplify;
   nodes_to_simplify.Reserve(optimized_graph_->node_size());
@@ -2567,8 +2572,12 @@ Status ArithmeticOptimizer::SimplifyArithmeticOps(bool can_use_shapes) {
 
   if (options_.combine_add_to_addn && can_use_shapes)
     pipeline.AddStage<AddOpsRewriteStage>(ctx, ctx_ext);
+  if (options_.fold_conjugate_into_transpose)
+    pipeline.AddStage<FoldConjugateIntoTranspose>(ctx, ctx_ext);
   if (options_.fold_multiply_into_conv)
     pipeline.AddStage<FoldMultiplyIntoConv>(ctx, ctx_ext);
+  if (options_.fold_transpose_into_matmul)
+    pipeline.AddStage<FoldTransposeIntoMatMul>(ctx, ctx_ext);
   if (options_.hoist_common_factor_out_of_aggregation && can_use_shapes)
     pipeline.AddStage<HoistCommonFactorOutOfAggregation>(ctx, ctx_ext);
   if (options_.minimize_broadcasts && can_use_shapes)
@@ -2606,19 +2615,11 @@ Status ArithmeticOptimizer::SimplifyArithmeticOps(bool can_use_shapes) {
   while (!nodes_to_simplify.Empty()) {
     NodeDef* node = nodes_to_simplify.PopBack();
 
-    // TODO(ezhulenev): move all rewrites into separate stages
     string simplified_tensor = "";
-    if (options_.enable_try_simplify_and_replace) {
-      simplified_tensor = TrySimplifyAndReplaceUses(node, &nodes_to_simplify);
-    }
+    bool optimized = pipeline.PassThroughAllStages(node, &simplified_tensor);
 
-    // if it was not simplified try to run it through all configured stages
-    if (!stop(simplified_tensor)) {
-      bool optimized = pipeline.PassThroughAllStages(node, &simplified_tensor);
-      if (!optimized) {
-        continue;
-      }
-    }
+    // If the node was not optimized by any of the stages, go to the next one.
+    if (!optimized) continue;
 
     // re-wire consumers of an old node to the new one
     if (NodeName(simplified_tensor) != node->name()) {
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
index 549ea3fde5..f37458eba4 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
@@ -54,14 +54,12 @@ class ArithmeticOptimizer : public GraphOptimizer {
 
   // Granular control for arithmetic optimizer stages
   struct ArithmeticOptimizerOptions {
-    // TODO(ezhulenev): flag do disable TrySimplifyAndReplaceUses in tests.
-    // Remove when all optimizers will be migrated to separate stages.
-    bool enable_try_simplify_and_replace = true;
-
     bool combine_add_to_addn = true;
     bool convert_sqrt_div_to_rsqrt_mul = true;
     bool dedup_computations = true;
+    bool fold_conjugate_into_transpose = true;
     bool fold_multiply_into_conv = true;
+    bool fold_transpose_into_matmul = true;
     bool hoist_common_factor_out_of_aggregation = true;
     bool hoist_cwise_unary_chains = false;
     bool minimize_broadcasts = true;
@@ -86,21 +84,6 @@ class ArithmeticOptimizer : public GraphOptimizer {
     }
   };
 
-  // Returns true is a node with given name and the optimizer prefix already
-  // exists.
-  string OptimizedNodeName(const NodeDef& node, StringPiece suffix) const;
-  bool OptimizedNodeExists(const NodeDef& node, StringPiece suffix) const;
-
-  // Creates a new node in the graph, with name equal to that of node, prefixed
-  // with "ArithmeticOptimizer/" and the given suffix. Also updates node_map_,
-  // and optionally copies node into the new node if copy_node is true.
-  NodeDef* AddNode(const NodeDef& node, StringPiece suffix, bool copy_node);
-
-  // Creates a new node in the graph, prefixed with "ArithmeticOptimizer/",
-  // updates node_map_, and optionally copies *node_to_copy into the new
-  // node, if node_to_copy is not nullptr.
-  NodeDef* AddNode(const string& name, const NodeDef* node_to_copy);
-
   // Returns true if it is safe to dedup node from the graph.
   bool CanDedup(const NodeDef& node) const;
 
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
index f79347cde6..8083b6051f 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
@@ -139,10 +139,11 @@ class ArithmeticOptimizerTest : public GrapplerTest {
   void DisableAllStages(ArithmeticOptimizer* optimizer) {
     ArithmeticOptimizer::ArithmeticOptimizerOptions options;
     options.dedup_computations = false;
-    options.enable_try_simplify_and_replace = false;
     options.combine_add_to_addn = false;
     options.convert_sqrt_div_to_rsqrt_mul = false;
+    options.fold_conjugate_into_transpose = false;
     options.fold_multiply_into_conv = false;
+    options.fold_transpose_into_matmul = false;
     options.hoist_common_factor_out_of_aggregation = false;
     options.hoist_cwise_unary_chains = false;
     options.minimize_broadcasts = false;
@@ -169,11 +170,21 @@ class ArithmeticOptimizerTest : public GrapplerTest {
     optimizer->options_.combine_add_to_addn = true;
   }
 
+  void EnableOnlyFoldConjugateIntoTranspose(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.fold_conjugate_into_transpose = true;
+  }
+
   void EnableOnlyFoldMultipleIntoConv(ArithmeticOptimizer* optimizer) {
     DisableAllStages(optimizer);
     optimizer->options_.fold_multiply_into_conv = true;
   }
 
+  void EnableOnlyFoldTransposeIntoMatMul(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.fold_transpose_into_matmul = true;
+  }
+
   void EnableOnlyHoistCommonFactor(ArithmeticOptimizer* optimizer) {
     DisableAllStages(optimizer);
     optimizer->options_.hoist_common_factor_out_of_aggregation = true;
@@ -845,11 +856,14 @@ TEST_F(ArithmeticOptimizerTest, FuseConjAndTranspose) {
   Output perm = ops::Const(s.WithOpName("perm"), {1, 0}, {2});
   Output conj = ops::Conj(s.WithOpName("conj"), z);
   Output transp = ops::Transpose(s.WithOpName("trans"), conj, perm);
+
   GrapplerItem item;
+  item.fetch = {"trans"};
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
-  std::vector<string> fetch = {"trans"};
-  auto tensors_expected = EvaluateNodes(item.graph, fetch);
+
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
   EXPECT_EQ(1, tensors_expected.size());
+
   ArithmeticOptimizer optimizer;
   GraphDef output;
   OptimizeTwice(&optimizer, &item, &output);
@@ -857,20 +871,23 @@ TEST_F(ArithmeticOptimizerTest, FuseConjAndTranspose) {
 
   EXPECT_EQ(7, output.node_size());
 
-  const NodeDef* trans_fused_node =
-      node_map.GetNode(OptimizedName("trans_fused"));
+  const string p = "ArithmeticOptimizer/FoldConjugateIntoTranspose";
+  const string optimized_name = strings::StrCat(p, "_", "trans");
+
+  const NodeDef* trans_fused_node = node_map.GetNode(optimized_name);
   ASSERT_NE(trans_fused_node, nullptr);
   EXPECT_EQ("ConjugateTranspose", trans_fused_node->op());
   EXPECT_EQ("z", trans_fused_node->input(0));
   EXPECT_EQ("perm", trans_fused_node->input(1));
 
-  auto tensors = EvaluateNodes(output, fetch);
+  auto tensors = EvaluateNodes(output, item.fetch);
   EXPECT_EQ(1, tensors.size());
   test::ExpectTensorEqual<complex64>(tensors_expected[0], tensors[0]);
 }
 
 TEST_F(ArithmeticOptimizerTest, FuseConjAndConjugateTranspose) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+
   Output re = ops::Const(s.WithOpName("re"), {1.0f, 2.0f, 3.0f, 4.0f}, {2, 2});
   Output im = ops::Const(s.WithOpName("im"), {5.0f, 6.0f, 7.0f, 8.0f}, {2, 2});
   Output z = ops::Complex(s.WithOpName("z"), re, im);
@@ -878,10 +895,12 @@ TEST_F(ArithmeticOptimizerTest, FuseConjAndConjugateTranspose) {
   Output conj = ops::Conj(s.WithOpName("conj"), z);
   Output transp =
       ops::ConjugateTranspose(s.WithOpName("conjugate_trans"), conj, perm);
+
   GrapplerItem item;
+  item.fetch = {"conjugate_trans"};
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
-  std::vector<string> fetch = {"conjugate_trans"};
-  auto tensors_expected = EvaluateNodes(item.graph, fetch);
+
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
   EXPECT_EQ(1, tensors_expected.size());
 
   ArithmeticOptimizer optimizer;
@@ -891,12 +910,16 @@ TEST_F(ArithmeticOptimizerTest, FuseConjAndConjugateTranspose) {
 
   EXPECT_EQ(7, output.node_size());
 
-  const NodeDef* conjugate_trans_fused_node =
-      node_map.GetNode(OptimizedName("conjugate_trans_fused"));
+  const string p = "ArithmeticOptimizer/FoldConjugateIntoTranspose";
+  const string optimized_name = strings::StrCat(p, "_", "conjugate_trans");
+
+  const NodeDef* conjugate_trans_fused_node = node_map.GetNode(optimized_name);
+  ASSERT_NE(conjugate_trans_fused_node, nullptr);
   EXPECT_EQ("Transpose", conjugate_trans_fused_node->op());
   EXPECT_EQ("z", conjugate_trans_fused_node->input(0));
   EXPECT_EQ("perm", conjugate_trans_fused_node->input(1));
-  auto tensors = EvaluateNodes(output, fetch);
+
+  auto tensors = EvaluateNodes(output, item.fetch);
   EXPECT_EQ(1, tensors.size());
   test::ExpectTensorEqual<complex64>(tensors_expected[0], tensors[0]);
 }
@@ -909,10 +932,12 @@ TEST_F(ArithmeticOptimizerTest, FuseTransposeAndConj) {
   Output perm = ops::Const(s.WithOpName("perm"), {1, 0}, {2});
   Output trans = ops::Transpose(s.WithOpName("trans"), z, perm);
   Output conj = ops::Conj(s.WithOpName("conj"), trans);
+
   GrapplerItem item;
+  item.fetch = {"conj"};
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
-  std::vector<string> fetch = {"conj"};
-  auto tensors_expected = EvaluateNodes(item.graph, fetch);
+
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
   EXPECT_EQ(1, tensors_expected.size());
 
   ArithmeticOptimizer optimizer;
@@ -922,12 +947,16 @@ TEST_F(ArithmeticOptimizerTest, FuseTransposeAndConj) {
 
   EXPECT_EQ(7, output.node_size());
 
-  const NodeDef* conj_fused_node =
-      node_map.GetNode(OptimizedName("conj_fused"));
+  const string p = "ArithmeticOptimizer/FoldConjugateIntoTranspose";
+  const string optimized_name = strings::StrCat(p, "_", "conj");
+
+  const NodeDef* conj_fused_node = node_map.GetNode(optimized_name);
+  ASSERT_NE(conj_fused_node, nullptr);
   EXPECT_EQ("ConjugateTranspose", conj_fused_node->op());
   EXPECT_EQ("z", conj_fused_node->input(0));
   EXPECT_EQ("perm", conj_fused_node->input(1));
-  auto tensors = EvaluateNodes(output, fetch);
+
+  auto tensors = EvaluateNodes(output, item.fetch);
   EXPECT_EQ(1, tensors.size());
   test::ExpectTensorEqual<complex64>(tensors_expected[0], tensors[0]);
 }
@@ -935,38 +964,45 @@ TEST_F(ArithmeticOptimizerTest, FuseTransposeAndConj) {
 TEST_F(ArithmeticOptimizerTest, FoldTransposeIntoMatMul) {
   for (const string matmul_type : {"MatMul", "SparseMatMul", "BatchMatMul"}) {
     tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+
     Output a = ops::Const(s.WithOpName("a"), {1.0f, 2.0f, 3.0f, 4.0f}, {2, 2});
     Output b = ops::Const(s.WithOpName("b"), {5.0f, 6.0f, 7.0f, 8.0f}, {2, 2});
     Output perm = ops::Const(s.WithOpName("perm"), {1, 0}, {2});
     Output trans_a = ops::Transpose(s.WithOpName("trans_a"), a, perm);
     Output trans_b = ops::Transpose(s.WithOpName("trans_b"), b, perm);
+
+    auto matmul_op = s.WithOpName("matmul");
     if (matmul_type == "MatMul") {
-      Output matmul = ops::MatMul(s.WithOpName("matmul"), trans_a, trans_b);
+      Output matmul = ops::MatMul(matmul_op, trans_a, trans_b);
     } else if (matmul_type == "SparseMatMul") {
-      Output matmul =
-          ops::SparseMatMul(s.WithOpName("matmul"), trans_a, trans_b);
+      Output matmul = ops::SparseMatMul(matmul_op, trans_a, trans_b);
     } else if (matmul_type == "BatchMatMul") {
-      Output matmul =
-          ops::BatchMatMul(s.WithOpName("matmul"), trans_a, trans_b);
+      Output matmul = ops::BatchMatMul(matmul_op, trans_a, trans_b);
     }
+
     GrapplerItem item;
+    item.fetch = {"matmul"};
     TF_CHECK_OK(s.ToGraphDef(&item.graph));
-    std::vector<string> fetch = {"matmul"};
-    auto tensors_expected = EvaluateNodes(item.graph, fetch);
+
+    auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
     EXPECT_EQ(1, tensors_expected.size());
 
     ArithmeticOptimizer optimizer;
+    EnableOnlyFoldTransposeIntoMatMul(&optimizer);
     GraphDef output;
     OptimizeTwice(&optimizer, &item, &output);
     NodeMap node_map(&output);
 
     EXPECT_EQ(7, output.node_size());
 
-    const NodeDef* matmul_fused_node =
-        node_map.GetNode(OptimizedName("matmul_fused"));
+    const string p = "ArithmeticOptimizer/FoldTransposeIntoMatMul";
+    const string optimized_name = strings::StrCat(p, "_", "matmul");
+
+    const NodeDef* matmul_fused_node = node_map.GetNode(optimized_name);
     ASSERT_NE(matmul_fused_node, nullptr);
     EXPECT_EQ("a", matmul_fused_node->input(0));
     EXPECT_EQ("b", matmul_fused_node->input(1));
+
     if (matmul_type == "BatchMatMul") {
       EXPECT_TRUE(matmul_fused_node->attr().at("adj_x").b());
       EXPECT_TRUE(matmul_fused_node->attr().at("adj_y").b());
@@ -974,7 +1010,8 @@ TEST_F(ArithmeticOptimizerTest, FoldTransposeIntoMatMul) {
       EXPECT_TRUE(matmul_fused_node->attr().at("transpose_a").b());
       EXPECT_TRUE(matmul_fused_node->attr().at("transpose_b").b());
     }
-    auto tensors = EvaluateNodes(output, fetch);
+
+    auto tensors = EvaluateNodes(output, item.fetch);
     EXPECT_EQ(1, tensors.size());
     test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
   }
@@ -982,6 +1019,7 @@ TEST_F(ArithmeticOptimizerTest, FoldTransposeIntoMatMul) {
 
 TEST_F(ArithmeticOptimizerTest, FoldConjugateTransposeIntoBatchMatMul) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+
   Output re_a =
       ops::Const(s.WithOpName("re_a"), {1.0f, 2.0f, 3.0f, 4.0f}, {2, 2});
   Output im_a =
@@ -996,24 +1034,32 @@ TEST_F(ArithmeticOptimizerTest, FoldConjugateTransposeIntoBatchMatMul) {
   Output trans_a = ops::ConjugateTranspose(s.WithOpName("trans_a"), a, perm);
   Output trans_b = ops::ConjugateTranspose(s.WithOpName("trans_b"), b, perm);
   Output matmul = ops::BatchMatMul(s.WithOpName("matmul"), trans_a, trans_b);
+
   GrapplerItem item;
+  item.fetch = {"matmul"};
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
-  std::vector<string> fetch = {"matmul"};
-  auto tensors_expected = EvaluateNodes(item.graph, fetch);
+
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
   EXPECT_EQ(1, tensors_expected.size());
 
   ArithmeticOptimizer optimizer;
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
+  OptimizeTwice(&optimizer, &item, &output);
 
-  EXPECT_EQ(11, output.node_size());
-  EXPECT_EQ(OptimizedName("matmul_fused"), output.node(10).name());
-  EXPECT_EQ("a", output.node(10).input(0));
-  EXPECT_EQ("b", output.node(10).input(1));
-  EXPECT_TRUE(output.node(10).attr().at("adj_x").b());
-  EXPECT_TRUE(output.node(10).attr().at("adj_y").b());
-  auto tensors = EvaluateNodes(output, fetch);
+  NodeMap node_map(&output);
+  ASSERT_EQ(11, output.node_size());
+
+  const string p = "ArithmeticOptimizer/FoldTransposeIntoMatMul";
+  const string optimized_name = strings::StrCat(p, "_", "matmul");
+
+  const NodeDef* optimized_matmul = node_map.GetNode(optimized_name);
+  ASSERT_NE(optimized_matmul, nullptr);
+  EXPECT_EQ("a", optimized_matmul->input(0));
+  EXPECT_EQ("b", optimized_matmul->input(1));
+  EXPECT_TRUE(optimized_matmul->attr().at("adj_x").b());
+  EXPECT_TRUE(optimized_matmul->attr().at("adj_y").b());
+
+  auto tensors = EvaluateNodes(output, item.fetch);
   EXPECT_EQ(1, tensors.size());
   test::ExpectTensorNear<complex64>(tensors_expected[0], tensors[0], 1e-6);
 }
-- 
GitLab


From e54546349e1ec58c985e508bf5442cde24c11da0 Mon Sep 17 00:00:00 2001
From: Yifei Feng <yifeif@google.com>
Date: Tue, 5 Jun 2018 15:53:31 -0700
Subject: [PATCH 343/610] internal change

PiperOrigin-RevId: 199372205
---
 tensorflow/python/BUILD | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index c2f7794c3b..86721cb856 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -3524,6 +3524,7 @@ tf_py_wrap_cc(
         "util/transform_graph.i",
         "util/util.i",
     ],
+    # add win_def_file
     win_def_file = select({
         "//tensorflow:windows": ":pywrap_tensorflow_filtered_def_file",
         "//conditions:default": None,
-- 
GitLab


From 73c3a8a5217a6b105acffe62165071f8aa740e9b Mon Sep 17 00:00:00 2001
From: Skye Wanderman-Milne <skyewm@google.com>
Date: Tue, 5 Jun 2018 15:59:04 -0700
Subject: [PATCH 344/610] Disable flaky test for now.

PiperOrigin-RevId: 199373124
---
 tensorflow/contrib/control_flow/python/cond_v2_test.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/contrib/control_flow/python/cond_v2_test.py b/tensorflow/contrib/control_flow/python/cond_v2_test.py
index c94f3a6584..166002ca7f 100644
--- a/tensorflow/contrib/control_flow/python/cond_v2_test.py
+++ b/tensorflow/contrib/control_flow/python/cond_v2_test.py
@@ -80,6 +80,7 @@ class NewCondTest(test.TestCase):
     self._testCond(true_fn, false_fn, [y])
 
   def testSecondDerivative(self):
+    self.skipTest("b/109758172")
     pred = array_ops.placeholder(dtypes.bool, name="pred")
     x = constant_op.constant(3.0, name="x")
 
-- 
GitLab


From ff5838b402ae0013d0dffd87c214eb61d4a750e4 Mon Sep 17 00:00:00 2001
From: KinmanLam <kinman.lam@gmail.com>
Date: Tue, 5 Jun 2018 16:01:53 -0700
Subject: [PATCH 345/610] Update install_linux.md (#19767)

* Update install_linux.md

This is not necessary and adds confusion to the instructions outlined in the Docker section. The host machine only requires Nvidia drivers, not the CUDA toolkit or cuDNN.

The containers require CUDA toolkit and cuDNN to be present. See:
https://github.com/nvidia/nvidia-docker/wiki/CUDA#requirements

* Add link to the NVidia Docker requirements page.
---
 tensorflow/docs_src/install/install_linux.md | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/tensorflow/docs_src/install/install_linux.md b/tensorflow/docs_src/install/install_linux.md
index 3b9381625f..7b56b6a508 100644
--- a/tensorflow/docs_src/install/install_linux.md
+++ b/tensorflow/docs_src/install/install_linux.md
@@ -339,9 +339,7 @@ Docker will download the TensorFlow binary image the first time you launch it.
 
 #### GPU support
 
-Prior to installing TensorFlow with GPU support, ensure that your system meets all
-[NVIDIA software requirements](#NVIDIARequirements).  To launch a Docker container
-with NVidia GPU support, enter a command of the following format:
+To launch a Docker container with NVidia GPU support, enter a command of the following format (this [does not require any local CUDA installation](https://github.com/nvidia/nvidia-docker/wiki/CUDA#requirements)):
 
 <pre>
 $ <b>nvidia-docker run -it</b> <i>-p hostPort:containerPort TensorFlowGPUImage</i>
-- 
GitLab


From 3daa07aa2dde379388beb2a557a78bc5dd1b86ba Mon Sep 17 00:00:00 2001
From: Courtial Florian <floriancourtial@gmail.com>
Date: Wed, 6 Jun 2018 01:02:07 +0200
Subject: [PATCH 346/610] Add C++ no gradient for Floor operation. (#19662)

---
 tensorflow/cc/gradients/math_grad.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/cc/gradients/math_grad.cc b/tensorflow/cc/gradients/math_grad.cc
index 52c177212a..35a01e0341 100644
--- a/tensorflow/cc/gradients/math_grad.cc
+++ b/tensorflow/cc/gradients/math_grad.cc
@@ -38,6 +38,7 @@ REGISTER_NO_GRADIENT_OP("NotEqual");
 REGISTER_NO_GRADIENT_OP("LogicalAnd");
 REGISTER_NO_GRADIENT_OP("LogicalOr");
 REGISTER_NO_GRADIENT_OP("LogicalNot");
+REGISTER_NO_GRADIENT_OP("Floor");
 
 // Conjugate helper function returns the conjugate of an Output if it
 // is complex valued.
-- 
GitLab


From ece5f512538f66b69db52b8a5b6f9669ae10a3d9 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 5 Jun 2018 15:59:21 -0700
Subject: [PATCH 347/610] Only calls compare function if values were read from
 event file

PiperOrigin-RevId: 199373169
---
 tensorflow/python/estimator/exporter.py      |  7 ++--
 tensorflow/python/estimator/exporter_test.py | 34 ++++++++++++++++++++
 2 files changed, 38 insertions(+), 3 deletions(-)

diff --git a/tensorflow/python/estimator/exporter.py b/tensorflow/python/estimator/exporter.py
index f49ed05f57..5981fa59b7 100644
--- a/tensorflow/python/estimator/exporter.py
+++ b/tensorflow/python/estimator/exporter.py
@@ -360,9 +360,10 @@ class BestExporter(Exporter):
           for value in event.summary.value:
             if value.HasField('simple_value'):
               event_eval_result[value.tag] = value.simple_value
-          if best_eval_result is None or self._compare_fn(
-              best_eval_result, event_eval_result):
-            best_eval_result = event_eval_result
+          if event_eval_result:
+            if best_eval_result is None or self._compare_fn(
+                best_eval_result, event_eval_result):
+              best_eval_result = event_eval_result
     return best_eval_result
 
 
diff --git a/tensorflow/python/estimator/exporter_test.py b/tensorflow/python/estimator/exporter_test.py
index 4cb4bffc8d..c4b006955c 100644
--- a/tensorflow/python/estimator/exporter_test.py
+++ b/tensorflow/python/estimator/exporter_test.py
@@ -148,6 +148,40 @@ class BestExporterTest(test.TestCase):
                                     "checkpoint_path", {"loss": 20}, False)
     self.assertEqual(None, export_result)
 
+  def test_best_exporter_with_empty_event(self):
+
+    def _serving_input_receiver_fn():
+      pass
+
+    export_dir_base = tempfile.mkdtemp()
+    gfile.MkDir(export_dir_base)
+    gfile.MkDir(export_dir_base + "/export")
+    gfile.MkDir(export_dir_base + "/eval")
+
+    eval_dir_base = os.path.join(export_dir_base, "eval_continuous")
+    estimator_lib._write_dict_to_summary(eval_dir_base, {}, 1)
+    estimator_lib._write_dict_to_summary(eval_dir_base, {"loss": 60}, 2)
+
+    exporter = exporter_lib.BestExporter(
+        name="best_exporter",
+        serving_input_receiver_fn=_serving_input_receiver_fn,
+        event_file_pattern="eval_continuous/*.tfevents.*",
+        assets_extra={"from/path": "to/path"},
+        as_text=False,
+        exports_to_keep=1)
+
+    estimator = test.mock.Mock(spec=estimator_lib.Estimator)
+    estimator.model_dir = export_dir_base
+    estimator.export_savedmodel.return_value = "export_result_path"
+
+    export_result = exporter.export(estimator, export_dir_base,
+                                    "checkpoint_path", {"loss": 100}, False)
+    self.assertEqual(None, export_result)
+
+    export_result = exporter.export(estimator, export_dir_base,
+                                    "checkpoint_path", {"loss": 10}, False)
+    self.assertEqual("export_result_path", export_result)
+
   def test_garbage_collect_exports(self):
     export_dir_base = tempfile.mkdtemp()
     gfile.MkDir(export_dir_base)
-- 
GitLab


From 677c83e6ba6fdc4d23f8c26bfc84209be4371631 Mon Sep 17 00:00:00 2001
From: Nupur Garg <nupurgarg@google.com>
Date: Tue, 5 Jun 2018 16:15:55 -0700
Subject: [PATCH 348/610] Updates Python TOCO command line and TOCO
 documentation.

PiperOrigin-RevId: 199375811
---
 .../contrib/lite/python/tflite_convert.py     | 11 ++---
 .../lite/toco/g3doc/cmdline_examples.md       | 45 -------------------
 .../lite/toco/g3doc/cmdline_reference.md      | 10 -----
 3 files changed, 6 insertions(+), 60 deletions(-)

diff --git a/tensorflow/contrib/lite/python/tflite_convert.py b/tensorflow/contrib/lite/python/tflite_convert.py
index d0879daed2..6d77626a4b 100644
--- a/tensorflow/contrib/lite/python/tflite_convert.py
+++ b/tensorflow/contrib/lite/python/tflite_convert.py
@@ -161,7 +161,8 @@ def _check_flags(flags, unparsed):
       output += _get_message_unparsed(flag, "--input_file", "--graph_def_file")
       output += _get_message_unparsed(flag, "--std_value", "--std_dev_values")
       output += _get_message_unparsed(flag, "--batch_size", "--input_shapes")
-    raise ValueError(output)
+    if output:
+      raise ValueError(output)
 
   # Check that flags are valid.
   if flags.graph_def_file and (not flags.input_arrays or
@@ -285,13 +286,13 @@ def run_main(_):
   # Graph manipulation flags.
   parser.add_argument(
       "--drop_control_dependency",
-      type=bool,
+      action="store_true",
       help=("Boolean indicating whether to drop control dependencies silently. "
             "This is due to TensorFlow not supporting control dependencies. "
             "(default True)"))
   parser.add_argument(
       "--reorder_across_fake_quant",
-      type=bool,
+      action="store_true",
       help=("Boolean indicating whether to reorder FakeQuant nodes in "
             "unexpected locations. Used when the location of the FakeQuant "
             "nodes is preventing graph transformations necessary to convert "
@@ -300,13 +301,13 @@ def run_main(_):
             "behavior. (default False)"))
   parser.add_argument(
       "--change_concat_input_ranges",
-      type=bool,
+      action="store_true",
       help=("Boolean to change behavior of min/max ranges for inputs and "
             "outputs of the concat operator for quantized models. Changes the "
             "ranges of concat operator overlap when true. (default False)"))
   parser.add_argument(
       "--allow_custom_ops",
-      type=bool,
+      action="store_true",
       help=("Boolean indicating whether to allow custom operations. When false "
             "any unknown operation is an error. When true, custom ops are "
             "created for any op that is unknown. The developer will need to "
diff --git a/tensorflow/contrib/lite/toco/g3doc/cmdline_examples.md b/tensorflow/contrib/lite/toco/g3doc/cmdline_examples.md
index 7680cdd344..8e93f02ef1 100644
--- a/tensorflow/contrib/lite/toco/g3doc/cmdline_examples.md
+++ b/tensorflow/contrib/lite/toco/g3doc/cmdline_examples.md
@@ -26,8 +26,6 @@ Table of contents:
     *   [Convert a TensorFlow Lite FlatBuffer back into TensorFlow GraphDef
         format](#to-graphdef)
 *   [Logging](#logging)
-    *   [Standard logging](#standard-logging)
-    *   [Verbose logging](#verbose-logging)
     *   [Graph "video" logging](#graph-video-logging)
 *   [Graph visualizations](#graph-visualizations)
     *   [Using --output_format=GRAPHVIZ_DOT](#using-output-formatgraphviz-dot)
@@ -277,49 +275,6 @@ bazel run --config=opt \
 
 ## Logging
 
-### Standard logging
-
-The converter generates some informative log messages during processing. The
-easiest way to view them is to add `--logtostderr` to command lines as seen in
-the following example.
-
-```
-curl https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_0.50_128_frozen.tgz \
-  | tar xzv -C /tmp
-bazel run --config=opt \
-  //tensorflow/contrib/lite/toco:toco -- \
-  --input_file=/tmp/mobilenet_v1_0.50_128/frozen_graph.pb \
-  --output_file=/tmp/foo.tflite \
-  --input_format=TENSORFLOW_GRAPHDEF \
-  --output_format=TFLITE \
-  --inference_type=FLOAT \
-  --input_shape=1,128,128,3 \
-  --input_array=input \
-  --output_array=MobilenetV1/Predictions/Reshape_1 \
-  --logtostderr
-```
-
-After some initialization messages, we get the following informative messages:
-
-```
-I1101 21:51:33.297475    5339 graph_transformations.cc:39] Before general graph transformations: 416 operators, 583 arrays (0 quantized)
-I1101 21:51:33.308972    5339 graph_transformations.cc:39] After general graph transformations pass 1: 31 operators, 89 arrays (0 quantized)
-I1101 21:51:33.309204    5339 graph_transformations.cc:39] Before dequantization graph transformations: 31 operators, 89 arrays (0 quantized)
-I1101 21:51:33.309368    5339 allocate_transient_arrays.cc:312] Total transient array allocated size: 1048576 bytes, theoretical optimal value: 786432 bytes.
-I1101 21:51:33.309484    5339 toco_tooling.cc:249] Estimated count of arithmetic ops: 0.099218 billion (note that a multiply-add is counted as 2 ops).
-```
-
-### Verbose logging
-
-For debugging purposes, the converter supports two levels of verbose logging,
-which can be set by passing a `--v=` flag:
-
-*   For `--v=1`, the converter generates text dumps of the graph at various
-    points during processing as well as log messages about every graph
-    transformation that took place.
-*   For `--v=2`, the converter additionally generates log messages about graph
-    transformations that were considered but not performed.
-
 ### Graph "video" logging
 
 When `--dump_graphviz=` is used (see the section on [graph
diff --git a/tensorflow/contrib/lite/toco/g3doc/cmdline_reference.md b/tensorflow/contrib/lite/toco/g3doc/cmdline_reference.md
index a8381169b8..8085ae0748 100644
--- a/tensorflow/contrib/lite/toco/g3doc/cmdline_reference.md
+++ b/tensorflow/contrib/lite/toco/g3doc/cmdline_reference.md
@@ -209,16 +209,6 @@ have.
 
 ## Logging flags
 
-The following are standard Google logging flags:
-
-*   `--logtostderr` redirects Google logging to standard error, typically making
-    it visible in a terminal.
-*   `--v` sets verbose logging levels (for debugging purposes). Defined levels:
-    *   `--v=1`: log all graph transformations that did make a change on the
-        graph.
-    *   `--v=2`: log all graph transformations that did *not* make a change on
-        the graph.
-
 The following flags allow to generate graph visualizations of the actual graph
 at various points during transformations:
 
-- 
GitLab


From 135a25971bfbac86b0aed2cf0433608966015c22 Mon Sep 17 00:00:00 2001
From: Saurabh Saxena <srbs@google.com>
Date: Tue, 5 Jun 2018 16:22:14 -0700
Subject: [PATCH 349/610] Support uint8, int32 and int64 for SpaceToDepth in
 TOCO.

PiperOrigin-RevId: 199376731
---
 .../contrib/lite/testing/generate_examples.py       | 13 ++++++-------
 tensorflow/contrib/lite/toco/import_tensorflow.cc   |  9 ++++++++-
 2 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/tensorflow/contrib/lite/testing/generate_examples.py b/tensorflow/contrib/lite/testing/generate_examples.py
index 9bb7a4600d..351187f520 100644
--- a/tensorflow/contrib/lite/testing/generate_examples.py
+++ b/tensorflow/contrib/lite/testing/generate_examples.py
@@ -58,10 +58,11 @@ from tensorflow.python.ops import rnn
 parser = argparse.ArgumentParser(description="Script to generate TFLite tests.")
 parser.add_argument("output_path",
                     help="Directory where the outputs will be go.")
-parser.add_argument("--zip_to_output",
-                    type=str,
-                    help="Particular zip to output.",
-                    required=False)
+parser.add_argument(
+    "--zip_to_output",
+    type=str,
+    help="Particular zip to output.",
+    required=True)
 parser.add_argument("--toco",
                     type=str,
                     help="Path to toco tool.",
@@ -97,8 +98,6 @@ KNOWN_BUGS = {
     r"fully_connected.*transpose_.=True": "67586970",
     # Softmax graphs are too complex.
     r"softmax.*dim=0": "67749831",
-    # SpaceToDepth only supports float32.
-    r"space_to_depth.*(float16|int32|uint8|int64)": "68018134",
     # BatchToSpaceND only supports 4D tensors.
     r"batch_to_space_nd.*input_shape=\[8,2,2,2,1,1\]": "70594733",
     # Div will use floordiv.
@@ -1621,7 +1620,7 @@ def make_space_to_depth_tests(zip_path):
   """Make a set of tests to do space_to_depth."""
 
   test_parameters = [{
-      "dtype": [tf.float32, tf.float16, tf.int32, tf.uint8, tf.int64],
+      "dtype": [tf.float32, tf.int32, tf.uint8, tf.int64],
       "input_shape": [[2, 12, 24, 1]],
       "block_size": [2, 3, 4],
   }]
diff --git a/tensorflow/contrib/lite/toco/import_tensorflow.cc b/tensorflow/contrib/lite/toco/import_tensorflow.cc
index 0a57015d29..b9ebf66ff2 100644
--- a/tensorflow/contrib/lite/toco/import_tensorflow.cc
+++ b/tensorflow/contrib/lite/toco/import_tensorflow.cc
@@ -614,7 +614,14 @@ void ConvertSpaceToDepthOperator(const NodeDef& node,
   CHECK_EQ(node.op(), "SpaceToDepth");
   CheckInputsCount(node, tf_import_flags, 1);
 
-  CHECK_EQ(GetDataTypeAttr(node, "T"), DT_FLOAT);
+  tensorflow::DataType dtype = GetDataTypeAttr(node, "T");
+  if (dtype != DT_FLOAT && dtype != DT_UINT8 && dtype != DT_INT32 &&
+      dtype != DT_INT64) {
+    const auto* enum_descriptor = tensorflow::DataType_descriptor();
+    LOG(FATAL) << "TFLite does not support SpaceToDepth with type T:"
+               << enum_descriptor->FindValueByNumber(dtype)->name() << ". "
+               << "T must be one of {DT_FLOAT, DT_UINT8, DT_INT32, DT_INT64}.";
+  }
   auto* op = new SpaceToDepthOperator;
   op->inputs.push_back(node.input(0));
   op->outputs.push_back(node.name());
-- 
GitLab


From a57f0de68685fb537eb390fa87f04dbafecb28ef Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 5 Jun 2018 16:23:23 -0700
Subject: [PATCH 350/610] [XLA] Make CrossReplicaSum support general cross
 replica reduce. Also change the interface to be able to describe the common
 AllReduce semantic.

PiperOrigin-RevId: 199376926
---
 .../xla/client/xla_client/xla_builder.cc      | 24 ++++++++++++++++-
 .../xla/client/xla_client/xla_builder.h       | 23 ++++++++++++++++
 .../bfloat16_conversion_folding_test.cc       | 15 +++++++++--
 .../service/bfloat16_normalization_test.cc    | 15 +++++++++--
 .../compiler/xla/service/buffer_assignment.cc |  1 +
 tensorflow/compiler/xla/service/call_graph.cc |  1 +
 .../xla/service/hlo_element_type_converter.cc |  1 +
 .../compiler/xla/service/hlo_instruction.cc   | 24 +++++++++++++----
 .../compiler/xla/service/hlo_instruction.h    | 22 ++++++++++++---
 tensorflow/compiler/xla/service/hlo_parser.cc |  5 +++-
 .../compiler/xla/service/hlo_parser_test.cc   | 18 +++++++++++++
 .../xla/tests/cross_replica_sum_test.cc       | 27 ++++++++++++++++---
 12 files changed, 159 insertions(+), 17 deletions(-)

diff --git a/tensorflow/compiler/xla/client/xla_client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
index ae506317c2..5e17cc4dfb 100644
--- a/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
+++ b/tensorflow/compiler/xla/client/xla_client/xla_builder.cc
@@ -1613,13 +1613,35 @@ XlaOp XlaBuilder::BatchNormGrad(const XlaOp& operand, const XlaOp& scale,
 
 XlaOp XlaBuilder::CrossReplicaSum(const XlaOp& operand) {
   return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
-    HloInstructionProto instr;
+    TF_ASSIGN_OR_RETURN(const Shape& shape, GetShape(operand));
+    const Shape& scalar_shape = ShapeUtil::MakeShape(shape.element_type(), {});
+    auto b = CreateSubBuilder("sum");
+    b->Add(b->Parameter(/*parameter_number=*/0, scalar_shape, "x"),
+           b->Parameter(/*parameter_number=*/1, scalar_shape, "y"));
+    TF_ASSIGN_OR_RETURN(auto computation, b->Build());
+    return CrossReplicaSum(operand, computation, /*replica_group_ids=*/{},
+                           /*channel_id=*/tensorflow::gtl::nullopt);
+  });
+}
+
+XlaOp XlaBuilder::CrossReplicaSum(
+    const XlaOp& operand, const XlaComputation& computation,
+    tensorflow::gtl::ArraySlice<int64> replica_group_ids,
+    const tensorflow::gtl::optional<ChannelHandle>& channel_id) {
+  return NoteErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    if (!replica_group_ids.empty() || channel_id.has_value()) {
+      return Unimplemented(
+          "replica_group_ids and channel_id are not supported in AllReduce");
+    }
 
+    HloInstructionProto instr;
     TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
     TF_ASSIGN_OR_RETURN(
         *instr.mutable_shape(),
         ShapeInference::InferCrossReplicaSumShape({&operand_shape}));
 
+    AddCalledComputation(computation, &instr);
+
     return AddInstruction(std::move(instr), HloOpcode::kCrossReplicaSum,
                           {operand});
   });
diff --git a/tensorflow/compiler/xla/client/xla_client/xla_builder.h b/tensorflow/compiler/xla/client/xla_client/xla_builder.h
index 2b3013a91c..532cae0148 100644
--- a/tensorflow/compiler/xla/client/xla_client/xla_builder.h
+++ b/tensorflow/compiler/xla/client/xla_client/xla_builder.h
@@ -532,6 +532,29 @@ class XlaBuilder {
   // supply one input to the sum and all replicas receive the resulting sum.
   XlaOp CrossReplicaSum(const XlaOp& operand);
 
+  // Enqueues an operation that does an AllReduce of the operand across cores.
+  // Here AllReduce means doing a reduction on the input operand across cores
+  // and then broadcasting the reduction result to those cores. The reduction
+  // function is defined by `computation`, which should be a commutative
+  // computation on scalars, e.g., add, min, or max. The way that AllReduce is
+  // applied is configured by:
+  //
+  // - `replica_group_ids`: maps replica ids to subgroup ids. If empty, all
+  // replicas belong to one group. Allreduce will be applied within subgroups.
+  // For example, we have 4 replicas, then replica_group_ids={0,1,0,1} means,
+  // replica 0 and 2 are in subgroup 0, replica 1 and 3 are in subgroup 1.
+  //
+  // - `channel_id`: for Allreduce nodes from different models, if they have the
+  // same channel_id, they will be 'Allreduce'd. If empty, Allreduce will not be
+  // applied across models.
+  //
+  // TODO(b/79737069): Rename this to AllReduce when it's ready to use.
+  XlaOp CrossReplicaSum(
+      const XlaOp& operand, const XlaComputation& computation,
+      tensorflow::gtl::ArraySlice<int64> replica_group_ids = {},
+      const tensorflow::gtl::optional<ChannelHandle>& channel_id =
+          tensorflow::gtl::nullopt);
+
   // Enqueues an operation that scatters the `source` array to the selected
   // indices of each window.
   XlaOp SelectAndScatter(const XlaOp& operand, const XlaComputation& select,
diff --git a/tensorflow/compiler/xla/service/bfloat16_conversion_folding_test.cc b/tensorflow/compiler/xla/service/bfloat16_conversion_folding_test.cc
index 28e71c2054..7fd1e733e9 100644
--- a/tensorflow/compiler/xla/service/bfloat16_conversion_folding_test.cc
+++ b/tensorflow/compiler/xla/service/bfloat16_conversion_folding_test.cc
@@ -211,6 +211,17 @@ TEST_F(BFloat16ConversionFoldingTest, DoNotFoldTuple) {
 
 TEST_F(BFloat16ConversionFoldingTest, FoldCrossReplicaSumTupleOutput) {
   auto builder = HloComputation::Builder(TestName());
+
+  auto module = CreateNewModule();
+  HloComputation::Builder sum_builder("add");
+  auto x = sum_builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/0, ShapeUtil::MakeShape(F32, {}), "x"));
+  auto y = sum_builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/1, ShapeUtil::MakeShape(F32, {}), "y"));
+  sum_builder.AddInstruction(HloInstruction::CreateBinary(
+      ShapeUtil::MakeShape(F32, {}), HloOpcode::kAdd, x, y));
+  HloComputation* sum = module->AddEmbeddedComputation(sum_builder.Build());
+
   Shape f32_shape = ShapeUtil::MakeShape(F32, {2, 4});
   Shape bf16_shape = ShapeUtil::MakeShape(BF16, {2, 4});
 
@@ -223,7 +234,8 @@ TEST_F(BFloat16ConversionFoldingTest, FoldCrossReplicaSumTupleOutput) {
 
   HloInstruction* crs =
       builder.AddInstruction(HloInstruction::CreateCrossReplicaSum(
-          ShapeUtil::MakeTupleShape({f32_shape, f32_shape}), {convert_a, b}));
+          ShapeUtil::MakeTupleShape({f32_shape, f32_shape}), {convert_a, b},
+          sum));
   HloInstruction* gte_a = builder.AddInstruction(
       HloInstruction::CreateGetTupleElement(f32_shape, crs, 0));
   HloInstruction* gte_b = builder.AddInstruction(
@@ -233,7 +245,6 @@ TEST_F(BFloat16ConversionFoldingTest, FoldCrossReplicaSumTupleOutput) {
   HloInstruction* tuple = builder.AddInstruction(
       HloInstruction::CreateTuple({gte_a, convert_gte_b}));
 
-  auto module = CreateNewModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   EXPECT_TRUE(FoldConversions(module.get()));
diff --git a/tensorflow/compiler/xla/service/bfloat16_normalization_test.cc b/tensorflow/compiler/xla/service/bfloat16_normalization_test.cc
index 1afaefd9df..9926661dd3 100644
--- a/tensorflow/compiler/xla/service/bfloat16_normalization_test.cc
+++ b/tensorflow/compiler/xla/service/bfloat16_normalization_test.cc
@@ -228,6 +228,17 @@ TEST_F(BFloat16NormalizationTest, ResolveUnsupportedMixedPrecisionReduce) {
 }
 
 TEST_F(BFloat16NormalizationTest, ResolveMixedPrecisionTupleCrossReplicaSum) {
+  auto module = CreateNewModule();
+  HloComputation::Builder sum_builder("sum");
+  auto x = sum_builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/0, ShapeUtil::MakeShape(F32, {}), "x"));
+  auto y = sum_builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/1, ShapeUtil::MakeShape(F32, {}), "y"));
+  sum_builder.AddInstruction(HloInstruction::CreateBinary(
+      ShapeUtil::MakeShape(F32, {}), HloOpcode::kAdd, x, y));
+  HloComputation* reduction =
+      module->AddEmbeddedComputation(sum_builder.Build());
+
   auto builder = HloComputation::Builder(TestName());
   Shape f32_shape = ShapeUtil::MakeShape(F32, {2, 4});
   Shape bf16_shape = ShapeUtil::MakeShape(BF16, {2, 4});
@@ -239,11 +250,11 @@ TEST_F(BFloat16NormalizationTest, ResolveMixedPrecisionTupleCrossReplicaSum) {
 
   HloInstruction* crs =
       builder.AddInstruction(HloInstruction::CreateCrossReplicaSum(
-          ShapeUtil::MakeTupleShape({f32_shape, bf16_shape}), {a, b}));
+          ShapeUtil::MakeTupleShape({f32_shape, bf16_shape}), {a, b},
+          reduction));
   HloInstruction* gte = builder.AddInstruction(
       HloInstruction::CreateGetTupleElement(bf16_shape, crs, 1));
 
-  auto module = CreateNewModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   EXPECT_TRUE(Normalize(module.get()));
diff --git a/tensorflow/compiler/xla/service/buffer_assignment.cc b/tensorflow/compiler/xla/service/buffer_assignment.cc
index c0b8bf9039..682c386579 100644
--- a/tensorflow/compiler/xla/service/buffer_assignment.cc
+++ b/tensorflow/compiler/xla/service/buffer_assignment.cc
@@ -135,6 +135,7 @@ Status GatherComputationsByAllocationType(
             worklist.push_back(std::make_pair(subcomputation,
                                               false));  // Not thread local.
             break;
+          case HloOpcode::kCrossReplicaSum:
           case HloOpcode::kMap:
           case HloOpcode::kReduce:
           case HloOpcode::kReduceWindow:
diff --git a/tensorflow/compiler/xla/service/call_graph.cc b/tensorflow/compiler/xla/service/call_graph.cc
index a8053d15e1..a23427f00c 100644
--- a/tensorflow/compiler/xla/service/call_graph.cc
+++ b/tensorflow/compiler/xla/service/call_graph.cc
@@ -57,6 +57,7 @@ CallContext GetInstructionCallContext(HloOpcode opcode) {
     case HloOpcode::kConditional:
     case HloOpcode::kWhile:
       return CallContext::kSequential;
+    case HloOpcode::kCrossReplicaSum:
     case HloOpcode::kMap:
     case HloOpcode::kReduce:
     case HloOpcode::kReduceWindow:
diff --git a/tensorflow/compiler/xla/service/hlo_element_type_converter.cc b/tensorflow/compiler/xla/service/hlo_element_type_converter.cc
index abec29df43..4ed1508d70 100644
--- a/tensorflow/compiler/xla/service/hlo_element_type_converter.cc
+++ b/tensorflow/compiler/xla/service/hlo_element_type_converter.cc
@@ -141,6 +141,7 @@ StatusOr<bool> HloElementTypeConverter::Run(HloModule* module) {
       // These are ops with embedded computations where it suffices to convert
       // the embedded computations instead of converting the ops themselves.
       if (opcode == HloOpcode::kWhile || opcode == HloOpcode::kCall ||
+          opcode == HloOpcode::kCrossReplicaSum ||
           opcode == HloOpcode::kFusion || opcode == HloOpcode::kMap ||
           opcode == HloOpcode::kReduce || opcode == HloOpcode::kReduceWindow ||
           opcode == HloOpcode::kSelectAndScatter ||
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc
index 1c276b9305..06775d6a9a 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction.cc
@@ -423,8 +423,20 @@ HloInstruction::CreateReducePrecision(const Shape& shape,
 
 /* static */ std::unique_ptr<HloInstruction>
 HloInstruction::CreateCrossReplicaSum(
-    const Shape& shape, tensorflow::gtl::ArraySlice<HloInstruction*> operands) {
-  return CreateNary(shape, HloOpcode::kCrossReplicaSum, operands);
+    const Shape& shape, tensorflow::gtl::ArraySlice<HloInstruction*> operands,
+    HloComputation* reduce_computation,
+    tensorflow::gtl::ArraySlice<int64> replica_group_ids,
+    const tensorflow::gtl::optional<int64>& channel_id) {
+  // TODO(b/79737069): Remove the CHECK when supported.
+  CHECK(replica_group_ids.empty());
+  CHECK(!channel_id.has_value());
+  auto instruction =
+      WrapUnique(new HloInstruction(HloOpcode::kCrossReplicaSum, shape));
+  for (auto operand : operands) {
+    instruction->AppendOperand(operand);
+  }
+  instruction->called_computations_.push_back(reduce_computation);
+  return instruction;
 }
 
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateInfeed(
@@ -1374,7 +1386,7 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
       clone = CreateFft(shape, new_operands[0], fft_type_, fft_length_);
       break;
     case HloOpcode::kCrossReplicaSum:
-      clone = CreateCrossReplicaSum(shape, new_operands);
+      clone = CreateCrossReplicaSum(shape, new_operands, to_apply());
       break;
     case HloOpcode::kGetTupleElement:
       CHECK_EQ(new_operands.size(), 1);
@@ -1762,7 +1774,6 @@ bool HloInstruction::IdenticalSlowPath(
     case HloOpcode::kConvert:
     case HloOpcode::kCopy:
     case HloOpcode::kCos:
-    case HloOpcode::kCrossReplicaSum:
     case HloOpcode::kDivide:
     case HloOpcode::kDynamicSlice:
     case HloOpcode::kDynamicUpdateSlice:
@@ -1887,6 +1898,7 @@ bool HloInstruction::IdenticalSlowPath(
              slice_limits_ == other.slice_limits_ &&
              slice_strides_ == other.slice_strides_;
     case HloOpcode::kCall:
+    case HloOpcode::kCrossReplicaSum:
     case HloOpcode::kMap:
       return eq_computations(to_apply(), other.to_apply());
     case HloOpcode::kCustomCall:
@@ -2034,6 +2046,7 @@ HloComputation* HloInstruction::to_apply() const {
     case HloOpcode::kMap:
     case HloOpcode::kReduceWindow:
     case HloOpcode::kReduce:
+    case HloOpcode::kCrossReplicaSum:
       CHECK_EQ(called_computations_.size(), 1);
       return called_computations_[0];
     default:
@@ -2356,7 +2369,8 @@ std::vector<string> HloInstruction::ExtraAttributesToString(
                              PrintName(false_computation()->name(), options)));
     } else if (opcode() == HloOpcode::kCall || opcode() == HloOpcode::kMap ||
                opcode() == HloOpcode::kReduceWindow ||
-               opcode() == HloOpcode::kReduce) {
+               opcode() == HloOpcode::kReduce ||
+               opcode() == HloOpcode::kCrossReplicaSum) {
       extra.push_back(
           StrCat("to_apply=", PrintName(to_apply()->name(), options)));
     } else if (!called_computations().empty()) {
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h
index 905ea5310d..ef55c6668f 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.h
+++ b/tensorflow/compiler/xla/service/hlo_instruction.h
@@ -426,10 +426,26 @@ class HloInstruction {
       const Shape& shape, HloInstruction* operand, const int exponent_bits,
       const int mantissa_bits);
 
-  // Creates a cross replica sum op.
+  // Creates a cross replica reduction op.
+  //
+  // `reduce_computation`: the reduction function.
+  //
+  // `replica_group_ids`: maps replica ids to subgroup ids. If empty, all
+  // replicas belong to one group. Allreduce will be applied within subgroups.
+  // For example, we have 4 replicas, then replica_group_ids={0,1,0,1} means,
+  // replica 0 and 2 are in subgroup 0, replica 1 and 3 are in subgroup 1.
+  //
+  // `channel_id`: for Allreduce nodes from different models, if they have the
+  // same channel_id, they will be 'Allreduce'd. If empty, Allreduce will not be
+  // applied across models.
+  //
+  // TODO(b/79737069): Rename this to AllReduce.
   static std::unique_ptr<HloInstruction> CreateCrossReplicaSum(
-      const Shape& shape,
-      tensorflow::gtl::ArraySlice<HloInstruction*> operands);
+      const Shape& shape, tensorflow::gtl::ArraySlice<HloInstruction*> operands,
+      HloComputation* reduce_computation,
+      tensorflow::gtl::ArraySlice<int64> replica_group_ids = {},
+      const tensorflow::gtl::optional<int64>& channel_id =
+          tensorflow::gtl::nullopt);
 
   // Creates a conversion instruction, where operand is the data to convert and
   // shape is the target shape for the conversion.
diff --git a/tensorflow/compiler/xla/service/hlo_parser.cc b/tensorflow/compiler/xla/service/hlo_parser.cc
index ec20606d2f..3eadedfe1f 100644
--- a/tensorflow/compiler/xla/service/hlo_parser.cc
+++ b/tensorflow/compiler/xla/service/hlo_parser.cc
@@ -587,11 +587,14 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
       break;
     }
     case HloOpcode::kCrossReplicaSum: {
+      optional<HloComputation*> to_apply;
+      attrs["to_apply"] = {/*required=*/true, AttrTy::kHloComputation,
+                           &to_apply};
       if (!ParseOperands(&operands) || !ParseAttributes(attrs)) {
         return false;
       }
       instruction = builder->AddInstruction(
-          HloInstruction::CreateCrossReplicaSum(shape, operands));
+          HloInstruction::CreateCrossReplicaSum(shape, operands, *to_apply));
       break;
     }
     case HloOpcode::kReshape: {
diff --git a/tensorflow/compiler/xla/service/hlo_parser_test.cc b/tensorflow/compiler/xla/service/hlo_parser_test.cc
index 84a981675f..08068dc504 100644
--- a/tensorflow/compiler/xla/service/hlo_parser_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_parser_test.cc
@@ -898,6 +898,24 @@ ENTRY Gather {
   ROOT gather = f32[10,9,8,7,30,29,28,27,26]{8,7,6,5,4,3,2,1,0} gather(input_tensor, gather_indices), output_window_dims={4,5,6,7,8}, elided_window_dims={}, gather_dims_to_operand_dims={0,1,2,3,4}, index_vector_dim=4, window_bounds={30,29,28,27,26}
 }
 
+)"
+},
+// cross-replica-sum
+{
+"CrossReplicaSum",
+R"(HloModule CRS
+
+add {
+  lhs = f32[] parameter(0)
+  rhs = f32[] parameter(1)
+  ROOT add = f32[] add(lhs, rhs)
+}
+
+ENTRY CRS {
+  input = f32[8]{0} parameter(0)
+  ROOT crs = f32[8]{0} cross-replica-sum(input), to_apply=add
+}
+
 )"
 },
   });
diff --git a/tensorflow/compiler/xla/tests/cross_replica_sum_test.cc b/tensorflow/compiler/xla/tests/cross_replica_sum_test.cc
index c960b3c15f..b151187c4b 100644
--- a/tensorflow/compiler/xla/tests/cross_replica_sum_test.cc
+++ b/tensorflow/compiler/xla/tests/cross_replica_sum_test.cc
@@ -32,9 +32,16 @@ class TrivialCrossReplicaSumTest : public HloTestBase {};
 XLA_TEST_F(TrivialCrossReplicaSumTest, OneOperand) {
   const char* module_str = R"(
   HloModule test
+
+  add {
+    x = f32[] parameter(0)
+    y = f32[] parameter(1)
+    add = f32[] add(x, y)
+  }
+
   ENTRY test_computation {
     p = f32[3] parameter(0)
-    ROOT crs = f32[3] cross-replica-sum(p)
+    ROOT crs = f32[3] cross-replica-sum(p), to_apply=add
   })";
   auto module =
       ParseHloString(module_str, GetModuleConfigForTest()).ValueOrDie();
@@ -45,10 +52,17 @@ XLA_TEST_F(TrivialCrossReplicaSumTest, OneOperand) {
 XLA_TEST_F(TrivialCrossReplicaSumTest, MultipleOperands) {
   const char* module_str = R"(
   HloModule test
+
+  add {
+    x = f32[] parameter(0)
+    y = f32[] parameter(1)
+    add = f32[] add(x, y)
+  }
+
   ENTRY test_computation {
     p0 = f32[3] parameter(0)
     p1 = f32[2] parameter(1)
-    ROOT crs = (f32[3], f32[2]) cross-replica-sum(p0, p1)
+    ROOT crs = (f32[3], f32[2]) cross-replica-sum(p0, p1), to_apply=add
   })";
   auto module =
       ParseHloString(module_str, GetModuleConfigForTest()).ValueOrDie();
@@ -65,10 +79,17 @@ XLA_TEST_F(TrivialCrossReplicaSumTest, MultipleOperands) {
 XLA_TEST_F(TrivialCrossReplicaSumTest, ConstantOperand) {
   const char* module_str = R"(
   HloModule test
+
+  add {
+    x = f32[] parameter(0)
+    y = f32[] parameter(1)
+    add = f32[] add(x, y)
+  }
+
   ENTRY test_computation {
     p0 = f32[3] parameter(0)
     p1 = f32[2] constant({10, 20})
-    ROOT crs = (f32[3], f32[2]) cross-replica-sum(p0, p1)
+    ROOT crs = (f32[3], f32[2]) cross-replica-sum(p0, p1), to_apply=add
   })";
   auto module =
       ParseHloString(module_str, GetModuleConfigForTest()).ValueOrDie();
-- 
GitLab


From 94154af95e6a8f32bd50791a81a64c0bc3154ca4 Mon Sep 17 00:00:00 2001
From: Amit Patankar <amitpatankar@google.com>
Date: Tue, 5 Jun 2018 16:29:00 -0700
Subject: [PATCH 351/610] Adding the autograph operators dependency to the pip
 package.

---
 tensorflow/tools/pip_package/BUILD | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index e113565f45..9d4148c07f 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -59,6 +59,7 @@ COMMON_PIP_DEPS = [
     "//tensorflow/contrib/autograph/converters:converters",
     "//tensorflow/contrib/autograph/converters:test_lib",
     "//tensorflow/contrib/autograph/impl:impl",
+    "//tensorflow/contrib/autograph/operators:operators",
     "//tensorflow/contrib/autograph/pyct:pyct",
     "//tensorflow/contrib/autograph/pyct/static_analysis:static_analysis",
     "//tensorflow/contrib/boosted_trees:boosted_trees_pip",
-- 
GitLab


From 902832ae7f80a610f8e685396cc60f426b9c1292 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 5 Jun 2018 16:29:35 -0700
Subject: [PATCH 352/610] Add the dart rule to tensorflow/core:protos_all.

PiperOrigin-RevId: 199377753
---
 tensorflow/core/BUILD                             | 3 +++
 tensorflow/core/platform/default/build_config.bzl | 3 ++-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 28af3ce4ea..8e9d0eb0d5 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -231,6 +231,7 @@ tf_proto_library(
     name = "protos_all",
     srcs = [],
     cc_api_version = 2,
+    dart_api_version = 2,
     default_header = True,
     j2objc_api_version = 1,
     java_api_version = 2,
@@ -2232,6 +2233,7 @@ tf_proto_library(
     name = "error_codes_proto",
     srcs = ERROR_CODES_PROTO_SRCS,
     cc_api_version = 2,
+    dart_api_version = 2,
     default_header = True,
     j2objc_api_version = 1,
     java_api_version = 2,
@@ -2254,6 +2256,7 @@ tf_proto_library(
     name = "protos_all_proto",
     srcs = COMMON_PROTO_SRCS + ADDITIONAL_CORE_PROTO_SRCS,
     cc_api_version = 2,
+    dart_api_version = 2,
     default_header = True,
     j2objc_api_version = 1,
     java_api_version = 2,
diff --git a/tensorflow/core/platform/default/build_config.bzl b/tensorflow/core/platform/default/build_config.bzl
index 43fe82cc13..47f7e29556 100644
--- a/tensorflow/core/platform/default/build_config.bzl
+++ b/tensorflow/core/platform/default/build_config.bzl
@@ -304,6 +304,7 @@ def tf_proto_library_cc(name, srcs = [], has_services = None,
                         cc_grpc_version = None,
                         j2objc_api_version = 1,
                         cc_api_version = 2,
+                        dart_api_version = 2,
                         java_api_version = 2, py_api_version = 2,
                         js_api_version = 2, js_codegen = "jspb",
                         default_header = False):
@@ -409,7 +410,7 @@ def tf_proto_library(name, srcs = [], has_services = None,
                      visibility = [], testonly = 0,
                      cc_libs = [],
                      cc_api_version = 2, cc_grpc_version = None,
-                     j2objc_api_version = 1,
+                     dart_api_version = 2, j2objc_api_version = 1,
                      java_api_version = 2, py_api_version = 2,
                      js_api_version = 2, js_codegen = "jspb",
                      provide_cc_alias = False,
-- 
GitLab


From 490a6f55e4fe73c7cc1bc136684dbfab1da6f7c6 Mon Sep 17 00:00:00 2001
From: Amit Patankar <amitpatankar@google.com>
Date: Tue, 5 Jun 2018 16:29:00 -0700
Subject: [PATCH 353/610] Adding the autograph operators dependency to the pip
 package.

---
 tensorflow/tools/pip_package/BUILD | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index e113565f45..9d4148c07f 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -59,6 +59,7 @@ COMMON_PIP_DEPS = [
     "//tensorflow/contrib/autograph/converters:converters",
     "//tensorflow/contrib/autograph/converters:test_lib",
     "//tensorflow/contrib/autograph/impl:impl",
+    "//tensorflow/contrib/autograph/operators:operators",
     "//tensorflow/contrib/autograph/pyct:pyct",
     "//tensorflow/contrib/autograph/pyct/static_analysis:static_analysis",
     "//tensorflow/contrib/boosted_trees:boosted_trees_pip",
-- 
GitLab


From 8a141854d81a9135a3658255c5813c5277364d01 Mon Sep 17 00:00:00 2001
From: Justin Lebar <jlebar@google.com>
Date: Tue, 5 Jun 2018 17:34:20 -0700
Subject: [PATCH 354/610] [XLA] Add a bytes read+written table to the end of
 --xla_hlo_profile.

This is useful when tuning fusion heuristics -- you expect this number
to go down (even if the total runtime doesn't go down, due to suboptimal
emitters).

PiperOrigin-RevId: 199386923
---
 .../service/human_readable_profile_builder.cc   | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/tensorflow/compiler/xla/service/human_readable_profile_builder.cc b/tensorflow/compiler/xla/service/human_readable_profile_builder.cc
index dc3bfce0c4..d7458c338e 100644
--- a/tensorflow/compiler/xla/service/human_readable_profile_builder.cc
+++ b/tensorflow/compiler/xla/service/human_readable_profile_builder.cc
@@ -169,6 +169,23 @@ string HumanReadableProfileBuilder::ToString() const {
       StrAppend(&s, table.MakeReport(CyclesToMicroseconds(total_cycles_)));
     }
   }
+
+  if (total_bytes > 0) {
+    MetricTableReport table;
+    table.SetMetricName("MiB read+written");
+    table.SetEntryName("ops");
+    table.SetShowCategoryTable();
+    for (const auto& op : op_infos_) {
+      MetricTableReport::Entry entry;
+      entry.text = op.name;
+      entry.short_text = op.short_name;
+      entry.category_text = op.category;
+      entry.metric = static_cast<double>(op.bytes_accessed) / (1 << 20);
+      table.AddEntry(std::move(entry));
+    }
+    StrAppend(&s,
+              table.MakeReport(static_cast<double>(total_bytes) / (1 << 20)));
+  }
   return s;
 }
 
-- 
GitLab


From 5105350be955422169de1f22bb99f928c1f4c2ae Mon Sep 17 00:00:00 2001
From: Michael Case <mikecase@google.com>
Date: Tue, 5 Jun 2018 17:47:19 -0700
Subject: [PATCH 355/610] Moves generated android_sdk() and android_ndk() repo
 rules out of WORKSPACE.

These rules currently get written by configure.py script to WORKSPACE
file which is not ideal since (1) WORKSPACE file is tracked by git and
(2) we require users to manually delete the rules in order to
update/regenerate them.

Moving these rules into an external repo that is generated based on
several ENV variables set by the configure.py script. Modifying any
of these ENV variables will cause the rules to be updated.

PiperOrigin-RevId: 199388460
---
 WORKSPACE                                     | 24 +----
 configure.py                                  | 94 ++++++-------------
 third_party/android/BUILD                     |  0
 third_party/android/android.bzl.tpl           |  9 ++
 .../android/android_configure.BUILD.tpl       |  0
 third_party/android/android_configure.bzl     | 87 +++++++++++++++++
 6 files changed, 129 insertions(+), 85 deletions(-)
 create mode 100644 third_party/android/BUILD
 create mode 100644 third_party/android/android.bzl.tpl
 create mode 100644 third_party/android/android_configure.BUILD.tpl
 create mode 100644 third_party/android/android_configure.bzl

diff --git a/WORKSPACE b/WORKSPACE
index 4ddfb9a383..fd7570a80a 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -22,26 +22,10 @@ check_bazel_version_at_least("0.10.0")
 
 load("//tensorflow:workspace.bzl", "tf_workspace")
 
-# Uncomment and update the paths in these entries to build the Android demo.
-#android_sdk_repository(
-#    name = "androidsdk",
-#    api_level = 23,
-#    # Ensure that you have the build_tools_version below installed in the
-#    # SDK manager as it updates periodically.
-#    build_tools_version = "26.0.1",
-#    # Replace with path to Android SDK on your system
-#    path = "<PATH_TO_SDK>",
-#)
-#
-#android_ndk_repository(
-#    name="androidndk",
-#    path="<PATH_TO_NDK>",
-#    # This needs to be 14 or higher to compile TensorFlow.
-#    # Please specify API level to >= 21 to build for 64-bit
-#    # archtectures or the Android NDK will automatically select biggest
-#    # API level that it supports without notice.
-#    # Note that the NDK version is not the API level.
-#    api_level=14)
+load("//third_party/android:android_configure.bzl", "android_configure")
+android_configure(name="local_config_android")
+load("@local_config_android//:android.bzl", "android_workspace")
+android_workspace()
 
 # Please add all new TensorFlow dependencies in workspace.bzl.
 tf_workspace()
diff --git a/configure.py b/configure.py
index b6c32543cf..bde7af8c0e 100644
--- a/configure.py
+++ b/configure.py
@@ -670,8 +670,9 @@ def create_android_ndk_rule(environ_cp):
       error_msg=('The path %s or its child file "source.properties" '
                  'does not exist.')
   )
-
-  write_android_ndk_workspace_rule(android_ndk_home_path)
+  write_action_env_to_bazelrc('ANDROID_NDK_HOME', android_ndk_home_path)
+  write_action_env_to_bazelrc('ANDROID_NDK_API_LEVEL',
+                              check_ndk_level(android_ndk_home_path))
 
 
 def create_android_sdk_rule(environ_cp):
@@ -733,41 +734,12 @@ def create_android_sdk_rule(environ_cp):
       error_msg=('The selected SDK does not have build-tools version %s '
                  'available.'))
 
-  write_android_sdk_workspace_rule(android_sdk_home_path,
-                                   android_build_tools_version,
-                                   android_api_level)
-
-
-def write_android_sdk_workspace_rule(android_sdk_home_path,
-                                     android_build_tools_version,
-                                     android_api_level):
-  print('Writing android_sdk_workspace rule.\n')
-  with open(_TF_WORKSPACE, 'a') as f:
-    f.write("""
-android_sdk_repository(
-  name="androidsdk",
-  api_level=%s,
-  path="%s",
-  build_tools_version="%s")\n
-""" % (android_api_level, android_sdk_home_path, android_build_tools_version))
-
-
-def write_android_ndk_workspace_rule(android_ndk_home_path):
-  print('Writing android_ndk_workspace rule.')
-  ndk_api_level = check_ndk_level(android_ndk_home_path)
-  if int(ndk_api_level) not in _SUPPORTED_ANDROID_NDK_VERSIONS:
-    print('WARNING: The API level of the NDK in %s is %s, which is not '
-          'supported by Bazel (officially supported versions: %s). Please use '
-          'another version. Compiling Android targets may result in confusing '
-          'errors.\n' % (android_ndk_home_path, ndk_api_level,
-                         _SUPPORTED_ANDROID_NDK_VERSIONS))
-  with open(_TF_WORKSPACE, 'a') as f:
-    f.write("""
-android_ndk_repository(
-  name="androidndk",
-  path="%s",
-  api_level=%s)\n
-""" % (android_ndk_home_path, ndk_api_level))
+  write_action_env_to_bazelrc('ANDROID_BUILD_TOOLS_VERSION',
+                              android_build_tools_version)
+  write_action_env_to_bazelrc('ANDROID_SDK_API_LEVEL',
+                              android_api_level)
+  write_action_env_to_bazelrc('ANDROID_SDK_HOME',
+                              android_sdk_home_path)
 
 
 def check_ndk_level(android_ndk_home_path):
@@ -780,18 +752,16 @@ def check_ndk_level(android_ndk_home_path):
 
   revision = re.search(r'Pkg.Revision = (\d+)', filedata)
   if revision:
-    return revision.group(1)
-  return None
-
-
-def workspace_has_any_android_rule():
-  """Check the WORKSPACE for existing android_*_repository rules."""
-  with open(_TF_WORKSPACE, 'r') as f:
-    workspace = f.read()
-  has_any_rule = re.search(r'^android_[ns]dk_repository',
-                           workspace,
-                           re.MULTILINE)
-  return has_any_rule
+    ndk_api_level = revision.group(1)
+  else:
+    raise Exception('Unable to parse NDK revision.')
+  if int(ndk_api_level) not in _SUPPORTED_ANDROID_NDK_VERSIONS:
+    print('WARNING: The API level of the NDK in %s is %s, which is not '
+          'supported by Bazel (officially supported versions: %s). Please use '
+          'another version. Compiling Android targets may result in confusing '
+          'errors.\n' % (android_ndk_home_path, ndk_api_level,
+                         _SUPPORTED_ANDROID_NDK_VERSIONS))
+  return ndk_api_level
 
 
 def set_gcc_host_compiler_path(environ_cp):
@@ -1223,7 +1193,7 @@ def set_tf_cuda_compute_capabilities(environ_cp):
     # Check whether all capabilities from the input is valid
     all_valid = True
     # Remove all whitespace characters before splitting the string
-    # that users may insert by accident, as this will result in error 
+    # that users may insert by accident, as this will result in error
     tf_cuda_compute_capabilities = ''.join(tf_cuda_compute_capabilities.split())
     for compute_capability in tf_cuda_compute_capabilities.split(','):
       m = re.match('[0-9]+.[0-9]+', compute_capability)
@@ -1551,21 +1521,15 @@ def main():
   set_cc_opt_flags(environ_cp)
   set_windows_build_flags()
 
-  if workspace_has_any_android_rule():
-    print('The WORKSPACE file has at least one of ["android_sdk_repository", '
-          '"android_ndk_repository"] already set. Will not ask to help '
-          'configure the WORKSPACE. Please delete the existing rules to '
-          'activate the helper.\n')
-  else:
-    if get_var(
-        environ_cp, 'TF_SET_ANDROID_WORKSPACE', 'android workspace',
-        False,
-        ('Would you like to interactively configure ./WORKSPACE for '
-         'Android builds?'),
-        'Searching for NDK and SDK installations.',
-        'Not configuring the WORKSPACE for Android builds.'):
-      create_android_ndk_rule(environ_cp)
-      create_android_sdk_rule(environ_cp)
+  if get_var(
+      environ_cp, 'TF_SET_ANDROID_WORKSPACE', 'android workspace',
+      False,
+      ('Would you like to interactively configure ./WORKSPACE for '
+       'Android builds?'),
+      'Searching for NDK and SDK installations.',
+      'Not configuring the WORKSPACE for Android builds.'):
+    create_android_ndk_rule(environ_cp)
+    create_android_sdk_rule(environ_cp)
 
   print('Preconfigured Bazel build configs. You can use any of the below by '
         'adding "--config=<>" to your build command. See tools/bazel.rc for '
diff --git a/third_party/android/BUILD b/third_party/android/BUILD
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/third_party/android/android.bzl.tpl b/third_party/android/android.bzl.tpl
new file mode 100644
index 0000000000..e6ed4994f3
--- /dev/null
+++ b/third_party/android/android.bzl.tpl
@@ -0,0 +1,9 @@
+"""Set up configurable Android SDK and NDK dependencies."""
+
+def android_workspace():
+  # String for replacement in Bazel template.
+  # These will either be replaced by android_sdk_repository if various ENV
+  # variables are set when `local_config_android` repo_rule is run, or they
+  # will be replaced by noops otherwise.
+  MAYBE_ANDROID_SDK_REPOSITORY
+  MAYBE_ANDROID_NDK_REPOSITORY
diff --git a/third_party/android/android_configure.BUILD.tpl b/third_party/android/android_configure.BUILD.tpl
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/third_party/android/android_configure.bzl b/third_party/android/android_configure.bzl
new file mode 100644
index 0000000000..da09bdf39e
--- /dev/null
+++ b/third_party/android/android_configure.bzl
@@ -0,0 +1,87 @@
+"""Repository rule for Android SDK and NDK autoconfiguration.
+
+`android_configure` depends on the following environment variables:
+
+  * `ANDROID_NDK_HOME`: Location of Android NDK root.
+  * `ANDROID_SDK_HOME`: Location of Android SDK root.
+  * `ANDROID_SDK_API_LEVEL`: Desired Android SDK API version.
+  * `ANDROID_NDK_API_LEVEL`: Desired Android NDK API version.
+  * `ANDROID_BUILD_TOOLS_VERSION`: Desired Android build tools version.
+"""
+
+# TODO(mikecase): Move logic for getting default values for the env variables
+# from configure.py script into this rule.
+
+_ANDROID_NDK_HOME = "ANDROID_NDK_HOME"
+_ANDROID_SDK_HOME = "ANDROID_SDK_HOME"
+_ANDROID_NDK_API_VERSION = "ANDROID_NDK_API_LEVEL"
+_ANDROID_SDK_API_VERSION = "ANDROID_SDK_API_LEVEL"
+_ANDROID_BUILD_TOOLS_VERSION = "ANDROID_BUILD_TOOLS_VERSION"
+
+_ANDROID_SDK_REPO_TEMPLATE = """
+  native.android_sdk_repository(
+      name="androidsdk",
+      path="%s",
+      api_level=%s,
+      build_tools_version="%s",
+  )
+"""
+
+_ANDROID_NDK_REPO_TEMPLATE = """
+  native.android_ndk_repository(
+      name="androidndk",
+      path="%s",
+      api_level=%s,
+  )
+"""
+
+def _android_autoconf_impl(repository_ctx):
+  """Implementation of the android_configure repository rule."""
+  sdk_home = repository_ctx.os.environ.get(_ANDROID_SDK_HOME)
+  sdk_api_level = repository_ctx.os.environ.get(_ANDROID_SDK_API_VERSION)
+  build_tools_version = repository_ctx.os.environ.get(
+      _ANDROID_BUILD_TOOLS_VERSION)
+  ndk_home = repository_ctx.os.environ.get(_ANDROID_NDK_HOME)
+  ndk_api_level = repository_ctx.os.environ.get(_ANDROID_NDK_API_VERSION)
+
+  sdk_rule = "pass"
+  if all([sdk_home, sdk_api_level, build_tools_version]):
+    sdk_rule = _ANDROID_SDK_REPO_TEMPLATE % (
+        sdk_home, sdk_api_level, build_tools_version)
+
+  ndk_rule = "pass"
+  if all([ndk_home, ndk_api_level]):
+    ndk_rule = _ANDROID_NDK_REPO_TEMPLATE % (ndk_home, ndk_api_level)
+
+  repository_ctx.template(
+      "BUILD",
+      Label("//third_party/android:android_configure.BUILD.tpl"))
+  repository_ctx.template(
+      "android.bzl",
+      Label("//third_party/android:android.bzl.tpl"),
+      substitutions={
+          "MAYBE_ANDROID_SDK_REPOSITORY": sdk_rule,
+          "MAYBE_ANDROID_NDK_REPOSITORY": ndk_rule,
+      })
+
+android_configure = repository_rule(
+    implementation = _android_autoconf_impl,
+    environ = [
+        _ANDROID_SDK_API_VERSION,
+        _ANDROID_NDK_API_VERSION,
+        _ANDROID_BUILD_TOOLS_VERSION,
+        _ANDROID_NDK_HOME,
+        _ANDROID_SDK_HOME,
+    ],
+)
+"""Writes Android SDK and NDK rules.
+
+Add the following to your WORKSPACE FILE:
+
+```python
+android_configure(name = "local_config_android")
+```
+
+Args:
+  name: A unique name for this workspace rule.
+"""
-- 
GitLab


From 2366bd07dd3fc0e82f34f92deeebdc9cb87649de Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 5 Jun 2018 17:49:21 -0700
Subject: [PATCH 356/610] Automated g4 rollback of changelist 197562826

PiperOrigin-RevId: 199388675
---
 .../optimizers/arithmetic_optimizer.cc        | 151 ++++++++++++++++++
 .../optimizers/arithmetic_optimizer.h         |   1 +
 .../optimizers/arithmetic_optimizer_test.cc   |  57 +++++++
 3 files changed, 209 insertions(+)

diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
index 44a14ef7eb..51110b4bda 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
@@ -2334,6 +2334,156 @@ class SimplifyAggregation : public ArithmeticOptimizerStage {
   }
 };
 
+class ConvertPowStage : public ArithmeticOptimizerStage {
+ public:
+  explicit ConvertPowStage(const GraphOptimizerContext& ctx,
+                           const ArithmeticOptimizerContext& ctx_ext)
+      : ArithmeticOptimizerStage("ConvertPow", ctx, ctx_ext) {}
+
+  bool IsSupported(const NodeDef* node) const override {
+    return IsPow(*node) &&
+           ctx().graph_properties->GetInputProperties(node->name()).size() == 2;
+  }
+
+  Status TrySimplify(NodeDef* node, string* simplified_node_name) override {
+    const auto& p = ctx().graph_properties->GetInputProperties(node->name())[1];
+    for (int i = 0; i < p.shape().dim_size(); ++i) {
+      if (p.shape().dim(i).size() < 0) {
+        // skip if p is not fully defined.
+        return Status::OK();
+      }
+    }
+    if (TensorShape::IsValid(p.shape()) && p.has_value()) {
+      Tensor pow(p.dtype(), p.shape());
+      if (!pow.FromProto(p.value())) {
+        return errors::InvalidArgument("Cannot parse tensor from proto: ",
+                                       p.value().DebugString());
+      }
+
+      complex128 prev, curr;
+      for (int i = 0; i < pow.NumElements(); ++i) {
+        TF_RETURN_IF_ERROR(GetElement(pow, i, &curr));
+        if (i != 0 && curr != prev) {
+          // pow has different values on different elements. Skip.
+          return Status::OK();
+        }
+        prev = curr;
+      }
+      NodeDef *x, *y;
+      TF_RETURN_IF_ERROR(GetInputNode(node->input(0), &x));
+      TF_RETURN_IF_ERROR(GetInputNode(node->input(1), &y));
+      if (curr == complex128(2, 0)) {
+        node->set_op("Square");
+        node->set_input(1, AsControlDependency(y->name()));
+        AddToOptimizationQueue(node);
+        AddToOptimizationQueue(y);
+      } else if (curr == complex128(1, 0)) {
+        node->set_op("Identity");
+        node->set_input(1, AsControlDependency(y->name()));
+        AddToOptimizationQueue(node);
+        AddToOptimizationQueue(y);
+      } else if (curr == complex128(0.5, 0)) {
+        node->set_op("Sqrt");
+        node->set_input(1, AsControlDependency(y->name()));
+        AddToOptimizationQueue(node);
+        AddToOptimizationQueue(y);
+      } else if (curr == complex128(0, 0)) {
+        const auto& b =
+            ctx().graph_properties->GetInputProperties(node->name())[0];
+        for (int i = 0; i < b.shape().dim_size(); ++i) {
+          if (b.shape().dim(i).size() < 0) {
+            // skip if b is not fully defined.
+            return Status::OK();
+          }
+        }
+        if (TensorShape::IsValid(b.shape()) && b.has_value()) {
+          Tensor base(b.dtype(), b.shape());
+          if (!base.FromProto(b.value())) {
+            return errors::InvalidArgument("Cannot parse tensor from proto: ",
+                                           b.value().DebugString());
+          }
+          node->set_op("Const");
+          Tensor c(base.dtype(), base.shape());
+          for (int i = 0; i < c.NumElements(); ++i) {
+            TF_RETURN_IF_ERROR(SetElementToOne(i, &c));
+          }
+          (*node->mutable_attr())["dtype"].set_type(base.dtype());
+          c.AsProtoTensorContent(
+              (*node->mutable_attr())["value"].mutable_tensor());
+          node->mutable_attr()->erase("T");
+          node->set_input(0, AsControlDependency(x->name()));
+          node->set_input(1, AsControlDependency(y->name()));
+          AddToOptimizationQueue(node);
+          AddToOptimizationQueue(x);
+          AddToOptimizationQueue(y);
+        }
+      } else if (curr == complex128(-0.5, 0)) {
+        node->set_op("Rsqrt");
+        node->set_input(1, AsControlDependency(y->name()));
+        AddToOptimizationQueue(node);
+        AddToOptimizationQueue(y);
+      } else if (curr == complex128(-1, 0)) {
+        node->set_op("Reciprocal");
+        node->set_input(1, AsControlDependency(y->name()));
+        AddToOptimizationQueue(node);
+        AddToOptimizationQueue(y);
+      }
+    }
+    return Status::OK();
+  }
+
+ private:
+  Status GetElement(const Tensor& t, int i, complex128* element) {
+    switch (t.dtype()) {
+      case DT_INT32:
+        *element = complex128(t.flat<int32>()(i));
+        return Status::OK();
+      case DT_INT64:
+        *element = complex128(t.flat<int64>()(i));
+        return Status::OK();
+      case DT_FLOAT:
+        *element = complex128(t.flat<float>()(i));
+        return Status::OK();
+      case DT_DOUBLE:
+        *element = complex128(t.flat<double>()(i));
+        return Status::OK();
+      case DT_COMPLEX64:
+        *element = complex128(t.flat<complex64>()(i));
+        return Status::OK();
+      case DT_COMPLEX128:
+        *element = t.flat<complex128>()(i);
+        return Status::OK();
+      default:
+        return errors::InvalidArgument("Invalid data type: ", t.dtype());
+    }
+  }
+
+  Status SetElementToOne(int i, Tensor* t) {
+    switch (t->dtype()) {
+      case DT_INT32:
+        t->flat<int32>()(i) = 1;
+        return Status::OK();
+      case DT_INT64:
+        t->flat<int64>()(i) = 1L;
+        return Status::OK();
+      case DT_FLOAT:
+        t->flat<float>()(i) = 1.0f;
+        return Status::OK();
+      case DT_DOUBLE:
+        t->flat<double>()(i) = 1.0;
+        return Status::OK();
+      case DT_COMPLEX64:
+        t->flat<complex64>()(i) = complex64(1);
+        return Status::OK();
+      case DT_COMPLEX128:
+        t->flat<complex128>()(i) = complex128(1);
+        return Status::OK();
+      default:
+        return errors::InvalidArgument("Invalid data type: ", t->dtype());
+    }
+  }
+};
+
 }  // namespace
 
 class UniqueNodes {
@@ -2608,6 +2758,7 @@ Status ArithmeticOptimizer::SimplifyArithmeticOps(bool can_use_shapes) {
     pipeline.AddStage<SqrtDivToRsqrtMulStage>(ctx, ctx_ext);
   if (options_.remove_idempotent)
     pipeline.AddStage<RemoveIdempotentStage>(ctx, ctx_ext);
+  if (options_.convert_pow) pipeline.AddStage<ConvertPowStage>(ctx, ctx_ext);
 
   VLOG(1) << "Run " << pipeline.NumStages() << " arithmetic optimizer stages: "
           << str_util::Join(pipeline.StageNames(), ", ");
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
index f37458eba4..40c5e9fc56 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.h
@@ -74,6 +74,7 @@ class ArithmeticOptimizer : public GraphOptimizer {
     bool reorder_cast_and_transpose = true;
     bool replace_mul_with_square = true;
     bool simplify_aggregation = true;
+    bool convert_pow = true;
 
     // Choose which arithmetic optimizer stages will be enabled for a given
     // optimization level by default.
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
index 8083b6051f..ff96cb6480 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
@@ -245,6 +245,11 @@ class ArithmeticOptimizerTest : public GrapplerTest {
     optimizer->options_.convert_sqrt_div_to_rsqrt_mul = true;
   }
 
+  void EnableOnlyConvertPow(ArithmeticOptimizer* optimizer) {
+    DisableAllStages(optimizer);
+    optimizer->options_.convert_pow = true;
+  }
+
   void EnableOnlyRemoveIdempotent(ArithmeticOptimizer* optimizer) {
     DisableAllStages(optimizer);
     optimizer->options_.remove_idempotent = true;
@@ -2429,6 +2434,58 @@ TEST_F(ArithmeticOptimizerTest, ConvertSqrtDivToRsqrtMul) {
   }
 }
 
+TEST_F(ArithmeticOptimizerTest, ConvertPow) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  auto x = ops::Const(s.WithOpName("x"), {1.0f, 2.0f}, {1, 2});
+  auto y2 = ops::Const(s.WithOpName("y2"), {2.0f, 2.0f}, {1, 2});
+  auto y1 = ops::Const(s.WithOpName("y1"), {1.0f, 1.0f}, {1, 2});
+  auto yPoint5 = ops::Const(s.WithOpName("y.5"), {0.5f, 0.5f}, {1, 2});
+  auto y0 = ops::Const(s.WithOpName("y0"), {0.0f, 0.0f}, {1, 2});
+  auto y_Point5 = ops::Const(s.WithOpName("y_.5"), {-0.5f, -0.5f}, {1, 2});
+  auto y_1 = ops::Const(s.WithOpName("y_1"), {-1.0f, -1.0f}, {1, 2});
+  auto y = ops::Const(s.WithOpName("y"), {3.0f, 4.0f}, {1, 2});
+  Output out2 = ops::Pow(s.WithOpName("out2"), x, y2);
+  Output out1 = ops::Pow(s.WithOpName("out1"), x, y1);
+  Output outPoint5 = ops::Pow(s.WithOpName("out.5"), x, yPoint5);
+  Output out0 = ops::Pow(s.WithOpName("out0"), x, y0);
+  Output out_Point5 = ops::Pow(s.WithOpName("out_.5"), x, y_Point5);
+  Output out_1 = ops::Pow(s.WithOpName("out_1"), x, y_1);
+  Output out = ops::Pow(s.WithOpName("out"), x, y);
+
+  GrapplerItem item;
+  item.fetch = {"out2", "out1", "out.5", "out0", "out_.5", "out_1", "out"};
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
+  EXPECT_EQ(7, tensors_expected.size());
+
+  GraphDef got;
+  ArithmeticOptimizer optimizer;
+  EnableOnlyConvertPow(&optimizer);
+  OptimizeAndPrune(&optimizer, &item, &got);
+  auto tensors = EvaluateNodes(got, item.fetch);
+  EXPECT_EQ(7, tensors.size());
+
+  GraphDef want;
+  AddNode("x", "Const", {}, {}, &want);
+  AddNode("y2", "Const", {}, {}, &want);
+  AddNode("y1", "Const", {}, {}, &want);
+  AddNode("y.5", "Const", {}, {}, &want);
+  AddNode("y0", "Const", {}, {}, &want);
+  AddNode("y_.5", "Const", {}, {}, &want);
+  AddNode("y_1", "Const", {}, {}, &want);
+  AddNode("y", "Const", {}, {}, &want);
+  AddNode("out2", "Square", {"x", AsControlDependency("y2")}, {}, &want);
+  AddNode("out1", "Identity", {"x", AsControlDependency("y1")}, {}, &want);
+  AddNode("out.5", "Sqrt", {"x", AsControlDependency("y.5")}, {}, &want);
+  AddNode("out0", "Const",
+          {AsControlDependency("x"), AsControlDependency("y0")}, {}, &want);
+  AddNode("out_.5", "Rsqrt", {"x", AsControlDependency("y_.5")}, {}, &want);
+  AddNode("out_1", "Reciprocal", {"x", AsControlDependency("y_1")}, {}, &want);
+  AddNode("out", "Pow", {"x", "y"}, {}, &want);
+
+  CompareGraphs(want, got);
+}
+
 TEST_F(ArithmeticOptimizerTest, MinimizeBroadcasts_SimpleSwap) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
 
-- 
GitLab


From acded19b17ce082f3fd95fa9c8b75cb82e65706e Mon Sep 17 00:00:00 2001
From: Shashi Shekhar <shashishekhar@google.com>
Date: Tue, 5 Jun 2018 18:53:44 -0700
Subject: [PATCH 357/610] Fix iOS build.

PiperOrigin-RevId: 199395164
---
 tensorflow/core/BUILD | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 8e9d0eb0d5..5ff65f4f72 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -1570,6 +1570,7 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         ":protos_all_cc_impl",
+        ":stats_calculator_portable",
         "//third_party/eigen3",
         "@double_conversion//:double-conversion",
         "@nsync//:nsync_cpp",
-- 
GitLab


From 98be57ea53cb96ca69fe19a02b2f2bca809a5132 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 5 Jun 2018 19:28:04 -0700
Subject: [PATCH 358/610] Add more logging to report module group metadata
 statistics.

PiperOrigin-RevId: 199397890
---
 .../xla/service/hlo_module_group_metadata.cc  | 36 +++++++++++++++++++
 .../xla/service/hlo_module_group_metadata.h   |  3 ++
 2 files changed, 39 insertions(+)

diff --git a/tensorflow/compiler/xla/service/hlo_module_group_metadata.cc b/tensorflow/compiler/xla/service/hlo_module_group_metadata.cc
index f6fa45a6b7..4f1715e4ca 100644
--- a/tensorflow/compiler/xla/service/hlo_module_group_metadata.cc
+++ b/tensorflow/compiler/xla/service/hlo_module_group_metadata.cc
@@ -113,6 +113,9 @@ Status HloModuleGroupMetadata::Build() {
     }
   }
   TF_RETURN_IF_ERROR(VerifyCompanionSets());
+  if (VLOG_IS_ON(4)) {
+    DumpCollectedStats();
+  }
   return Status::OK();
 }
 
@@ -315,6 +318,7 @@ Status HloModuleGroupMetadata::RecordInstructions() {
       TF_RETURN_IF_ERROR(computation->Accept(visitor));
     }
   }
+  VLOG(2) << "Created " << channels_.size() << " channels";
   return Status::OK();
 }
 
@@ -445,4 +449,36 @@ Status HloModuleGroupMetadata::CheckCommunicatingInstruction(
   return FailedPrecondition("channel is used in disallowed computation");
 }
 
+void HloModuleGroupMetadata::DumpCollectedStats() const {
+  std::map<std::pair<int64, int64>, int64> communication_histogram;
+  for (auto& channel : channels_) {
+    auto from_device = GetInstructionDevice(*channel.send);
+    auto to_device = GetInstructionDevice(*channel.recv);
+    LOG(INFO) << "Channel " << channel.id << ": from_device=" << *from_device
+              << " to_device=" << *to_device << " send=" << channel.send->name()
+              << " send_done=" << channel.send_done->name()
+              << " recv=" << channel.recv->name()
+              << " recv_done=" << channel.recv_done->name();
+    communication_histogram[std::pair<int64, int64>(*from_device,
+                                                    *to_device)] += 1;
+  }
+  for (auto& fromto_count : communication_histogram) {
+    LOG(INFO) << "From " << fromto_count.first.first << " to "
+              << fromto_count.first.second << ": " << fromto_count.second;
+  }
+  for (auto& companion_set : companion_sets_) {
+    LOG(INFO) << "Companion set:";
+    for (HloInstruction* instruction : *companion_set) {
+      LOG(INFO) << "  " << instruction->name();
+    }
+  }
+  for (auto& instruction_comm : tracked_instructions_comms_) {
+    LOG(INFO) << "Communicating instruction " << instruction_comm.first->name();
+    for (HloInstruction* instruction : instruction_comm.second) {
+      auto device = GetInstructionDevice(*instruction);
+      LOG(INFO) << "  " << instruction->name() << " on device " << *device;
+    }
+  }
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_module_group_metadata.h b/tensorflow/compiler/xla/service/hlo_module_group_metadata.h
index f68d4028dc..ffde3a332d 100644
--- a/tensorflow/compiler/xla/service/hlo_module_group_metadata.h
+++ b/tensorflow/compiler/xla/service/hlo_module_group_metadata.h
@@ -230,6 +230,9 @@ class HloModuleGroupMetadata {
     return it != tracked_instructions_.end() ? &it->second : nullptr;
   }
 
+  // Dump all the collected module group statistics to the logs.
+  void DumpCollectedStats() const;
+
   // List of all companion instructions sets in the module.
   std::vector<std::unique_ptr<std::unordered_set<HloInstruction*>>>
       companion_sets_;
-- 
GitLab


From 0978455a4e3f905bacf3f6f98e7c39b717b5d448 Mon Sep 17 00:00:00 2001
From: Yuefeng Zhou <yuefengz@google.com>
Date: Tue, 5 Jun 2018 19:54:32 -0700
Subject: [PATCH 359/610] Add __init__.py to all_reduce.

PiperOrigin-RevId: 199399375
---
 tensorflow/contrib/all_reduce/BUILD       | 10 ++++++
 tensorflow/contrib/all_reduce/__init__.py | 39 +++++++++++++++++++++++
 2 files changed, 49 insertions(+)
 create mode 100644 tensorflow/contrib/all_reduce/__init__.py

diff --git a/tensorflow/contrib/all_reduce/BUILD b/tensorflow/contrib/all_reduce/BUILD
index 62d1b1cf07..881808a98b 100644
--- a/tensorflow/contrib/all_reduce/BUILD
+++ b/tensorflow/contrib/all_reduce/BUILD
@@ -11,6 +11,16 @@ exports_files(["LICENSE"])
 
 load("//tensorflow:tensorflow.bzl", "tf_py_test")
 
+py_library(
+    name = "all_reduce_py",
+    srcs = ["__init__.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":all_reduce",
+        "//tensorflow/python:util",
+    ],
+)
+
 py_library(
     name = "all_reduce",
     srcs = [
diff --git a/tensorflow/contrib/all_reduce/__init__.py b/tensorflow/contrib/all_reduce/__init__.py
new file mode 100644
index 0000000000..f9824f4cfb
--- /dev/null
+++ b/tensorflow/contrib/all_reduce/__init__.py
@@ -0,0 +1,39 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""All-reduce implementations."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# pylint: disable=unused-import,line-too-long,wildcard-import
+from tensorflow.contrib.all_reduce.python.all_reduce import *
+
+from tensorflow.python.util.all_util import remove_undocumented
+# pylint: enable=unused-import,line-too-long,wildcard-import
+
+_allowed_symbols = [
+    'build_ring_all_reduce',
+    'build_recursive_hd_all_reduce',
+    'build_shuffle_all_reduce',
+    'build_nccl_all_reduce',
+    'build_nccl_then_ring',
+    'build_nccl_then_recursive_hd',
+    'build_nccl_then_shuffle',
+    'build_shuffle_then_ring',
+    'build_shuffle_then_shuffle'
+]
+
+remove_undocumented(__name__, allowed_exception_list=_allowed_symbols)
-- 
GitLab


From 95cd2d44150a23a3c322a8056ead74b6867cefa2 Mon Sep 17 00:00:00 2001
From: Gunhan Gulsoy <gunan@google.com>
Date: Tue, 5 Jun 2018 21:33:06 -0700
Subject: [PATCH 360/610] Disable testLargeCase in metric_ops_test

PiperOrigin-RevId: 199405764
---
 tensorflow/contrib/metrics/python/ops/metric_ops_test.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops_test.py b/tensorflow/contrib/metrics/python/ops/metric_ops_test.py
index 4ccba4a253..b13f08a37d 100644
--- a/tensorflow/contrib/metrics/python/ops/metric_ops_test.py
+++ b/tensorflow/contrib/metrics/python/ops/metric_ops_test.py
@@ -2392,6 +2392,7 @@ class StreamingPrecisionRecallAtEqualThresholdsTest(test.TestCase):
         self._testResultsEqual(initial_result, result)
 
   def testLargeCase(self):
+    self.skipTest("Test consistently timing out")
     shape = [32, 512, 256, 1]
     predictions = random_ops.random_uniform(
         shape, 0.0, 1.0, dtype=dtypes_lib.float32)
-- 
GitLab


From 76c9358e344a4d454784faccfbff4a73d9c0a04a Mon Sep 17 00:00:00 2001
From: Akshay Agrawal <akshayka@google.com>
Date: Wed, 6 Jun 2018 02:56:29 -0700
Subject: [PATCH 361/610] Minor touch ups to PartitionedCallOp.

Mostly just cosmetic refactoring to make PartitionedCallOp more readable; also registers a GPU kernel.

PiperOrigin-RevId: 199433460
---
 .../core/kernels/partitioned_function_ops.cc  | 190 ++++++++++--------
 1 file changed, 108 insertions(+), 82 deletions(-)

diff --git a/tensorflow/core/kernels/partitioned_function_ops.cc b/tensorflow/core/kernels/partitioned_function_ops.cc
index d66b1ba663..b6ee808091 100644
--- a/tensorflow/core/kernels/partitioned_function_ops.cc
+++ b/tensorflow/core/kernels/partitioned_function_ops.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/graph/graph_partition.h"
+#include "tensorflow/core/util/ptr_util.h"
 #include "tensorflow/core/util/reffed_status_callback.h"
 
 #if GOOGLE_CUDA
@@ -41,7 +42,8 @@ namespace {
 // TODO(akshayka): Support distributed execution.
 class PartitionedCallOp : public AsyncOpKernel {
  public:
-  explicit PartitionedCallOp(OpKernelConstruction* ctx) : AsyncOpKernel(ctx) {
+  explicit PartitionedCallOp(OpKernelConstruction* ctx)
+      : AsyncOpKernel(ctx), local_device_name_(ctx->device()->name()) {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("f", &func_));
   }
 
@@ -73,92 +75,28 @@ class PartitionedCallOp : public AsyncOpKernel {
     {
       mutex_lock l(mu_);
       if (!partitioned_) {
-        // Instantiate the function to obtain its underlying graph, complete
-        // with nodes for arguments and return values.
-        FunctionLibraryRuntime::InstantiateOptions opts;
-        FHandle handle;
-        OP_REQUIRES_OK_ASYNC(
-            ctx,
-            lib->Instantiate(func_.name(), AttrSlice(&func_.attr()), opts,
-                             &handle),
-            done);
-        Graph* graph = lib->GetFunctionBody(handle)->graph;
+        auto graph = tensorflow::MakeUnique<Graph>(OpRegistry::Global());
+        OP_REQUIRES_OK_ASYNC(ctx, GetGraphFromFunction(lib, graph.get()), done);
 
-        // Pin the inputs and outputs to the local device to simplify the
-        // function-dispatching logic.
-        local_device_name_ = lib->device()->name();
-        for (Node* node : graph->op_nodes()) {
-          string node_type = node->type_string();
-          if (node_type == FunctionLibraryDefinition::kArgOp ||
-              node_type == FunctionLibraryDefinition::kRetOp) {
-            node->set_assigned_device_name(local_device_name_);
-          }
-        }
-
-        // Place the graph, i.e,. assign a device to every node in it.
         DeviceSet device_set;
         for (auto d : lib->device_mgr()->ListDevices()) {
           device_set.AddDevice(d);
         }
-        Placer placer(graph, &device_set);
+        Placer placer(graph.get(), &device_set);
         OP_REQUIRES_OK_ASYNC(ctx, placer.Run(), done);
 
-        // Partition the graph into subgraphs: exactly one subgraph per device.
-        //
-        // TODO(akshayka): Let devices rewrite their graphs.
-        PartitionOptions partition_options;
-        partition_options.node_to_loc = [](const Node* node) {
-          // TODO(akshayka): To better support the distributed case, first split
-          // the graph by worker (e.g,. using the master session's
-          // `SplitByWorker` policy), and then recursively partition the
-          // per-worker shards at the remote worker(s).
-          return node->assigned_device_name();
-        };
-        int64 edge_name_counter = 0;
-        partition_options.new_name =
-            [&edge_name_counter](const string& prefix) {
-              return strings::StrCat(prefix, "/_", ++edge_name_counter);
-            };
-        partition_options.get_incarnation =
-            [&device_set](const string& name) -> int64 {
-          const Device* d = device_set.FindDeviceByName(name);
-          if (d == nullptr) {
-            return PartitionOptions::kIllegalIncarnation;
-          } else {
-            return d->attributes().incarnation();
-          }
-        };
-        partition_options.control_flow_added = false;
-        std::unordered_map<string, GraphDef> partitions;
+        std::unordered_map<string, std::unique_ptr<Graph>> subgraphs;
         OP_REQUIRES_OK_ASYNC(
-            ctx, Partition(partition_options, graph, &partitions), done);
-
-        VLOG(3) << "Partitioned function '" << func_.name() << "', yielding "
-                << partitions.size() << " shards.";
-
-        // `subgraphs` is a map from devices to their corresponding subgraphs.
-        gtl::FlatMap<string, std::unique_ptr<Graph>> subgraphs;
-        const FunctionLibraryDefinition* flib_def = &graph->flib_def();
-        for (const auto& partition : partitions) {
-          std::unique_ptr<Graph> subgraph(new Graph(flib_def));
-          GraphConstructorOptions opts;
-          opts.allow_internal_ops = true;
-          opts.expect_device_spec = true;
-          const string& device = partition.first;
-          const GraphDef& graph_def = partition.second;
-          OP_REQUIRES_OK_ASYNC(
-              ctx, ConvertGraphDefToGraph(opts, graph_def, subgraph.get()),
-              done);
-          subgraphs.emplace(device, std::move(subgraph));
-        }
+            ctx, PartitionHelper(device_set, std::move(graph), &subgraphs),
+            done);
 
         // The FunctionLibraryRuntime's library cannot be mutated from within
-        // an OpKernel, so the functions are instantiated in an overlay library.
+        // an OpKernel, so functions are instantiated in an overlay library.
         overlay_lib_.reset(new FunctionLibraryDefinition(
             *lib->GetFunctionLibraryDefinition()));
         for (const auto& pair : subgraphs) {
           const string& target = pair.first;
-          Graph* subgraph = pair.second.get();
+          const auto& subgraph = pair.second;
           FunctionDef shard;
           string unique_name = UniquifyFunctionName(func_.name());
           OP_REQUIRES_OK_ASYNC(
@@ -173,12 +111,96 @@ class PartitionedCallOp : public AsyncOpKernel {
               lib->Instantiate(unique_name, AttrSlice(&shard.attr()), opts,
                                &handle),
               done);
-          device_handle_map_.emplace(target, handle);
+          function_handles_.emplace(target, handle);
         }
         partitioned_ = true;
       }
     }
+    ExecuteFunctions(lib, ctx, std::move(done));
+  }
+
+ private:
+  typedef std::pair<string, FHandle> DeviceAndFHandle;
+
+  // `func_` encapsulates the original, unsharded function.
+  // Copies the graph backing `func_` into `*graph`, pinning the input and
+  // output nodes to the local device.
+  //
+  // `*graph` must be a freshly allocated graph.
+  Status GetGraphFromFunction(FunctionLibraryRuntime* lib, Graph* graph) {
+    FunctionLibraryRuntime::InstantiateOptions opts;
+    FHandle handle;
+    TF_RETURN_IF_ERROR(lib->Instantiate(func_.name(), AttrSlice(&func_.attr()),
+                                        opts, &handle));
+    const FunctionBody* fbody = lib->GetFunctionBody(handle);
+    if (fbody == nullptr) {
+      return errors::Internal("Could not find handle ", handle);
+    }
+    CopyGraph(*fbody->graph, graph);
 
+    // Pin the inputs and outputs to the local device to simplify the
+    // function-dispatching logic.
+    for (Node* node : graph->op_nodes()) {
+      string node_type = node->type_string();
+      if (node_type == FunctionLibraryDefinition::kArgOp ||
+          node_type == FunctionLibraryDefinition::kRetOp) {
+        node->set_assigned_device_name(local_device_name_);
+      }
+    }
+    return Status::OK();
+  }
+
+  // Partitions `graph` and populates `subgraphs` with the partitions.
+  Status PartitionHelper(
+      const DeviceSet& device_set, std::unique_ptr<Graph> graph,
+      std::unordered_map<string, std::unique_ptr<Graph>>* subgraphs) {
+    PartitionOptions partition_options;
+    partition_options.node_to_loc = [](const Node* node) {
+      // TODO(akshayka): To better support the distributed case, first split
+      // the graph by worker (e.g., using the master session's
+      // `SplitByWorker` policy), and then recursively partition the
+      // per-worker shards at the remote worker(s).
+      return node->assigned_device_name();
+    };
+    int64 edge_name_counter = 0;
+    partition_options.new_name = [&edge_name_counter](const string& prefix) {
+      return strings::StrCat(prefix, "/_", ++edge_name_counter);
+    };
+    partition_options.get_incarnation =
+        [&device_set](const string& name) -> int64 {
+      const Device* d = device_set.FindDeviceByName(name);
+      if (d == nullptr) {
+        return PartitionOptions::kIllegalIncarnation;
+      } else {
+        return d->attributes().incarnation();
+      }
+    };
+    partition_options.control_flow_added = false;
+    std::unordered_map<string, GraphDef> partitions;
+    TF_RETURN_IF_ERROR(Partition(partition_options, graph.get(), &partitions));
+
+    VLOG(3) << "Partitioned function '" << func_.name() << "', yielding "
+            << partitions.size() << " shards.";
+
+    const FunctionLibraryDefinition* flib_def = &graph->flib_def();
+    for (const auto& partition : partitions) {
+      std::unique_ptr<Graph> subgraph(new Graph(flib_def));
+      GraphConstructorOptions opts;
+      opts.allow_internal_ops = true;
+      opts.expect_device_spec = true;
+      const string& device = partition.first;
+      const GraphDef& graph_def = partition.second;
+      TF_RETURN_IF_ERROR(
+          ConvertGraphDefToGraph(opts, graph_def, subgraph.get()));
+      subgraphs->emplace(device, std::move(subgraph));
+    }
+
+    return Status::OK();
+  }
+
+  // Executes the partitioned functions.
+  void ExecuteFunctions(FunctionLibraryRuntime* lib, OpKernelContext* ctx,
+                        DoneCallback done) LOCKS_EXCLUDED(mu_) {
     FunctionLibraryRuntime::Options opts;
     opts.step_id = ctx->step_id();
     opts.step_container = ctx->step_container();
@@ -205,11 +227,11 @@ class PartitionedCallOp : public AsyncOpKernel {
         },
         rendez, std::move(done), std::placeholders::_1);
     auto* refcounted_done = new ReffedStatusCallback(std::move(callback));
-    for (int i = 1; i < device_handle_map_.size(); ++i) {
+    for (int i = 1; i < function_handles_.size(); ++i) {
       refcounted_done->Ref();
     }
 
-    for (const auto& pair : device_handle_map_) {
+    for (const auto& pair : function_handles_) {
       const string& target_device = pair.first;
       FHandle handle = pair.second;
       VLOG(3) << "Running function shard on device " << target_device;
@@ -247,8 +269,6 @@ class PartitionedCallOp : public AsyncOpKernel {
       }
     }
   }
-
- private:
   string UniquifyFunctionName(const string& name) {
     for (;; ++suffix_) {
       const string candidate = strings::StrCat(name, "_", suffix_);
@@ -258,13 +278,13 @@ class PartitionedCallOp : public AsyncOpKernel {
     }
   }
 
-  // `func_` encapsulates the original, unsharded function.
   NameAttrList func_;
-  string local_device_name_;
+  const string local_device_name_;
   // Function shards are added to `overlay_lib_`.
   std::unique_ptr<FunctionLibraryDefinition> overlay_lib_;
-  // A map from device names to handles of function shards.
-  gtl::FlatMap<string, FHandle> device_handle_map_;
+  // A map from device names to handles of function shards; this map is
+  // read-only after the first execution of the OpKernel.
+  gtl::FlatMap<string, FHandle> function_handles_;
 
   mutex mu_;
   bool partitioned_ GUARDED_BY(mu_) = false;
@@ -274,6 +294,12 @@ class PartitionedCallOp : public AsyncOpKernel {
 };
 REGISTER_KERNEL_BUILDER(Name("PartitionedCall").Device(DEVICE_CPU),
                         PartitionedCallOp);
+REGISTER_KERNEL_BUILDER(Name("PartitionedCall").Device(DEVICE_GPU),
+                        PartitionedCallOp);
+#if TENSORFLOW_USE_SYCL
+REGISTER_KERNEL_BUILDER(Name("PartitionedCall").Device(DEVICE_SYCL),
+                        PartitionedCallOp);
+#endif  // TENSORFLOW_USE_SYCL
 
 }  // namespace
 }  // namespace tensorflow
-- 
GitLab


From abd8348587b765aa6a72469a92d03c02802dbcef Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 6 Jun 2018 05:39:15 -0700
Subject: [PATCH 362/610] Tensorflow protos allow enum values outside of the
 listed constants; this is now properly supported in the Text Format.

PiperOrigin-RevId: 199450074
---
 .../gen_proto_text_functions_lib.cc           | 25 +++++++++++++------
 .../gen_proto_text_functions_lib_test.cc      |  5 +++-
 2 files changed, 22 insertions(+), 8 deletions(-)

diff --git a/tensorflow/tools/proto_text/gen_proto_text_functions_lib.cc b/tensorflow/tools/proto_text/gen_proto_text_functions_lib.cc
index 62e29b5128..29add6d5ea 100644
--- a/tensorflow/tools/proto_text/gen_proto_text_functions_lib.cc
+++ b/tensorflow/tools/proto_text/gen_proto_text_functions_lib.cc
@@ -279,8 +279,13 @@ void Generator::AppendFieldValueAppend(const FieldDescriptor& field,
       if (omit_default) {
         Print("if (", field_expr, " != 0) {").Nest();
       }
-      Print("o->AppendEnumName(\"", field.name(), "\", ",
-            GetQualifiedEnumNameFn(*field.enum_type()), "(", field_expr, "));");
+      Print("const char* enum_name = ",
+            GetQualifiedEnumNameFn(*field.enum_type()), "(", field_expr, ");");
+      Print("if (enum_name[0]) {").Nest();
+      Print("o->AppendEnumName(\"", field.name(), "\", enum_name);");
+      Unnest().Print("} else {").Nest();
+      Print("o->AppendNumeric(\"", field.name(), "\", ", field_expr, ");");
+      Unnest().Print("}");
       if (omit_default) {
         Unnest().Print("}");
       }
@@ -540,18 +545,24 @@ void Generator::AppendParseMessageFunction(const Descriptor& md) {
       for (int enum_i = 0; enum_i < enum_d->value_count(); ++enum_i) {
         const auto* value_d = enum_d->value(enum_i);
         const string& value_name = value_d->name();
-        string condition = StrCat("value == \"", value_name,
-                                  "\" || value == \"", value_d->number(), "\"");
-        if (value_d->number() == 0) {
-          StrAppend(&condition, " || value == \"-0\"");
-        }
+        string condition = StrCat("value == \"", value_name, "\"");
 
         Print(enum_i == 0 ? "" : "} else ", "if (", condition, ") {");
         Nest();
         Print(set_value_prefix, "(", value_prefix, value_name, ");");
         Unnest();
       }
+      Print("} else {");
+      Nest();
+      // Proto3 allows all numeric values.
+      Print("int32 int_value;");
+      Print("if (strings::SafeStringToNumeric(value, &int_value)) {");
+      Nest();
+      Print(set_value_prefix, "(static_cast<", GetQualifiedName(*enum_d),
+            ">(int_value));");
+      Unnest();
       Print("} else {").Nest().Print("return false;").Unnest().Print("}");
+      Unnest().Print("}");
     } else {
       Print(field->cpp_type_name(), " value;");
       switch (field->cpp_type()) {
diff --git a/tensorflow/tools/proto_text/gen_proto_text_functions_lib_test.cc b/tensorflow/tools/proto_text/gen_proto_text_functions_lib_test.cc
index 6f0b4f47de..e67add72de 100644
--- a/tensorflow/tools/proto_text/gen_proto_text_functions_lib_test.cc
+++ b/tensorflow/tools/proto_text/gen_proto_text_functions_lib_test.cc
@@ -455,7 +455,10 @@ TEST(CreateProtoDebugStringLibTest, Enums) {
        "repeated_nested_enum: 1"));
 
   EXPECT_PARSE_SUCCESS("", "optional_nested_enum: -0");
-  EXPECT_PARSE_FAILURE("optional_nested_enum: 6");
+  // TODO(amauryfa): restore the line below when protobuf::TextFormat also
+  // supports unknown enum values.
+  // EXPECT_PARSE_SUCCESS("optional_nested_enum: 6", "optional_nested_enum: 6");
+  EXPECT_PARSE_FAILURE("optional_nested_enum: 2147483648");  // > INT32_MAX
   EXPECT_PARSE_FAILURE("optional_nested_enum: BARNONE");
   EXPECT_PARSE_FAILURE("optional_nested_enum: 'BAR'");
   EXPECT_PARSE_FAILURE("optional_nested_enum: \"BAR\" ");
-- 
GitLab


From 0e4532d1123c42f21d9cfd68e90ed5982e305574 Mon Sep 17 00:00:00 2001
From: Dan Moldovan <mdan@google.com>
Date: Wed, 6 Jun 2018 08:56:29 -0400
Subject: [PATCH 363/610] Update __init__.py

Whitelist the operators module in the main library.
---
 tensorflow/contrib/autograph/__init__.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/autograph/__init__.py b/tensorflow/contrib/autograph/__init__.py
index 310eb34a70..79d73af980 100644
--- a/tensorflow/contrib/autograph/__init__.py
+++ b/tensorflow/contrib/autograph/__init__.py
@@ -23,6 +23,7 @@ from __future__ import print_function
 
 # TODO(mdan): Bring only the relevant symbols to the top level.
 from tensorflow.contrib.autograph import utils
+from tensorflow.contrib.autograph import operators
 from tensorflow.contrib.autograph.impl.api import convert
 from tensorflow.contrib.autograph.impl.api import converted_call
 from tensorflow.contrib.autograph.impl.api import do_not_convert
@@ -41,7 +42,8 @@ _allowed_symbols = [
     'do_not_convert',
     'to_code',
     'to_graph',
-    # Special functions
+    # Special functions and overloaded operators
+    'operators',
     'stack',
     # Exceptions
     'AutographParseError',
-- 
GitLab


From 47d42a1ff373520b0f8abbfe655161c9ec0f9e84 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 6 Jun 2018 07:45:17 -0700
Subject: [PATCH 364/610] Internal change

PiperOrigin-RevId: 199464493
---
 .../contrib/lite/models/smartreply/predictor_test.cc      | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tensorflow/contrib/lite/models/smartreply/predictor_test.cc b/tensorflow/contrib/lite/models/smartreply/predictor_test.cc
index e6c8d966f1..c7e08814fd 100644
--- a/tensorflow/contrib/lite/models/smartreply/predictor_test.cc
+++ b/tensorflow/contrib/lite/models/smartreply/predictor_test.cc
@@ -35,8 +35,8 @@ const char kModelName[] = "smartreply_ondevice_model.bin";
 const char kSamples[] = "smartreply_samples.tsv";
 
 string TestDataPath() {
-  return string(StrCat(tensorflow::testing::TensorFlowSrcRoot(), "/",
-                       "contrib/lite/models/testdata/"));
+  return string(absl::StrCat(tensorflow::testing::TensorFlowSrcRoot(), "/",
+                             "contrib/lite/models/testdata/"));
 }
 
 MATCHER_P(IncludeAnyResponesIn, expected_response, "contains the response") {
@@ -55,7 +55,7 @@ class PredictorTest : public ::testing::Test {
  protected:
   PredictorTest() {
     model_ = tflite::FlatBufferModel::BuildFromFile(
-        StrCat(TestDataPath(), "/", kModelName).c_str());
+        absl::StrCat(TestDataPath(), "/", kModelName).c_str());
     CHECK(model_);
   }
   ~PredictorTest() override {}
@@ -121,7 +121,7 @@ TEST_F(PredictorTest, BatchTest) {
   int total_triggers = 0;
 
   string line;
-  std::ifstream fin(StrCat(TestDataPath(), "/", kSamples));
+  std::ifstream fin(absl::StrCat(TestDataPath(), "/", kSamples));
   while (std::getline(fin, line)) {
     const std::vector<string> fields = absl::StrSplit(line, '\t');
     if (fields.empty()) {
-- 
GitLab


From c1b9ac9f215a3a83f7f0b6233bf4cef0b3e74598 Mon Sep 17 00:00:00 2001
From: Alexandre Passos <apassos@google.com>
Date: Wed, 6 Jun 2018 07:50:37 -0700
Subject: [PATCH 365/610] Error checking in c/python code.

PiperOrigin-RevId: 199465056
---
 tensorflow/python/util/util.cc | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tensorflow/python/util/util.cc b/tensorflow/python/util/util.cc
index 8e839b523e..0dd406aa4e 100644
--- a/tensorflow/python/util/util.cc
+++ b/tensorflow/python/util/util.cc
@@ -243,6 +243,9 @@ bool GetNextValuesForIterable(PyObject* nested,
                               std::vector<Safe_PyObjectPtr>* next_values) {
   PyObject* item;
   PyObject* iterator = PyObject_GetIter(nested);
+  if (iterator == nullptr || PyErr_Occurred()) {
+    return false;
+  }
   while ((item = PyIter_Next(iterator)) != nullptr) {
     next_values->emplace_back(item);
   }
-- 
GitLab


From 5c26ec27e5ac23a16d9037b102df8216f821c477 Mon Sep 17 00:00:00 2001
From: James Keeling <jtkeeling@google.com>
Date: Wed, 6 Jun 2018 09:06:25 -0700
Subject: [PATCH 366/610] Clarify documentation of Dataset.filter

It was not explicitly stated that the predicate should return True for elements the user wants to keep.

PiperOrigin-RevId: 199474340
---
 tensorflow/python/data/ops/dataset_ops.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/data/ops/dataset_ops.py b/tensorflow/python/data/ops/dataset_ops.py
index ea5fc2099c..5f17444797 100644
--- a/tensorflow/python/data/ops/dataset_ops.py
+++ b/tensorflow/python/data/ops/dataset_ops.py
@@ -967,7 +967,8 @@ class Dataset(object):
         scalar `tf.bool` tensor.
 
     Returns:
-      Dataset: A `Dataset`.
+      Dataset: The `Dataset` containing the elements of this dataset for which
+          `predicate` is `True`.
     """
     return FilterDataset(self, predicate)
 
-- 
GitLab


From 30947aa455449215dc31c13e635bbd207795e18e Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 6 Jun 2018 09:26:39 -0700
Subject: [PATCH 367/610] Automated g4 rollback of changelist 199140117

PiperOrigin-RevId: 199476694
---
 tensorflow/contrib/distribute/python/BUILD | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tensorflow/contrib/distribute/python/BUILD b/tensorflow/contrib/distribute/python/BUILD
index a91c54153f..3118deaa47 100644
--- a/tensorflow/contrib/distribute/python/BUILD
+++ b/tensorflow/contrib/distribute/python/BUILD
@@ -311,7 +311,6 @@ cuda_py_test(
     tags = [
         "multi_and_single_gpu",
         "no_pip",
-        "noguitar",  # TODO(b/109653107): test is flaky.
     ],
 )
 
-- 
GitLab


From 18ef24b3023caed667a728c77b16c4e13e859ff2 Mon Sep 17 00:00:00 2001
From: Dan Moldovan <mdan@google.com>
Date: Wed, 6 Jun 2018 08:56:29 -0400
Subject: [PATCH 368/610] Update __init__.py

Whitelist the operators module in the main library.
---
 tensorflow/contrib/autograph/__init__.py | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/autograph/__init__.py b/tensorflow/contrib/autograph/__init__.py
index 3386c4eca4..c86f7e4ede 100644
--- a/tensorflow/contrib/autograph/__init__.py
+++ b/tensorflow/contrib/autograph/__init__.py
@@ -23,6 +23,7 @@ from __future__ import print_function
 
 # TODO(mdan): Bring only the relevant symbols to the top level.
 from tensorflow.contrib.autograph import utils
+from tensorflow.contrib.autograph import operators
 from tensorflow.contrib.autograph.impl.api import convert
 from tensorflow.contrib.autograph.impl.api import converted_call
 from tensorflow.contrib.autograph.impl.api import do_not_convert
@@ -33,8 +34,20 @@ from tensorflow.contrib.autograph.pyct.transformer import AutographParseError
 from tensorflow.python.util.all_util import remove_undocumented
 
 _allowed_symbols = [
-    'utils', 'convert', 'converted_call', 'do_not_convert', 'RunMode',
-    'to_code', 'to_graph', 'AutographParseError'
+    # Main API
+    'RunMode',
+    'convert',
+    'converted_call',
+    'do_not_convert',
+    'to_code',
+    'to_graph',
+    # Special functions and overloaded operators
+    'operators',
+    'stack',
+    # Exceptions
+    'AutographParseError',
+    # Utilities: to be removed
+    'utils',
 ]
 
 remove_undocumented(__name__, _allowed_symbols)
-- 
GitLab


From a0527f3dd69fe5373db88914eb18cfab5ee3fceb Mon Sep 17 00:00:00 2001
From: Amit Patankar <amitpatankar@google.com>
Date: Wed, 6 Jun 2018 09:51:06 -0700
Subject: [PATCH 369/610] iteritems() was removed in Python 3; use items()
 instead.

---
 tensorflow/contrib/distribute/python/cross_tower_ops_test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/distribute/python/cross_tower_ops_test.py b/tensorflow/contrib/distribute/python/cross_tower_ops_test.py
index 2a26632608..b3bc0bac59 100644
--- a/tensorflow/contrib/distribute/python/cross_tower_ops_test.py
+++ b/tensorflow/contrib/distribute/python/cross_tower_ops_test.py
@@ -94,7 +94,7 @@ class CrossTowerOpsTest(test.TestCase, parameterized.TestCase):
       self.assertEqual(type(left), type(right))
       self.assertEqual(left.devices, right.devices)
       if isinstance(list(left._index.values())[0], ops.IndexedSlices):
-        for (d, v) in left._index.iteritems():
+        for (d, v) in left._index.items():
           self._assert_indexed_slices_equal(v, right._index[d])
       elif context.executing_eagerly():
         self.assertEqual([v.numpy() for v in left._index.values()],
-- 
GitLab


From da264cf94af437679ae55ab5d41a085a8e3351ef Mon Sep 17 00:00:00 2001
From: Yuefeng Zhou <yuefengz@google.com>
Date: Wed, 6 Jun 2018 09:58:23 -0700
Subject: [PATCH 370/610] Fix the bug where, in Python 3, the devices list in
 multi_worker_strategy becomes `dict_values`.

PiperOrigin-RevId: 199481384
---
 tensorflow/contrib/distribute/python/multi_worker_strategy.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/distribute/python/multi_worker_strategy.py b/tensorflow/contrib/distribute/python/multi_worker_strategy.py
index a552b370eb..0f21a42732 100644
--- a/tensorflow/contrib/distribute/python/multi_worker_strategy.py
+++ b/tensorflow/contrib/distribute/python/multi_worker_strategy.py
@@ -121,7 +121,7 @@ class MultiWorkerMirroredStrategy(MirroredStrategy):
           worker: [device_util.canonicalize(worker, '/device:CPU:0')]
           for worker in self._workers
       }
-    self._devices = nest.flatten(self._worker_device_map.values())
+    self._devices = nest.flatten(self._worker_device_map)
 
     super(MultiWorkerMirroredStrategy, self).__init__(
         devices=self._devices, prefetch_on_device=prefetch_on_device)
-- 
GitLab


From 2d72b113979ad18b6b9299122f2f856e45d8505b Mon Sep 17 00:00:00 2001
From: An Jiaoyang <516756148@qq.com>
Date: Thu, 7 Jun 2018 01:29:11 +0800
Subject: [PATCH 371/610] Update backprop.py (#19804)

Fixes this bug:
the gradient function returned by tfe.implicit_value_and_gradients() doesn't support keyword arguments.
---
 tensorflow/python/eager/backprop.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/eager/backprop.py b/tensorflow/python/eager/backprop.py
index b2e6c60021..bd97b181ff 100644
--- a/tensorflow/python/eager/backprop.py
+++ b/tensorflow/python/eager/backprop.py
@@ -196,11 +196,11 @@ def implicit_val_and_grad(f):
   # TODO(cais): Remove calls to tf.constant() once the gradients functions
   # accept lists and np.ndarrays.
 
-  def grad_fn(*args):
+  def grad_fn(*args, **kwds):
     """Computes the gradient of the wrapped function."""
     this_tape = tape.push_new_tape()
     try:
-      end_node = f(*args)
+      end_node = f(*args, **kwds)
       if end_node is None:
         raise ValueError("Cannot differentiate a function that returns None; "
                          "did you forget to return a value from {}?".format(
-- 
GitLab


From 93cb963ed957fa6f061b3aced65dd04791970cb8 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 6 Jun 2018 10:37:45 -0700
Subject: [PATCH 372/610] Fixes an error where a defun with no outputs crashes
 when called on inputs being taped.

PiperOrigin-RevId: 199488561
---
 tensorflow/python/eager/function.py      | 17 ++++++++++++++---
 tensorflow/python/eager/function_test.py | 17 +++++++++++++++++
 2 files changed, 31 insertions(+), 3 deletions(-)

diff --git a/tensorflow/python/eager/function.py b/tensorflow/python/eager/function.py
index 559063d6ae..03393bcd46 100644
--- a/tensorflow/python/eager/function.py
+++ b/tensorflow/python/eager/function.py
@@ -409,7 +409,15 @@ class GraphModeFunction(object):
         backward_outputs, in_gradients, output_shapes, attrs=self._attrs)
 
   def _backprop_call(self, args):
-    """Calls the wrapped function and records the result on a tape."""
+    """Calls the wrapped function and records the result on a tape.
+
+    (Only records results on a tape if the function has outputs)
+
+    Args:
+      args: The tensor inputs to the function.
+    Returns:
+      The call output.
+    """
     all_args = args + self._extra_inputs
     signature = self._forward_fdef.signature
     ctx = context.context()
@@ -420,6 +428,8 @@ class GraphModeFunction(object):
           inputs=all_args,
           attrs=None,
           ctx=ctx)
+      if not outputs:
+        return None
     else:
       g = ops.get_default_graph()
       g._add_function(self._forward_fdef)  # pylint: disable=protected-access
@@ -431,8 +441,9 @@ class GraphModeFunction(object):
           name="FunctionCall",
           compute_shapes=False)
       outputs = op.outputs
-      outputs = [outputs] if isinstance(
-          outputs, (ops.Tensor, type(None))) else list(outputs)
+      if not outputs:
+        return op
+      outputs = [outputs] if isinstance(outputs, ops.Tensor) else list(outputs)
       for i, s in enumerate(self._output_shapes):
         outputs[i].set_shape(s)
     real_outputs = outputs[:len(self._returns)]
diff --git a/tensorflow/python/eager/function_test.py b/tensorflow/python/eager/function_test.py
index f53d6c2608..cfdbe5f079 100644
--- a/tensorflow/python/eager/function_test.py
+++ b/tensorflow/python/eager/function_test.py
@@ -349,6 +349,23 @@ class FunctionTest(test.TestCase):
 
     g(constant_op.constant(1.0))
 
+  def testNestedDefunWithNoOutputAndTapedInput(self):
+    three = resource_variable_ops.ResourceVariable(3.0, name='v')
+
+    @function.defun
+    def f(x):
+      # This function intentionally takes a taped variable as input,
+      # but does not return any values
+      math_ops.add(x, three)
+
+    @function.defun
+    def g(x):
+      tape.watch_variable(x)
+      y = math_ops.add(x, three)
+      f(y)
+
+    g(three)
+
   def testGradientTensorConversionWithDefun(self):
     three = resource_variable_ops.ResourceVariable(3.0, name='v')
 
-- 
GitLab


From 26ebcd4093b01468f9945a70579559cadf1f7763 Mon Sep 17 00:00:00 2001
From: chengzhi chen <loongdna@gmail.com>
Date: Thu, 7 Jun 2018 01:42:28 +0800
Subject: [PATCH 373/610] TFLite: fix format mismatching warning. (#19796)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

format ‘%s’ expects a matching ‘char*’ argument.
---
 tensorflow/contrib/lite/examples/minimal/minimal.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/lite/examples/minimal/minimal.cc b/tensorflow/contrib/lite/examples/minimal/minimal.cc
index 106e3b0270..8b0ace96cc 100644
--- a/tensorflow/contrib/lite/examples/minimal/minimal.cc
+++ b/tensorflow/contrib/lite/examples/minimal/minimal.cc
@@ -38,7 +38,7 @@ using namespace tflite;
 
 int main(int argc, char *argv[]) {
   if(argc != 2) {
-    fprintf(stderr, "Usage: %s <model>\n");
+    fprintf(stderr, "minimal <tflite model>\n");
     return 1;
   }
   const char* filename = argv[1];
-- 
GitLab


From 5621de9f7f6a9e7e4e5a50fbe7246ed630854aaa Mon Sep 17 00:00:00 2001
From: Yuefeng Zhou <yuefengz@google.com>
Date: Wed, 6 Jun 2018 10:45:28 -0700
Subject: [PATCH 374/610] Add distributed all-reduce for multi-worker mirrored
 strategy.

PiperOrigin-RevId: 199489792
---
 tensorflow/contrib/distribute/python/BUILD    |   4 +
 .../contrib/distribute/python/combinations.py |  29 +++
 .../distribute/python/cross_tower_ops.py      | 221 +++++++++++++++---
 .../distribute/python/cross_tower_ops_test.py | 156 ++++++++-----
 .../distribute/python/cross_tower_utils.py    | 145 +++++++++++-
 5 files changed, 465 insertions(+), 90 deletions(-)

diff --git a/tensorflow/contrib/distribute/python/BUILD b/tensorflow/contrib/distribute/python/BUILD
index 3118deaa47..1f43a6eed5 100644
--- a/tensorflow/contrib/distribute/python/BUILD
+++ b/tensorflow/contrib/distribute/python/BUILD
@@ -148,6 +148,7 @@ py_library(
     ],
     deps = [
         ":mirrored_strategy",
+        ":multi_worker_strategy",
         ":one_device_strategy",
         ":tpu_strategy",
         "//tensorflow/contrib/optimizer_v2:training",
@@ -446,8 +447,10 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":values",
+        "//tensorflow/contrib/all_reduce:all_reduce_py",
         "//tensorflow/contrib/nccl:nccl_py",
         "//tensorflow/python:array_ops",
+        "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:math_ops",
     ],
@@ -495,6 +498,7 @@ cuda_py_test(
     additional_deps = [
         ":combinations",
         ":cross_tower_ops",
+        ":multi_worker_test_base",
         ":values",
         "@absl_py//absl/testing:parameterized",
         "//tensorflow/python:array_ops",
diff --git a/tensorflow/contrib/distribute/python/combinations.py b/tensorflow/contrib/distribute/python/combinations.py
index 98e7228f24..ba03b14deb 100644
--- a/tensorflow/contrib/distribute/python/combinations.py
+++ b/tensorflow/contrib/distribute/python/combinations.py
@@ -47,6 +47,7 @@ from absl.testing import parameterized
 import six
 
 from tensorflow.contrib.distribute.python import mirrored_strategy as mirrored_lib
+from tensorflow.contrib.distribute.python import multi_worker_strategy
 from tensorflow.contrib.distribute.python import one_device_strategy as one_device_lib
 from tensorflow.contrib.distribute.python import tpu_strategy as tpu_lib
 from tensorflow.contrib.optimizer_v2 import adam as adam_v2
@@ -338,6 +339,34 @@ mirrored_strategy_with_two_gpus = NamedDistribution(
         ["/gpu:0", "/gpu:1"], prefetch_on_device=False),
     required_gpus=2)
 
+multi_worker_strategy_with_cpu = NamedDistribution(
+    "MultiWorkerCPU",
+    lambda: multi_worker_strategy.MultiWorkerMirroredStrategy(
+        cluster={
+            "worker": [
+                "/job:worker/replica:0/task:0", "/job:worker/replica:0/task:1"
+            ]
+        },
+        num_gpus_per_worker=0), 0)
+multi_worker_strategy_with_one_gpu = NamedDistribution(
+    "MultiWorker1GPU",
+    lambda: multi_worker_strategy.MultiWorkerMirroredStrategy(
+        cluster={
+            "worker": [
+                "/job:worker/replica:0/task:0", "/job:worker/replica:0/task:1"
+            ]
+        },
+        num_gpus_per_worker=1), 1)
+multi_worker_strategy_with_two_gpus = NamedDistribution(
+    "MultiWorker2GPUs",
+    lambda: multi_worker_strategy.MultiWorkerMirroredStrategy(
+        cluster={
+            "worker": [
+                "/job:worker/replica:0/task:0", "/job:worker/replica:0/task:1"
+            ]
+        },
+        num_gpus_per_worker=2), 2)
+
 adam_optimizer_v1_fn = NamedObject(
     "AdamV1", lambda: adam.AdamOptimizer(0.2, epsilon=1))
 gradient_descent_optimizer_v1_fn = NamedObject(
diff --git a/tensorflow/contrib/distribute/python/cross_tower_ops.py b/tensorflow/contrib/distribute/python/cross_tower_ops.py
index a411b880e8..f8ae8b9712 100644
--- a/tensorflow/contrib/distribute/python/cross_tower_ops.py
+++ b/tensorflow/contrib/distribute/python/cross_tower_ops.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import collections
 import six
 
 from tensorflow.contrib.distribute.python import cross_tower_utils
@@ -234,7 +235,13 @@ class ReductionToOneDeviceCrossTowerOps(CrossTowerOps):
 def _group_value_by_device(per_device_values):
   """Group values into sublists by their devices.
 
-  This grouping is needed to call the all-reduce library.
+  This grouping is needed to call the all-reduce library because it expects a
+  list of the following form:
+    [(grad0_gpu0, v0_gpu0), (grad1_gpu0, v1_gpu0), (grad2_gpu0, v2_gpu0) ...
+     (grad0_gpu1, v0_gpu1), (grad1_gpu1, v1_gpu1), (grad2_gpu1, v2_gpu1) ...
+     (grad0_gpu2, v0_gpu2), (grad1_gpu2, v1_gpu2), (grad2_gpu2, v2_gpu2) ...
+     ...
+    ]
 
   Args:
     per_device_values: a list of PerDevice obejcts.
@@ -322,7 +329,17 @@ class ConcatAndSplitPacker(object):
         # TODO(zhengxq): it is also possible to optimize away all the concat
         # as well.
         num_splits = self.num_packs
-        total_grad_size = array_ops.size(concat_grads)
+
+        # The array_ops.size function will sometimes remove static shapes. So if
+        # all gradient shapes are defined, we use another method to get the
+        # total size.
+        # TODO(yuefengz): move this logic to array_ops.size.
+        if all([g.shape.is_fully_defined() for g, _ in tower_grads_and_vars]):
+          total_grad_size = sum(
+              [g.shape.num_elements() for g, _ in tower_grads_and_vars])
+        else:
+          total_grad_size = array_ops.size(concat_grads)
+
         split_size = total_grad_size // num_splits
         split_size_last = total_grad_size - split_size * (num_splits - 1)
         split_sizes = [split_size] * (num_splits - 1) + [split_size_last]
@@ -412,6 +429,31 @@ class AggregateSmallTensorPacker(object):
                                                   self.packing)
 
 
+def _pack_tensors(device_grads,
+                  num_packs=0,
+                  agg_small_grads_max_bytes=0,
+                  agg_small_grads_max_group=0):
+  """Pack tensors if specified."""
+  if num_packs > 0:
+    tensor_packer = ConcatAndSplitPacker(num_packs)
+    device_grad_packs = tensor_packer.pack(device_grads)
+  elif agg_small_grads_max_bytes > 0 and agg_small_grads_max_group > 0:
+    tensor_packer = AggregateSmallTensorPacker(agg_small_grads_max_bytes,
+                                               agg_small_grads_max_group)
+    device_grad_packs = tensor_packer.pack(device_grads)
+  else:
+    tensor_packer = None
+    device_grad_packs = device_grads
+  return device_grad_packs, tensor_packer
+
+
+def _unpack_tensors(reduced, tensor_packer=None):
+  """Unpack tensors if they are packed before all-reduce."""
+  if tensor_packer:
+    return tensor_packer.unpack(reduced)
+  return reduced
+
+
 class AllReduceCrossTowerOps(CrossTowerOps):
   """Reduction using all reduce."""
 
@@ -440,10 +482,10 @@ class AllReduceCrossTowerOps(CrossTowerOps):
       agg_small_grads_max_group: see above.
         tensors.
     """
-    self.all_reduce_alg = all_reduce_alg
-    self.num_packs = num_packs
-    self.agg_small_grads_max_bytes = agg_small_grads_max_bytes
-    self.agg_small_grads_max_group = agg_small_grads_max_group
+    self._all_reduce_alg = all_reduce_alg
+    self._num_packs = num_packs
+    self._agg_small_grads_max_bytes = agg_small_grads_max_bytes
+    self._agg_small_grads_max_group = agg_small_grads_max_group
     super(AllReduceCrossTowerOps, self).__init__()
 
   def _reduce(self, method_string, per_device_value, destinations):
@@ -485,37 +527,24 @@ class AllReduceCrossTowerOps(CrossTowerOps):
 
   def _batch_all_reduce(self, method_string, per_device_values):
     """All reduce algorithm in a batch."""
+    logging.info(
+        "batch_all_reduce invoked for batches size = %d with "
+        "algorithm = %s, num_packs = %d, agg_small_grads_max_bytes = %d and "
+        "agg_small_grads_max_group = %d", len(per_device_values),
+        self._all_reduce_alg, self._num_packs, self._agg_small_grads_max_bytes,
+        self._agg_small_grads_max_group)
     destinations = per_device_values[0].devices
     grouped = _group_value_by_device(per_device_values)
-    if self.num_packs > 0:
-      logging.info(
-          "batch_all_reduce invoked for batches size = %d with "
-          "algorithm = %s and num_packs = %d", len(per_device_values),
-          self.all_reduce_alg, self.num_packs)
-      tensor_packer = ConcatAndSplitPacker(self.num_packs)
-      device_grad_packs = tensor_packer.pack(grouped)
-    elif (self.agg_small_grads_max_bytes > 0 and
-          self.agg_small_grads_max_group > 0):
-      logging.info(
-          "batch_all_reduce invoked for batches size = %d with "
-          "algorithm = %s, agg_small_grads_max_bytes = %d and "
-          "agg_small_grads_max_group = %d", len(per_device_values),
-          self.all_reduce_alg, self.agg_small_grads_max_bytes,
-          self.agg_small_grads_max_group)
-      tensor_packer = AggregateSmallTensorPacker(
-          self.agg_small_grads_max_bytes, self.agg_small_grads_max_group)
-      device_grad_packs = tensor_packer.pack(grouped)
-    else:
-      logging.info(
-          "batch_all_reduce invoked for batches size = %d with algorithm = %s",
-          len(per_device_values), self.all_reduce_alg)
-      tensor_packer = None
-      device_grad_packs = grouped
+
+    device_grad_packs, self._tensor_packer = _pack_tensors(
+        grouped, self._num_packs, self._agg_small_grads_max_bytes,
+        self._agg_small_grads_max_group)
 
     # The actual aggregation of the repacked gradients. Note that they are
     # sharded among different aggregation trees. So it is important to strike
     # the balance on num_splits.
-    if self.all_reduce_alg == "nccl":
+    if self._all_reduce_alg == "nccl":
+      # TODO(yuefengz): merge this into the all-reduce library.
       reduced = cross_tower_utils.aggregate_gradients_using_nccl(
           device_grad_packs)
     else:
@@ -525,13 +554,137 @@ class AllReduceCrossTowerOps(CrossTowerOps):
           cross_tower_utils.aggregate_gradients_using_hierarchical_copy(
               destinations, device_grad_packs))
 
-    if tensor_packer:
-      reduced = tensor_packer.unpack(reduced)
-
+    reduced = _unpack_tensors(reduced, self._tensor_packer)
     return _ungroup_and_make_mirrored(reduced, per_device_values[0].devices,
                                       method_string)
 
 
+AllReduceSpecTuple = collections.namedtuple("AllReduceSpecTuple",
+                                            "alg shards limit")
+
+
+class MultiWorkerAllReduce(AllReduceCrossTowerOps):
+  """All-reduce algorithms for distributed TensorFlow."""
+
+  def __init__(self,
+               worker_devices,
+               num_gpus_per_worker,
+               all_reduce_spec=("pscpu/pscpu", 2, -1),
+               num_packs=0,
+               agg_small_grads_max_bytes=0,
+               agg_small_grads_max_group=10):
+    """Initialize the all-reduce algorithm.
+
+    Args:
+      worker_devices: a list of device strings for workers participating in
+        all-reduce.
+      num_gpus_per_worker: number of GPU devices per worker.
+      all_reduce_spec: a tuple or a named tuple or a list of tuples specifying
+        the all-reduce algorithm.
+        1. The first element of a tuple is the name of the all-reduce algorithm.
+        Valid algorithm names are: "nccl", "nccl/xring", "nccl/rechd",
+        "nccl/pscpu", "xring", "pscpu", "psgpu", "pscpu/pscpu". Algorithms with
+        a "/" are hierarchical, so two all-reduces are executed, the first one
+        aggregates tensors within a worker and the second aggregates across
+        workers.
+        2. The second element of a tuple is the number of shards when doing
+        all-reduce. If its value is M, each tensor after packing will be
+        split into M shards, M parallel all-reduces will be performed, and
+        the results will finally be concatenated back into a complete tensor.
+        3. The third element is the maximum size of tensors to which the
+        algorithm specified by the first element applies. For
+        example, if all_reduce_spec=[("nccl", 2, 1024), ("pscpu/pscpu", 2, -1)],
+        tensors with at most 1024 elements are reduced with a 2-shard
+        "nccl" all-reduce and all other tensors with a 2-shard
+        "pscpu/pscpu" all-reduce. The third elements should be in increasing
+        order across tuples and end with -1, which indicates infinity.
+      num_packs: see AllReduceCrossTowerOps.
+      agg_small_grads_max_bytes: see AllReduceCrossTowerOps.
+      agg_small_grads_max_group: see AllReduceCrossTowerOps.
+    """
+    self._worker_devices = worker_devices
+    self._num_gpus_per_worker = num_gpus_per_worker
+    super(MultiWorkerAllReduce, self).__init__(
+        num_packs=num_packs,
+        agg_small_grads_max_bytes=agg_small_grads_max_bytes,
+        agg_small_grads_max_group=agg_small_grads_max_group)
+
+    def validate_and_complete_spec(spec):
+      """Validate and complete the all-reduce spec."""
+      # TODO(yuefengz): support namedtuple.
+      if not isinstance(spec, tuple):
+        raise ValueError(
+            "A tuple is expected for all-reduce spec: %r" % all_reduce_spec)
+      if not spec or len(spec) > 3:
+        raise ValueError(
+            "Too many elements in the all-reduce spec tuple: %r" % spec)
+      if len(spec) == 1:
+        return AllReduceSpecTuple(spec[0], 1, -1)
+      elif len(spec) == 2:
+        return AllReduceSpecTuple(spec[0], spec[1], -1)
+      else:
+        return AllReduceSpecTuple(*spec)
+
+    self._all_reduce_spec = []
+    if isinstance(all_reduce_spec, six.string_types):
+      self._all_reduce_spec.append(AllReduceSpecTuple(all_reduce_spec, 1, -1))
+    elif isinstance(all_reduce_spec, tuple):
+      self._all_reduce_spec.append(validate_and_complete_spec(all_reduce_spec))
+    elif isinstance(all_reduce_spec, list):
+      self._all_reduce_spec = [
+          validate_and_complete_spec(spec) for spec in all_reduce_spec
+      ]
+
+  def _batch_all_reduce(self, method_string, per_device_values):
+    """All reduce algorithm in a batch."""
+    logging.info(
+        "distributed batch_all_reduce invoked for batches size = %d with "
+        "allreduce_spec = %r, num_packs = %d, agg_small_grads_max_bytes = %d "
+        "and agg_small_grads_max_group = %d", len(per_device_values),
+        self._all_reduce_spec, self._num_packs, self._agg_small_grads_max_bytes,
+        self._agg_small_grads_max_group)
+
+    destinations = sorted(per_device_values[0].devices)
+    device_grads = _group_value_by_device(per_device_values)
+
+    # The all reduce library requires fully defined shapes.
+    # TODO(yuefengz): when tensor sharding is not needed, static shapes are not
+    # required either.
+    for device_grad in device_grads:
+      for grad, _ in device_grad:
+        if not grad.shape.is_fully_defined():
+          raise ValueError("Shape is unknown for node %r" % grad)
+
+    remaining_grads = device_grads
+    aggregated_grads = []
+    for spec_tuple in self._all_reduce_spec:
+      if spec_tuple.limit < 0:
+        this_grads = remaining_grads
+        remaining_grads = []
+      else:
+        (this_grads, remaining_grads) = cross_tower_utils.split_grads_by_size(
+            spec_tuple.limit, remaining_grads)
+      if this_grads:
+        device_grad_packs, self._tensor_packer = _pack_tensors(
+            this_grads, self._num_packs, self._agg_small_grads_max_bytes,
+            self._agg_small_grads_max_group)
+        range_agg_grads = cross_tower_utils.sum_gradients_all_reduce(
+            self._worker_devices, device_grad_packs, len(self._worker_devices),
+            spec_tuple.alg, spec_tuple.shards, range(self._num_gpus_per_worker))
+        range_agg_grads = _unpack_tensors(range_agg_grads, self._tensor_packer)
+
+        if not aggregated_grads:
+          aggregated_grads = range_agg_grads
+        else:
+          assert len(aggregated_grads) == len(range_agg_grads)
+          for i in range(len(aggregated_grads)):
+            aggregated_grads[i] += range_agg_grads[i]
+    assert not remaining_grads
+
+    return _ungroup_and_make_mirrored(aggregated_grads, destinations,
+                                      method_string)
+
+
 _dgx1_links = [[1, 2, 3, 4], [0, 2, 3, 5], [0, 1, 3, 6], [0, 1, 2, 7],
                [0, 5, 6, 7], [1, 4, 6, 7], [2, 4, 5, 7], [3, 4, 5, 6]]
 
diff --git a/tensorflow/contrib/distribute/python/cross_tower_ops_test.py b/tensorflow/contrib/distribute/python/cross_tower_ops_test.py
index 2a26632608..fed5505d92 100644
--- a/tensorflow/contrib/distribute/python/cross_tower_ops_test.py
+++ b/tensorflow/contrib/distribute/python/cross_tower_ops_test.py
@@ -24,6 +24,7 @@ from absl.testing import parameterized
 
 from tensorflow.contrib.distribute.python import combinations
 from tensorflow.contrib.distribute.python import cross_tower_ops as cross_tower_ops_lib
+from tensorflow.contrib.distribute.python import multi_worker_test_base
 from tensorflow.contrib.distribute.python import values as value_lib
 from tensorflow.python.eager import context
 from tensorflow.python.eager import test
@@ -75,7 +76,7 @@ def _make_mirrored_indexed_slices(devices, values, indices, dense_shape):
 _cpu_device = "/device:CPU:0"
 
 
-class CrossTowerOpsTest(test.TestCase, parameterized.TestCase):
+class CrossTowerOpsTestBase(test.TestCase, parameterized.TestCase):
 
   def _assert_indexed_slices_equal(self, left, right):
     self.assertIsInstance(left, ops.IndexedSlices)
@@ -94,7 +95,7 @@ class CrossTowerOpsTest(test.TestCase, parameterized.TestCase):
       self.assertEqual(type(left), type(right))
       self.assertEqual(left.devices, right.devices)
       if isinstance(list(left._index.values())[0], ops.IndexedSlices):
-        for (d, v) in left._index.iteritems():
+        for (d, v) in left._index.items():
           self._assert_indexed_slices_equal(v, right._index[d])
       elif context.executing_eagerly():
         self.assertEqual([v.numpy() for v in left._index.values()],
@@ -104,51 +105,7 @@ class CrossTowerOpsTest(test.TestCase, parameterized.TestCase):
           self.assertEqual(
               sess.run(list(left._index.values())), list(right._index.values()))
 
-  # TODO(yuefengz): decouple the num_gpus check from distribution in
-  # combinations module so that we can pass in devices instead of a distribution
-  # strategy.
-  reduction_to_one_combinations = combinations.combine(
-      cross_tower_ops=[
-          combinations.NamedObject(
-              "DefaultReductionToOneDeviceCrossTowerOps",
-              cross_tower_ops_lib.ReductionToOneDeviceCrossTowerOps()),
-          combinations.NamedObject(
-              "ReductionToCPUDeviceCrossTowerOps",
-              cross_tower_ops_lib.ReductionToOneDeviceCrossTowerOps(
-                  reduce_to_device=_cpu_device)),
-          combinations.NamedObject(
-              "AccumulateNCrossTowerOp",
-              cross_tower_ops_lib.ReductionToOneDeviceCrossTowerOps(
-                  accumulation_fn=math_ops.accumulate_n)),
-      ],
-      distribution=[
-          combinations.one_device_strategy,
-          combinations.mirrored_strategy_with_gpu_and_cpu,
-          combinations.mirrored_strategy_with_two_gpus
-      ],
-      mode=["graph", "eager"])
-  allreduce_combinations = combinations.combine(
-      cross_tower_ops=[
-          combinations.NamedObject(
-              "AllReduce",
-              cross_tower_ops_lib.AllReduceCrossTowerOps("nccl", 1, 0, 0)),
-          combinations.NamedObject(
-              "HierarchicalCopy",
-              cross_tower_ops_lib.AllReduceCrossTowerOps(
-                  "hierarchical_copy", 8, 0, 0)),
-          combinations.NamedObject(
-              "AllReduceNoGradientRepacking",
-              cross_tower_ops_lib.AllReduceCrossTowerOps("nccl", 0, 0, 0)),
-          combinations.NamedObject(
-              "HierarchicalCopyAggregateSmallTensors",
-              cross_tower_ops_lib.AllReduceCrossTowerOps(
-                  "hierarchical_copy", 0, 100, 10))
-      ],
-      distribution=[combinations.mirrored_strategy_with_two_gpus],
-      mode=["graph", "eager"])
-
-  @combinations.generate(reduction_to_one_combinations + allreduce_combinations)
-  def testReductionAndBroadcast(self, cross_tower_ops, distribution):
+  def _testReductionAndBroadcast(self, cross_tower_ops, distribution):
     devices = distribution.worker_devices
 
     values = [constant_op.constant(float(d)) for d in range(len(devices))]
@@ -208,20 +165,70 @@ class CrossTowerOpsTest(test.TestCase, parameterized.TestCase):
             cross_tower_ops.broadcast(constant_op.constant(1.), destinations),
             _fake_mirrored(1., destinations))
 
+
+class SingleWorkerCrossTowerOpsTest(CrossTowerOpsTestBase):
+  # TODO(yuefengz): decouple the num_gpus check from distribution in
+  # combinations module so that we can pass in devices instead of a distribution
+  # strategy.
+  reduction_to_one_combinations = combinations.combine(
+      cross_tower_ops=[
+          combinations.NamedObject(
+              "DefaultReductionToOneDeviceCrossTowerOps",
+              cross_tower_ops_lib.ReductionToOneDeviceCrossTowerOps()),
+          combinations.NamedObject(
+              "ReductionToCPUDeviceCrossTowerOps",
+              cross_tower_ops_lib.ReductionToOneDeviceCrossTowerOps(
+                  reduce_to_device=_cpu_device)),
+          combinations.NamedObject(
+              "AccumulateNCrossTowerOp",
+              cross_tower_ops_lib.ReductionToOneDeviceCrossTowerOps(
+                  accumulation_fn=math_ops.accumulate_n)),
+      ],
+      distribution=[
+          combinations.one_device_strategy,
+          combinations.mirrored_strategy_with_gpu_and_cpu,
+          combinations.mirrored_strategy_with_two_gpus
+      ],
+      mode=["graph", "eager"])
+  allreduce_combinations = combinations.combine(
+      cross_tower_ops=[
+          combinations.NamedObject(
+              "AllReduce",
+              cross_tower_ops_lib.AllReduceCrossTowerOps("nccl", 1, 0, 0)),
+          combinations.NamedObject(
+              "HierarchicalCopy",
+              cross_tower_ops_lib.AllReduceCrossTowerOps(
+                  "hierarchical_copy", 8, 0, 0)),
+          combinations.NamedObject(
+              "AllReduceNoGradientRepacking",
+              cross_tower_ops_lib.AllReduceCrossTowerOps("nccl", 0, 0, 0)),
+          combinations.NamedObject(
+              "HierarchicalCopyAggregateSmallTensors",
+              cross_tower_ops_lib.AllReduceCrossTowerOps(
+                  "hierarchical_copy", 0, 100, 10))
+      ],
+      distribution=[combinations.mirrored_strategy_with_two_gpus],
+      mode=["graph", "eager"])
+
+  @combinations.generate(reduction_to_one_combinations + allreduce_combinations)
+  def testReductionAndBroadcast(self, cross_tower_ops, distribution):
+    with distribution.scope():
+      self._testReductionAndBroadcast(cross_tower_ops, distribution)
+
   def testChooseAlgorithm(self):
     device_links = [[1, 2, 3, 4], [0, 2, 3, 5], [0, 1, 3, 6], [0, 1, 2, 7],
                     [0, 5, 6, 7], [1, 4, 6, 7], [2, 4, 5, 7], [3, 4, 5, 6]]
     result = cross_tower_ops_lib._choose_all_reduce_algorithm(device_links)
     self.assertIsInstance(result, cross_tower_ops_lib.AllReduceCrossTowerOps)
-    self.assertEqual(result.all_reduce_alg, "hierarchical_copy")
-    self.assertEqual(result.num_packs, 8)
+    self.assertEqual(result._all_reduce_alg, "hierarchical_copy")
+    self.assertEqual(result._num_packs, 8)
 
     # if there are only 4 devices
     device_links = [[1, 2, 3, 4], [0, 2, 3, 5], [0, 1, 3, 6], [0, 1, 2, 7]]
     result = cross_tower_ops_lib._choose_all_reduce_algorithm(device_links)
     self.assertIsInstance(result, cross_tower_ops_lib.AllReduceCrossTowerOps)
-    self.assertEqual(result.all_reduce_alg, "nccl")
-    self.assertEqual(result.num_packs, 1)
+    self.assertEqual(result._all_reduce_alg, "nccl")
+    self.assertEqual(result._num_packs, 1)
 
     # if devices links contain each device itself
     device_links = [[0, 1, 2, 3, 4], [0, 1, 2, 3, 5], [0, 1, 2, 3, 6],
@@ -229,16 +236,16 @@ class CrossTowerOpsTest(test.TestCase, parameterized.TestCase):
                     [2, 4, 5, 6, 7], [3, 4, 5, 6, 7]]
     result = cross_tower_ops_lib._choose_all_reduce_algorithm(device_links)
     self.assertIsInstance(result, cross_tower_ops_lib.AllReduceCrossTowerOps)
-    self.assertEqual(result.all_reduce_alg, "hierarchical_copy")
-    self.assertEqual(result.num_packs, 8)
+    self.assertEqual(result._all_reduce_alg, "hierarchical_copy")
+    self.assertEqual(result._num_packs, 8)
 
     # if not dgx1-like links
     device_links = [[0, 2, 3, 5], [0, 1, 3, 6], [0, 1, 2, 7], [0, 5, 6, 7],
                     [1, 4, 6, 7], [2, 4, 5, 7], [3, 4, 5, 6], [1, 2, 3, 4]]
     result = cross_tower_ops_lib._choose_all_reduce_algorithm(device_links)
     self.assertIsInstance(result, cross_tower_ops_lib.AllReduceCrossTowerOps)
-    self.assertEqual(result.all_reduce_alg, "nccl")
-    self.assertEqual(result.num_packs, 1)
+    self.assertEqual(result._all_reduce_alg, "nccl")
+    self.assertEqual(result._num_packs, 1)
 
   @combinations.generate(combinations.combine(
       mode=["graph", "eager"],
@@ -316,5 +323,44 @@ class CrossTowerOpsTest(test.TestCase, parameterized.TestCase):
     self._assert_values_equal(total_mirrored_without_dups, result)
 
 
+class MultiWorkerCrossTowerOpsTest(multi_worker_test_base.MultiWorkerTestBase,
+                                   CrossTowerOpsTestBase):
+
+  worker_devices = [
+      "/job:worker/replica:0/task:0", "/job:worker/replica:0/task:1"
+  ]
+  multi_worker_allreduce_combinations = combinations.combine(
+      cross_tower_ops=[
+          combinations.NamedObject(
+              "MultiWorkerAllReduce",
+              cross_tower_ops_lib.MultiWorkerAllReduce(
+                  worker_devices, 2, ("pscpu/pscpu", 2, -1), 0, 0, 0)),
+          combinations.NamedObject(
+              "MultiWorkerAllReducePack",
+              cross_tower_ops_lib.MultiWorkerAllReduce(
+                  worker_devices, 2, ("pscpu/pscpu", 2, -1), 1, 0, 0)),
+          combinations.NamedObject(
+              "MultiWorkerAllReduceAggregation",
+              cross_tower_ops_lib.MultiWorkerAllReduce(
+                  worker_devices, 2, ("pscpu/pscpu", 2, -1), 0, 100, 10)),
+          combinations.NamedObject(
+              "MultiWorkerAllReduceMultipleSpecs",
+              cross_tower_ops_lib.MultiWorkerAllReduce(
+                  worker_devices, 2, [("pscpu/pscpu", 2, 100),
+                                      ("xring", 2, -1)], 0, 0, 0)),
+      ],
+      distribution=[
+          combinations.multi_worker_strategy_with_cpu,
+          combinations.multi_worker_strategy_with_one_gpu,
+          combinations.multi_worker_strategy_with_two_gpus
+      ],
+      mode=["graph"])
+
+  @combinations.generate(multi_worker_allreduce_combinations)
+  def testReductionAndBroadcast(self, cross_tower_ops, distribution):
+    with distribution.scope():
+      self._testReductionAndBroadcast(cross_tower_ops, distribution)
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/distribute/python/cross_tower_utils.py b/tensorflow/contrib/distribute/python/cross_tower_utils.py
index 137fabf4c7..2bb088e704 100644
--- a/tensorflow/contrib/distribute/python/cross_tower_utils.py
+++ b/tensorflow/contrib/distribute/python/cross_tower_utils.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import collections as pycoll
 
 from tensorflow.contrib import nccl
+from tensorflow.contrib.all_reduce.python import all_reduce
 from tensorflow.contrib.distribute.python import values as value_lib
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -158,6 +159,148 @@ def aggregate_single_gradient_using_copy(grad_and_vars, use_mean,
     return (grad, v), None
 
 
+def group_device_names(devices, group_size):
+  """Group device names into groups of group_size.
+
+  Args:
+    devices: a list of canonical device strings.
+    group_size: integer which is equal to or greater than 1.
+
+  Returns:
+    list of lists of devices, where each inner list is group_size long,
+      and each device appears at least once in an inner list.  If
+      len(devices) % group_size == 0 then each device will appear exactly once.
+
+  Raises:
+    ValueError: if group_size > len(devices)
+  """
+  num_devices = len(devices)
+  if group_size > num_devices:
+    raise ValueError(
+        'only %d devices, but group_size=%d' % (num_devices, group_size))
+  num_groups = (
+      num_devices // group_size + (1 if (num_devices % group_size != 0) else 0))
+  groups = [[] for i in range(num_groups)]
+  for i in range(num_groups * group_size):
+    groups[i % num_groups].append(devices[i % num_devices])
+  return groups
+
+
+def split_grads_by_size(threshold_size, device_grads):
+  """Break gradients into two sets according to tensor size.
+
+  Args:
+    threshold_size: int size cutoff for small vs large tensor.
+    device_grads: List of lists of (gradient, variable) tuples.  The outer
+        list is over devices. The inner list is over individual gradients.
+
+  Returns:
+    small_grads: Subset of device_grads whose tensors have <= threshold_size
+       elements.
+    large_grads: Subset of device_grads whose tensors have > threshold_size
+       elements.
+  """
+  small_grads = []
+  large_grads = []
+  for dl in device_grads:
+    small_dl = []
+    large_dl = []
+    for (g, v) in dl:
+      tensor_size = g.get_shape().num_elements()
+      if tensor_size <= threshold_size:
+        small_dl.append([g, v])
+      else:
+        large_dl.append([g, v])
+    if small_dl:
+      small_grads.append(small_dl)
+    if large_dl:
+      large_grads.append(large_dl)
+  return small_grads, large_grads
+
+
+def sum_grad_and_var_all_reduce(grad_and_vars,
+                                num_workers,
+                                alg,
+                                gpu_indices,
+                                aux_devices=None,
+                                num_shards=1):
+  """Apply all-reduce algorithm over specified gradient tensors."""
+  with ops.name_scope('allreduce'):
+    # Note that each grad_and_vars looks like the following:
+    #   ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
+    scaled_grads = [g for g, _ in grad_and_vars]
+    if alg == 'nccl':
+      summed_grads = nccl.all_sum(scaled_grads)
+    elif alg == 'xring':
+      summed_grads = all_reduce.build_ring_all_reduce(
+          scaled_grads, num_workers, num_shards, gpu_indices, math_ops.add)
+    elif alg == 'nccl/xring':
+      summed_grads = all_reduce.build_nccl_then_ring(scaled_grads, num_shards,
+                                                     math_ops.add)
+    elif alg == 'nccl/rechd':
+      summed_grads = all_reduce.build_nccl_then_recursive_hd(
+          scaled_grads, math_ops.add)
+    elif alg == 'nccl/pscpu':
+      summed_grads = all_reduce.build_nccl_then_shuffle(
+          scaled_grads, aux_devices, math_ops.add, math_ops.add_n)
+    elif alg == 'pscpu/pscpu':
+      second_gather_devices = aux_devices[:num_shards]
+      summed_grads = all_reduce.build_shuffle_then_shuffle(
+          scaled_grads, aux_devices, second_gather_devices, math_ops.add_n)
+    elif alg in ['pscpu', 'psgpu']:
+      summed_grads = all_reduce.build_shuffle_all_reduce(
+          scaled_grads, aux_devices, math_ops.add_n)
+    else:
+      raise ValueError('unsupported all_reduce alg: ', alg)
+
+    result = []
+    for (_, v), g in zip(grad_and_vars, summed_grads):
+      result.append([g, v])
+    return result
+
+
+def sum_gradients_all_reduce(dev_prefixes, tower_grads, num_workers, alg,
+                             num_shards, gpu_indices):
+  """Apply all-reduce algorithm over specified gradient tensors.
+
+  Args:
+    dev_prefixes: list of prefix strings to use to generate PS device names.
+    tower_grads: the gradients to reduce.
+    num_workers: number of worker processes across entire job.
+    alg: the all-reduce algorithm to apply.
+    num_shards: alg-specific sharding factor.
+    gpu_indices: indices of local GPUs in order usable for ring-reduce.
+
+  Returns:
+    list of lists of reduced [gradient, variable] pairs, one per tower.
+  """
+  alg_contains_shuffle = any([n in alg for n in ['pscpu', 'psgpu']])
+  is_hierarchical = '/' in alg
+  if 'pscpu' in alg:
+    aux_devices = [prefix + '/cpu:0' for prefix in dev_prefixes]
+  elif 'psgpu' in alg:
+    aux_devices = [
+        prefix + '/gpu:%d' % i
+        for i in range(len(gpu_indices))
+        for prefix in dev_prefixes
+    ]
+  else:
+    aux_devices = ['/job:localhost/cpu:0']
+  # Auxiliary devices for hierarchical all-reduces.
+  aux_device_groups = group_device_names(
+      aux_devices, num_shards if alg_contains_shuffle else 1)
+  group_index = 0
+  reduced_gv_list = []
+  for grad_and_vars in zip(*tower_grads):
+    reduced_gv_list.append(
+        sum_grad_and_var_all_reduce(
+            grad_and_vars, num_workers, alg, gpu_indices, aux_devices
+            if is_hierarchical else aux_device_groups[group_index], num_shards))
+    group_index = (group_index + 1) % len(aux_device_groups)
+  new_tower_grads = [list(x) for x in zip(*reduced_gv_list)]
+  return new_tower_grads
+
+
 def extract_ranges(index_list, range_size_limit=32):
   """Extract consecutive ranges and singles from index_list.
 
@@ -330,7 +473,7 @@ def unpack_small_tensors(tower_grads, packing):
   for dev_idx, gv_list in enumerate(tower_grads):
     gv_list = list(gv_list)
     new_gv_list = gv_list[num_packed:]
-    for i in xrange(0, num_packed):
+    for i in range(num_packed):
       k = '%d:%d' % (dev_idx, i)
       gpt = packing[k]
       gv = unpack_grad_tuple(gv_list[i], gpt)
-- 
GitLab


From 980c390941853649bb56c4940a46f474eb97ed80 Mon Sep 17 00:00:00 2001
From: Shashi Shekhar <shashishekhar@google.com>
Date: Wed, 6 Jun 2018 11:05:17 -0700
Subject: [PATCH 375/610] Misc fixes.

PiperOrigin-RevId: 199493360
---
 tensorflow/contrib/lite/tools/benchmark/BUILD               | 2 ++
 .../contrib/lite/tools/benchmark/command_line_flags_test.cc | 6 +++---
 tensorflow/contrib/lite/tools/verifier_test.cc              | 6 +++---
 3 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/tensorflow/contrib/lite/tools/benchmark/BUILD b/tensorflow/contrib/lite/tools/benchmark/BUILD
index 4824a4dbde..c5aa27d07c 100644
--- a/tensorflow/contrib/lite/tools/benchmark/BUILD
+++ b/tensorflow/contrib/lite/tools/benchmark/BUILD
@@ -5,6 +5,7 @@ package(default_visibility = [
 licenses(["notice"])  # Apache 2.0
 
 load("//tensorflow/contrib/lite:special_rules.bzl", "tflite_portable_test_suite")
+load("//tensorflow/contrib/lite:build_def.bzl", "tflite_linkopts")
 
 common_copts = ["-Wall"]
 
@@ -58,6 +59,7 @@ cc_library(
     ],
     hdrs = ["benchmark_tflite_model.h"],
     copts = common_copts,
+    linkopts = tflite_linkopts(),
     deps = [
         ":benchmark_model_lib",
         "//tensorflow/contrib/lite:framework",
diff --git a/tensorflow/contrib/lite/tools/benchmark/command_line_flags_test.cc b/tensorflow/contrib/lite/tools/benchmark/command_line_flags_test.cc
index 9a931d5ddd..620d61b027 100644
--- a/tensorflow/contrib/lite/tools/benchmark/command_line_flags_test.cc
+++ b/tensorflow/contrib/lite/tools/benchmark/command_line_flags_test.cc
@@ -134,9 +134,9 @@ TEST(CommandLineFlagsTest, UsageString) {
   std::string some_name = "something";
   // Don't test float in this case, because precision is hard to predict and
   // match against, and we don't want a flakey test.
-  const string tool_name = "some_tool_name";
-  string usage = Flags::Usage(tool_name + " <flags>",
-                              {Flag("some_int", &some_int, "some int"),
+  const std::string tool_name = "some_tool_name";
+  std::string usage = Flags::Usage(
+      tool_name + " <flags>", {Flag("some_int", &some_int, "some int"),
                                Flag("some_int64", &some_int64, "some int64"),
                                Flag("some_switch", &some_switch, "some switch"),
                                Flag("some_name", &some_name, "some name")});
diff --git a/tensorflow/contrib/lite/tools/verifier_test.cc b/tensorflow/contrib/lite/tools/verifier_test.cc
index ce8a7857d2..ad7d59ecb4 100644
--- a/tensorflow/contrib/lite/tools/verifier_test.cc
+++ b/tensorflow/contrib/lite/tools/verifier_test.cc
@@ -41,7 +41,7 @@ class TfLiteFlatbufferModelBuilder {
   }
 
   TfLiteFlatbufferModelBuilder(const std::vector<BuiltinOperator>& builtin_ops,
-                               const std::vector<string>& custom_ops) {
+                               const std::vector<std::string>& custom_ops) {
     buffers_.push_back(
         CreateBuffer(builder_, builder_.CreateVector(std::vector<uint8_t>{})));
 
@@ -194,8 +194,8 @@ TEST(VerifyModel, TensorBufferIsNotValid) {
                       /*operators=*/0, builder.CreateString("Main"))});
 
   auto buffers = builder.CreateVector(std::vector<Offset<Buffer>>{
-      CreateBuffer(builder,
-                   builder.CreateVector(std::vector<uint8>{1, 2, 3, 4, 5, 6})),
+      CreateBuffer(builder, builder.CreateVector(
+                                std::vector<uint8_t>{1, 2, 3, 4, 5, 6})),
   });
 
   auto model = CreateModel(builder, TFLITE_SCHEMA_VERSION, /*operator_codes=*/0,
-- 
GitLab


From 879fc3440495d9388754cb7d1878caf034d03d61 Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev <ezhulenev@google.com>
Date: Wed, 6 Jun 2018 11:26:43 -0700
Subject: [PATCH 376/610] Use memmove instead of memcpy for the large tensors
 on Linux.

Issue: #17246

~1.7x speedup for fetching a variable

Before:
  fetch_cpu_variable  : 5.5 GB/sec, min: 14.56, median: 15.05, mean: 15.14
  fetch_cpu_variable_add: 11.0 GB/sec, min: 7.29, median: 12.03, mean: 12.56
  fetch_cpu_variable_concat: 11.6 GB/sec, min: 6.92, median: 13.78, mean: 14.76

After:
  fetch_cpu_variable  : 9.2 GB/sec, min: 8.71, median: 8.79, mean: 8.80
  fetch_cpu_variable_add: 12.5 GB/sec, min: 6.41, median: 7.20, mean: 7.51
  fetch_cpu_variable_concat: 12.7 GB/sec, min: 6.32, median: 6.54
PiperOrigin-RevId: 199497691
---
 tensorflow/python/lib/core/ndarray_tensor.cc | 38 ++++++++++++++++++--
 1 file changed, 36 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/lib/core/ndarray_tensor.cc b/tensorflow/python/lib/core/ndarray_tensor.cc
index 9df38d464c..2acab92764 100644
--- a/tensorflow/python/lib/core/ndarray_tensor.cc
+++ b/tensorflow/python/lib/core/ndarray_tensor.cc
@@ -312,6 +312,40 @@ Status GetPyArrayDescrForTensor(const TF_Tensor* tensor,
 
   return Status::OK();
 }
+
+inline void FastMemcpy(void* dst, const void* src, size_t size) {
+  // clang-format off
+  switch (size) {
+    // Most compilers will generate inline code for fixed sizes,
+    // which is significantly faster for small copies.
+    case  1: memcpy(dst, src, 1); break;
+    case  2: memcpy(dst, src, 2); break;
+    case  3: memcpy(dst, src, 3); break;
+    case  4: memcpy(dst, src, 4); break;
+    case  5: memcpy(dst, src, 5); break;
+    case  6: memcpy(dst, src, 6); break;
+    case  7: memcpy(dst, src, 7); break;
+    case  8: memcpy(dst, src, 8); break;
+    case  9: memcpy(dst, src, 9); break;
+    case 10: memcpy(dst, src, 10); break;
+    case 11: memcpy(dst, src, 11); break;
+    case 12: memcpy(dst, src, 12); break;
+    case 13: memcpy(dst, src, 13); break;
+    case 14: memcpy(dst, src, 14); break;
+    case 15: memcpy(dst, src, 15); break;
+    case 16: memcpy(dst, src, 16); break;
+#if defined(PLATFORM_GOOGLE) || defined(PLATFORM_POSIX) && \
+    !defined(IS_MOBILE_PLATFORM)
+    // On Linux, memmove appears to be faster than memcpy for
+    // large sizes, strangely enough.
+    default: memmove(dst, src, size); break;
+#else
+    default: memcpy(dst, src, size); break;
+#endif
+  }
+  // clang-format on
+}
+
 }  // namespace
 
 // Converts the given TF_Tensor to a numpy ndarray.
@@ -362,8 +396,8 @@ Status TF_TensorToPyArray(Safe_TF_TensorPtr tensor, PyObject** out_ndarray) {
                             " bytes but TF_Tensor was ",
                             TF_TensorByteSize(tensor.get()), " bytes");
   } else {
-    memcpy(PyArray_DATA(py_array), TF_TensorData(tensor.get()),
-           PyArray_NBYTES(py_array));
+    FastMemcpy(PyArray_DATA(py_array), TF_TensorData(tensor.get()),
+               PyArray_NBYTES(py_array));
   }
 
   // PyArray_Return turns rank 0 arrays into numpy scalars
-- 
GitLab


From 6aeb1fdc53fb2a7df61e2544ce92243b6b43ad02 Mon Sep 17 00:00:00 2001
From: Benjamin Kramer <kramerb@google.com>
Date: Wed, 6 Jun 2018 11:27:25 -0700
Subject: [PATCH 377/610] [XLA:GPU] Allow intermediate outputs for reduce input
 fusions.

This generalizes the emitter to allow pretty much arbitrary multi-output fusion
as long as the shapes match the input of the reduce(s). The idea is that
multi-output fusion can move intermediate inputs into the same fusion so they
don't have to be re-read by the reduce.

PiperOrigin-RevId: 199497832
---
 .../xla/service/gpu/ir_emitter_unnested.cc    | 154 ++++++++++++------
 .../xla/service/gpu/ir_emitter_unnested.h     |  35 +++-
 .../xla/tests/multioutput_fusion_test.cc      | 100 ++++++++++++
 3 files changed, 233 insertions(+), 56 deletions(-)

diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
index b40b557cab..06fc3f8eea 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
@@ -501,20 +501,27 @@ Status IrEmitterUnnested::HandleFusion(HloInstruction* fusion) {
       case HloOpcode::kReduce: {
         VLOG(3) << "Emitting fused reduction to vector: " << fusion->ToString();
         std::vector<std::unique_ptr<Thunk>> thunks;
-        ArraySlice<HloInstruction*> reduces =
+        ArraySlice<HloInstruction*> output_instructions =
             root->opcode() == HloOpcode::kTuple
                 ? root->operands()
                 : ArraySlice<HloInstruction*>(&root, 1);
 
         // For multi-output fusion emit an initializer for each tuple element.
         // Otherwise it's sufficient to just initialize the single output.
-        for (int i = 0, e = reduces.size(); i != e; ++i) {
-          TF_ASSIGN_OR_RETURN(
-              std::unique_ptr<Thunk> initializer_thunk,
-              BuildInitializerThunk(
-                  fusion, reduces[i] == root ? ShapeIndex() : ShapeIndex({i})));
-          thunks.push_back(std::move(initializer_thunk));
+        HloInstruction* first_reduce = nullptr;
+        for (int i = 0, e = output_instructions.size(); i != e; ++i) {
+          if (output_instructions[i]->opcode() == HloOpcode::kReduce) {
+            TF_ASSIGN_OR_RETURN(
+                std::unique_ptr<Thunk> initializer_thunk,
+                BuildInitializerThunk(fusion, output_instructions[i] == root
+                                                  ? ShapeIndex()
+                                                  : ShapeIndex({i})));
+            thunks.push_back(std::move(initializer_thunk));
+            first_reduce =
+                first_reduce == nullptr ? output_instructions[i] : first_reduce;
+          }
         }
+        CHECK(first_reduce != nullptr);
         thunks.push_back(BuildKernelThunk(fusion));
         thunk_sequence_->emplace_back(
             MakeUnique<SequentialThunk>(std::move(thunks), fusion));
@@ -533,29 +540,45 @@ Status IrEmitterUnnested::HandleFusion(HloInstruction* fusion) {
         // fusion is a special case of that.
         InlinedVector<llvm_ir::ElementGenerator, 1> input_gens;
         InlinedVector<llvm_ir::ElementGenerator, 1> init_value_gens;
+        std::vector<std::pair<llvm_ir::ElementGenerator, ShapeIndex>>
+            extra_output_gens;
         InlinedVector<HloComputation*, 1> reducers;
-        for (const HloInstruction* reduce : reduces) {
-          CHECK_EQ(HloOpcode::kReduce, reduce->opcode());
+        InlinedVector<ShapeIndex, 1> reduce_output_shapes;
+        for (int i = 0, e = output_instructions.size(); i != e; ++i) {
+          const HloInstruction* inst = output_instructions[i];
+          ShapeIndex output_shape_index;
+          if (root->opcode() == HloOpcode::kTuple) {
+            output_shape_index = {i};
+          }
           // TODO(kramerb): CHECK that layouts are equal. Currently this
           // breaks multioutputfusion_test. The test has pre-fused
           // instructions, but layout_assignment will not assign any layouts
           // for instructions inside of a fused computation. It just removes
           // the layouts instead.
-          CHECK(ShapeUtil::Compatible(reduces[0]->shape(), reduce->shape()));
-          CHECK(ShapeUtil::Compatible(reduces[0]->operand(0)->shape(),
-                                      reduce->operand(0)->shape()));
-          CHECK(ShapeUtil::Compatible(reduces[0]->operand(1)->shape(),
-                                      reduce->operand(1)->shape()));
-          CHECK(reduces[0]->dimensions() == reduce->dimensions());
-          input_gens.push_back(fused_emitter.GetGenerator(reduce->operand(0)));
-          init_value_gens.push_back(
-              fused_emitter.GetGenerator(reduce->operand(1)));
-          reducers.push_back(reduce->to_apply());
+          if (inst->opcode() == HloOpcode::kReduce) {
+            CHECK(ShapeUtil::Compatible(first_reduce->shape(), inst->shape()));
+            CHECK(ShapeUtil::Compatible(first_reduce->operand(0)->shape(),
+                                        inst->operand(0)->shape()));
+            CHECK(ShapeUtil::Compatible(first_reduce->operand(1)->shape(),
+                                        inst->operand(1)->shape()));
+            CHECK(first_reduce->dimensions() == inst->dimensions());
+            input_gens.push_back(fused_emitter.GetGenerator(inst->operand(0)));
+            init_value_gens.push_back(
+                fused_emitter.GetGenerator(inst->operand(1)));
+            reducers.push_back(inst->to_apply());
+            reduce_output_shapes.push_back(std::move(output_shape_index));
+          } else {
+            CHECK(ShapeUtil::Compatible(first_reduce->operand(0)->shape(),
+                                        inst->shape()));
+            extra_output_gens.emplace_back(fused_emitter.GetGenerator(inst),
+                                           std::move(output_shape_index));
+          }
         }
-        const Shape& input_shape = reduces[0]->operand(0)->shape();
-        return EmitReductionToVector(reduces[0], input_shape, input_gens,
-                                     init_value_gens, reduces[0]->dimensions(),
-                                     reducers);
+        const Shape& input_shape = first_reduce->operand(0)->shape();
+        return EmitReductionToVector(first_reduce, input_shape, input_gens,
+                                     init_value_gens,
+                                     first_reduce->dimensions(), reducers,
+                                     reduce_output_shapes, extra_output_gens);
       }
       default:
         LOG(FATAL) << "Bad opcode for input fusion: "
@@ -940,11 +963,33 @@ Status IrEmitterUnnested::HandleCopy(HloInstruction* copy) {
   return IrEmitter::HandleCopy(copy);
 }
 
+Status IrEmitterUnnested::EmitExtraOutputsForReduce(
+    const HloInstruction* reduce, const llvm_ir::IrArray::Index& index,
+    tensorflow::gtl::ArraySlice<
+        std::pair<llvm_ir::ElementGenerator, ShapeIndex>>
+        extra_output_gens) {
+  for (int i = 0; i != extra_output_gens.size(); ++i) {
+    const HloInstruction* output = reduce->parent()->FusionInstruction();
+    llvm::Value* extra_output_address =
+        GetIrArray(*output, *output, extra_output_gens[i].second)
+            .EmitArrayElementAddress(index, &ir_builder_,
+                                     "extra_output_element_address");
+    TF_ASSIGN_OR_RETURN(llvm::Value* const extra_output_ir_value,
+                        extra_output_gens[i].first(index));
+    ir_builder_.CreateStore(extra_output_ir_value, extra_output_address);
+  }
+  return Status::OK();
+}
+
 Status IrEmitterUnnested::EmitReductionToScalar(
     HloInstruction* reduce, const Shape& input_shape,
     tensorflow::gtl::ArraySlice<llvm_ir::ElementGenerator> input_gens,
     tensorflow::gtl::ArraySlice<llvm_ir::ElementGenerator> init_value_gens,
-    tensorflow::gtl::ArraySlice<HloComputation*> reducers) {
+    tensorflow::gtl::ArraySlice<HloComputation*> reducers,
+    tensorflow::gtl::ArraySlice<ShapeIndex> reduce_output_shapes,
+    tensorflow::gtl::ArraySlice<
+        std::pair<llvm_ir::ElementGenerator, ShapeIndex>>
+        extra_output_gens) {
   // Number of elements processed by a single thread.
   constexpr int64 kTileSize = 16;
   int64 num_elems = ShapeUtil::ElementsIn(input_shape);
@@ -1050,7 +1095,7 @@ Status IrEmitterUnnested::EmitReductionToScalar(
             {partial_reduction_result_addresses[i], input_address},
             partial_reduction_result_addresses[i]));
       }
-      return Status::OK();
+      return EmitExtraOutputsForReduce(reduce, input_index, extra_output_gens);
     };
 
     // x_end = kTileSize + x_in_tiles * kTileSize, i.e., the location that's
@@ -1120,17 +1165,13 @@ Status IrEmitterUnnested::EmitReductionToScalar(
                                    &ir_builder_);
 
     for (int i = 0; i != num_reduces; ++i) {
-      ShapeIndex output_shape_index;
-      if (output->IsMultiOutputFusion()) {
-        output_shape_index = {i};
-      }
       llvm::Value* output_address =
-          GetIrArray(*output, *output, output_shape_index)
+          GetIrArray(*output, *output, reduce_output_shapes[i])
               .EmitArrayElementAddress(
                   llvm_ir::IrArray::Index(
                       /*linear=*/ir_builder_.getInt64(0),
                       ShapeUtil::GetSubshape(output->shape(),
-                                             output_shape_index),
+                                             reduce_output_shapes[i]),
                       &ir_builder_),
                   &ir_builder_, "output_element_address");
       TF_RETURN_IF_ERROR(EmitAtomicOperationForNestedComputation(
@@ -1158,7 +1199,11 @@ Status IrEmitterUnnested::EmitColumnReduction(
     int64 height, int64 width, HloInstruction* reduce, const Shape& input_shape,
     tensorflow::gtl::ArraySlice<llvm_ir::ElementGenerator> input_gens,
     tensorflow::gtl::ArraySlice<llvm_ir::ElementGenerator> init_value_gens,
-    tensorflow::gtl::ArraySlice<HloComputation*> reducers) {
+    tensorflow::gtl::ArraySlice<HloComputation*> reducers,
+    tensorflow::gtl::ArraySlice<ShapeIndex> reduce_output_shapes,
+    tensorflow::gtl::ArraySlice<
+        std::pair<llvm_ir::ElementGenerator, ShapeIndex>>
+        extra_output_gens) {
   // Divide the input matrix into tiles of size Kx1. For example, when the
   // input matrix is 4x4 and K=2, the tiled matrix looks like
   //
@@ -1284,7 +1329,8 @@ Status IrEmitterUnnested::EmitColumnReduction(
               {partial_reduction_result_addresses[i], input_address},
               partial_reduction_result_addresses[i]));
         }
-        return Status::OK();
+        return EmitExtraOutputsForReduce(reduce, input_index,
+                                         extra_output_gens);
       }
     };
 
@@ -1315,17 +1361,13 @@ Status IrEmitterUnnested::EmitColumnReduction(
     const HloInstruction* output =
         reduce->IsFused() ? reduce->parent()->FusionInstruction() : reduce;
     for (int i = 0; i != num_reduces; ++i) {
-      ShapeIndex output_shape_index;
-      if (output->IsMultiOutputFusion()) {
-        output_shape_index = {i};
-      }
       llvm::Value* output_address =
-          GetIrArray(*output, *output, output_shape_index)
+          GetIrArray(*output, *output, reduce_output_shapes[i])
               .EmitArrayElementAddress(
                   llvm_ir::IrArray::Index(
                       x,
                       ShapeUtil::GetSubshape(output->shape(),
-                                             output_shape_index),
+                                             reduce_output_shapes[i]),
                       &ir_builder_),
                   &ir_builder_, "output_element_address");
       TF_RETURN_IF_ERROR(EmitAtomicOperationForNestedComputation(
@@ -1354,7 +1396,11 @@ Status IrEmitterUnnested::EmitRowReduction(
     const Shape& input_shape,
     tensorflow::gtl::ArraySlice<llvm_ir::ElementGenerator> input_gens,
     tensorflow::gtl::ArraySlice<llvm_ir::ElementGenerator> init_value_gens,
-    tensorflow::gtl::ArraySlice<HloComputation*> reducers) {
+    tensorflow::gtl::ArraySlice<HloComputation*> reducers,
+    tensorflow::gtl::ArraySlice<ShapeIndex> reduce_output_shapes,
+    tensorflow::gtl::ArraySlice<
+        std::pair<llvm_ir::ElementGenerator, ShapeIndex>>
+        extra_output_gens) {
   // A naive algorithm is:
   // 1. Divide the input tensor into tiles of size 1x1xK.
   // 2. Partially reduces each tile to a scalar using one thread.
@@ -1549,7 +1595,8 @@ Status IrEmitterUnnested::EmitRowReduction(
               {partial_reduction_result_addresses[i], input_address},
               partial_reduction_result_addresses[i]));
         }
-        return Status::OK();
+        return EmitExtraOutputsForReduce(reduce, input_index,
+                                         extra_output_gens);
       }
     };
 
@@ -1610,17 +1657,13 @@ Status IrEmitterUnnested::EmitRowReduction(
     llvm_ir::SetToFirstInsertPoint(if_lane_id_is_zero_data.true_block,
                                    &ir_builder_);
     for (int i = 0; i != num_reduces; ++i) {
-      ShapeIndex output_shape_index;
-      if (output->IsMultiOutputFusion()) {
-        output_shape_index = {i};
-      }
       llvm::Value* output_address =
-          GetIrArray(*output, *output, output_shape_index)
+          GetIrArray(*output, *output, reduce_output_shapes[i])
               .EmitArrayElementAddress(
                   llvm_ir::IrArray::Index(
                       y,
                       ShapeUtil::GetSubshape(output->shape(),
-                                             output_shape_index),
+                                             reduce_output_shapes[i]),
                       &ir_builder_),
                   &ir_builder_, "output_element_address");
       TF_RETURN_IF_ERROR(EmitAtomicOperationForNestedComputation(
@@ -1656,7 +1699,11 @@ Status IrEmitterUnnested::EmitReductionToVector(
     tensorflow::gtl::ArraySlice<llvm_ir::ElementGenerator> input_gens,
     tensorflow::gtl::ArraySlice<llvm_ir::ElementGenerator> init_value_gens,
     tensorflow::gtl::ArraySlice<int64> dimensions_to_reduce,
-    tensorflow::gtl::ArraySlice<HloComputation*> reducers) {
+    tensorflow::gtl::ArraySlice<HloComputation*> reducers,
+    tensorflow::gtl::ArraySlice<ShapeIndex> reduce_output_shapes,
+    tensorflow::gtl::ArraySlice<
+        std::pair<llvm_ir::ElementGenerator, ShapeIndex>>
+        extra_output_gens) {
   // This emission requires "reduce" to have an input layout. It is either set
   // by LayoutAssignment (for a top-level kReduce) or by InstructionFusion (for
   // a fused kReduce).
@@ -1692,7 +1739,8 @@ Status IrEmitterUnnested::EmitReductionToVector(
   // dimension of the input is to keep.
   if (input_dims_to_keep.empty()) {
     return EmitReductionToScalar(reduce, input_shape, input_gens,
-                                 init_value_gens, reducers);
+                                 init_value_gens, reducers,
+                                 reduce_output_shapes, extra_output_gens);
   } else if (input_dims_to_keep.front() ==
              LayoutUtil::Minor(input_shape.layout(), 0)) {
     // Column reduction. Treat the result of "input" as a matrix whose width
@@ -1710,7 +1758,8 @@ Status IrEmitterUnnested::EmitReductionToVector(
       }
     }
     return EmitColumnReduction(height, width, reduce, input_shape, input_gens,
-                               init_value_gens, reducers);
+                               init_value_gens, reducers, reduce_output_shapes,
+                               extra_output_gens);
   } else {
     // Reduce the row dimension of a matrix or reduce dimension 0 and 2 in a
     // 3D tensor. The size of dimension 1 (the height) is the size of the
@@ -1736,7 +1785,8 @@ Status IrEmitterUnnested::EmitReductionToVector(
     }
     const int64 height = ShapeUtil::ElementsIn(reduce->shape());
     return EmitRowReduction(depth, height, width, reduce, input_shape,
-                            input_gens, init_value_gens, reducers);
+                            input_gens, init_value_gens, reducers,
+                            reduce_output_shapes, extra_output_gens);
   }
 }
 
@@ -1768,7 +1818,7 @@ Status IrEmitterUnnested::HandleReduce(HloInstruction* reduce) {
           return GetIrArray(*init_value, *reduce)
               .EmitReadArrayElement(index, &ir_builder_);
         }},
-        dimensions_to_reduce, {reducer});
+        dimensions_to_reduce, {reducer}, {{}}, {});
   }
 
   thunk_sequence_->emplace_back(BuildKernelThunk(reduce));
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h
index b41eaa303b..202231b82f 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h
@@ -100,6 +100,13 @@ class IrEmitterUnnested : public IrEmitter {
       const HloInstruction& inst,
       tensorflow::gtl::ArraySlice<const BufferAllocation*> args);
 
+  // Helper for writing extra outputs from inside a reduce kernel.
+  Status EmitExtraOutputsForReduce(
+      const HloInstruction* reduce, const llvm_ir::IrArray::Index& index,
+      tensorflow::gtl::ArraySlice<
+          std::pair<llvm_ir::ElementGenerator, ShapeIndex>>
+          extra_output_gens);
+
   // EmitColumnReduction and EmitRowReduction emit code for column and row
   // reduction of a matrix and/or 3D tensor. Row and column reduction have
   // different memory access pattern, so for performance their implementations
@@ -115,7 +122,11 @@ class IrEmitterUnnested : public IrEmitter {
       const Shape& input_shape,
       tensorflow::gtl::ArraySlice<llvm_ir::ElementGenerator> input_gens,
       tensorflow::gtl::ArraySlice<llvm_ir::ElementGenerator> init_value_gens,
-      tensorflow::gtl::ArraySlice<HloComputation*> reducers);
+      tensorflow::gtl::ArraySlice<HloComputation*> reducers,
+      tensorflow::gtl::ArraySlice<ShapeIndex> reduce_output_shapes,
+      tensorflow::gtl::ArraySlice<
+          std::pair<llvm_ir::ElementGenerator, ShapeIndex>>
+          extra_output_gens);
 
   // Emits code that reduces a 3D tensor of shape [depth x height x width] to a
   // vector of shape [height]. Other parameters have the same meaning as those
@@ -127,14 +138,22 @@ class IrEmitterUnnested : public IrEmitter {
       const Shape& input_shape,
       tensorflow::gtl::ArraySlice<llvm_ir::ElementGenerator> input_gens,
       tensorflow::gtl::ArraySlice<llvm_ir::ElementGenerator> init_value_gens,
-      tensorflow::gtl::ArraySlice<HloComputation*> reducers);
+      tensorflow::gtl::ArraySlice<HloComputation*> reducers,
+      tensorflow::gtl::ArraySlice<ShapeIndex> reduce_output_shapes,
+      tensorflow::gtl::ArraySlice<
+          std::pair<llvm_ir::ElementGenerator, ShapeIndex>>
+          extra_output_gens);
 
   // Emits code that reduces a tensor of arbitrary rank to a scalar.
   Status EmitReductionToScalar(
       HloInstruction* reduce, const Shape& input_shape,
       tensorflow::gtl::ArraySlice<llvm_ir::ElementGenerator> input_gens,
       tensorflow::gtl::ArraySlice<llvm_ir::ElementGenerator> init_value_gens,
-      tensorflow::gtl::ArraySlice<HloComputation*> reducers);
+      tensorflow::gtl::ArraySlice<HloComputation*> reducers,
+      tensorflow::gtl::ArraySlice<ShapeIndex> reduce_output_shapes,
+      tensorflow::gtl::ArraySlice<
+          std::pair<llvm_ir::ElementGenerator, ShapeIndex>>
+          extra_output_gens);
 
   // Figures out whether `reduce` is a row or column reduction, and which
   // dimensions to reduce, and calls either `EmitRowReduction` or
@@ -147,13 +166,21 @@ class IrEmitterUnnested : public IrEmitter {
   // Multiple reduces can be emitted in the same loop, assuming they have the
   // same input and output shapes, and the same reduce dimensions.
   //
+  // extra_output_gens can contain extra generators for intermediate outputs.
+  // These must have the same shape as the reduce input as they are computed
+  // when the reduce inputs are being read.
+  //
   // Prerequisite: `IsReductionToVector(*reduce)`
   Status EmitReductionToVector(
       HloInstruction* reduce, const Shape& input_shape,
       tensorflow::gtl::ArraySlice<llvm_ir::ElementGenerator> input_gens,
       tensorflow::gtl::ArraySlice<llvm_ir::ElementGenerator> init_value_gens,
       tensorflow::gtl::ArraySlice<int64> dimensions_to_reduce,
-      tensorflow::gtl::ArraySlice<HloComputation*> reducers);
+      tensorflow::gtl::ArraySlice<HloComputation*> reducers,
+      tensorflow::gtl::ArraySlice<ShapeIndex> reduce_output_shapes,
+      tensorflow::gtl::ArraySlice<
+          std::pair<llvm_ir::ElementGenerator, ShapeIndex>>
+          extra_output_gens);
 
   // Returns a KernelThunk that invokes the kernel emitted for `inst`. The
   // caller needs to make sure `inst` outlives the lifetime of the returned
diff --git a/tensorflow/compiler/xla/tests/multioutput_fusion_test.cc b/tensorflow/compiler/xla/tests/multioutput_fusion_test.cc
index 7bfc8eb546..f1d33a280d 100644
--- a/tensorflow/compiler/xla/tests/multioutput_fusion_test.cc
+++ b/tensorflow/compiler/xla/tests/multioutput_fusion_test.cc
@@ -380,5 +380,105 @@ XLA_TEST_F(MultiOutputFusionTest,
                                         Literal::CreateR1<float>({66, 138}))));
 }
 
+XLA_TEST_F(MultiOutputFusionTest,
+           DISABLED_ON_CPU(MultiOutputReduceFusionMinorWithExtraOutput)) {
+  const string testcase = tensorflow::strings::StrCat(kScalarOps, R"(
+    fused_reduce {
+      p0 = f32[2,2,2]{2,1,0} parameter(0)
+      c0 = f32[] constant(0)
+      r1 = f32[2,2]{1,0} reduce(p0, c0), dimensions={2}, to_apply=Add
+      mul = f32[2,2,2]{2,1,0} multiply(p0, p0)
+      c1 = f32[] constant(5)
+      r2 = f32[2,2]{1,0} reduce(mul, c1), dimensions={2}, to_apply=Max
+      ROOT tuple = (f32[2,2,2]{2,1,0}, f32[2,2]{1,0}, f32[2,2]{1,0})
+                     tuple(p0, r1, r2)
+    }
+
+    ENTRY reduce {
+      p = f32[2,2,2]{2,1,0} parameter(0)
+      ROOT fusion = (f32[2,2,2]{2,1,0}, f32[2,2]{1,0}, f32[2,2]{1,0}) fusion(p),
+                                                 kind=kInput, calls=fused_reduce
+    })");
+  auto module =
+      HloRunner::CreateModuleFromString(testcase, GetDebugOptionsForTest())
+          .ValueOrDie();
+  auto param = Literal::CreateR3<float>({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}});
+  TF_ASSERT_OK_AND_ASSIGN(auto result,
+                          Execute(std::move(module), {param.get()}));
+  EXPECT_TRUE(LiteralTestUtil::Equal(
+      *result,
+      *Literal::MakeTupleOwned(
+          Literal::CreateR3<float>({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}}),
+          Literal::CreateR2<float>({{3, 7}, {11, 15}}),
+          Literal::CreateR2<float>({{5, 16}, {36, 64}}))));
+}
+
+XLA_TEST_F(MultiOutputFusionTest,
+           DISABLED_ON_CPU(MultiOutputReduceFusionMajorWithExtraOutput)) {
+  const string testcase = tensorflow::strings::StrCat(kScalarOps, R"(
+    fused_reduce {
+      p0 = f32[2,2,2]{2,1,0} parameter(0)
+      c0 = f32[] constant(0)
+      r1 = f32[2,2]{1,0} reduce(p0, c0), dimensions={0}, to_apply=Add
+      mul = f32[2,2,2]{2,1,0} multiply(p0, p0)
+      c1 = f32[] constant(5)
+      r2 = f32[2,2]{1,0} reduce(mul, c1), dimensions={0}, to_apply=Max
+      ROOT tuple = (f32[2,2]{1,0}, f32[2,2,2]{2,1,0}, f32[2,2]{1,0})
+                     tuple(r1, mul, r2)
+    }
+
+    ENTRY reduce {
+      p = f32[2,2,2]{2,1,0} parameter(0)
+      ROOT fusion = (f32[2,2]{1,0}, f32[2,2,2]{2,1,0}, f32[2,2]{1,0}) fusion(p),
+                                                 kind=kInput, calls=fused_reduce
+    })");
+  auto module =
+      HloRunner::CreateModuleFromString(testcase, GetDebugOptionsForTest())
+          .ValueOrDie();
+  auto param = Literal::CreateR3<float>({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}});
+  TF_ASSERT_OK_AND_ASSIGN(auto result,
+                          Execute(std::move(module), {param.get()}));
+  EXPECT_TRUE(LiteralTestUtil::Equal(
+      *result,
+      *Literal::MakeTupleOwned(
+          Literal::CreateR2<float>({{6, 8}, {10, 12}}),
+          Literal::CreateR3<float>({{{1, 4}, {9, 16}}, {{25, 36}, {49, 64}}}),
+          Literal::CreateR2<float>({{25, 36}, {49, 64}}))));
+}
+
+XLA_TEST_F(MultiOutputFusionTest,
+           DISABLED_ON_CPU(MultiOutputReduceFusionScalarWithExtraOutput)) {
+  const string testcase = tensorflow::strings::StrCat(kScalarOps, R"(
+    fused_reduce {
+      p0 = f32[2,2,2]{2,1,0} parameter(0)
+      c0 = f32[] constant(0)
+      r1 = f32[2]{0} reduce(p0, c0), dimensions={0,2}, to_apply=Add
+      mul = f32[2,2,2]{2,1,0} multiply(p0, p0)
+      c1 = f32[] constant(5)
+      mul2 = f32[2,2,2]{2,1,0} multiply(p0, c1)
+      ROOT tuple = (f32[2]{0}, f32[2,2,2]{2,1,0}, f32[2,2,2]{2,1,0})
+                                                           tuple(r1, mul, mul2)
+    }
+
+    ENTRY reduce {
+      p = f32[2,2,2]{2,1,0} parameter(0)
+      ROOT fusion = (f32[2]{0}, f32[2,2,2]{2,1,0}, f32[2,2,2]{2,1,0}) fusion(p),
+                                                 kind=kInput, calls=fused_reduce
+    })");
+  auto module =
+      HloRunner::CreateModuleFromString(testcase, GetDebugOptionsForTest())
+          .ValueOrDie();
+  auto param = Literal::CreateR3<float>({{{1, 2}, {3, 4}}, {{5, 6}, {7, 8}}});
+  TF_ASSERT_OK_AND_ASSIGN(auto result,
+                          Execute(std::move(module), {param.get()}));
+  EXPECT_TRUE(LiteralTestUtil::Equal(
+      *result,
+      *Literal::MakeTupleOwned(
+          Literal::CreateR1<float>({14, 22}),
+          Literal::CreateR3<float>({{{1, 4}, {9, 16}}, {{25, 36}, {49, 64}}}),
+          Literal::CreateR3<float>(
+              {{{5, 10}, {15, 20}}, {{25, 30}, {35, 40}}}))));
+}
+
 }  // namespace
 }  // namespace xla
-- 
GitLab


From 88ac13ac825f5eecb7082d5878605251a66b3012 Mon Sep 17 00:00:00 2001
From: Sanjoy Das <sanjoy@google.com>
Date: Wed, 6 Jun 2018 11:33:55 -0700
Subject: [PATCH 378/610] Rename some functions in
 MatrixMatrixBlockPanelEmitter; NFC

The previous function names are misleading.

PiperOrigin-RevId: 199499028
---
 .../xla/service/cpu/dot_op_emitter.cc         | 50 +++++++++----------
 1 file changed, 23 insertions(+), 27 deletions(-)

diff --git a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
index d77076546f..c5c95a3c2c 100644
--- a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
@@ -729,21 +729,17 @@ class MatrixMatrixBlockPanelEmitter {
   void Emit();
 
  private:
-  // This emits a loop that loops over the `n` dimension in multiples of
-  // `max_vectorization_width` as much as possible and then emits a remainder
-  // epilogue.
-  void EmitLoopOverN();
-
-  // This emits a loop that loops over the `k` dimension in multiples of
-  // `tile_size_k` as much as possible and then emits a remainder epilogue.
-  void EmitLoopOverK(VectorSupportLibrary* vsl, llvm::Value* n_start,
-                     llvm::Value* n_end);
-
-  // This emits a loop that loops over the `m` dimension in multiples of
-  // `tile_size_m` as much as possible and then emits a remainder epilogue.
-  void EmitLoopOverM(VectorSupportLibrary* vsl, int64 tile_size_k,
-                     llvm::Value* k_start, llvm::Value* k_end,
-                     llvm::Value* n_start, llvm::Value* n_end);
+  // The HandleResiduesOnX helpers split the iteration space for dimension X
+  // into a multiple of the tile size on dimension X and an epilogue.  These
+  // helpers ultimately call into `EmitTiledReductionLoop` for emitting the
+  // tiled GEMM kernel.
+
+  void HandleResiduesOnN();
+  void HandleResiduesOnK(VectorSupportLibrary* vsl, llvm::Value* n_start,
+                         llvm::Value* n_end);
+  void HandleResiduesOnM(VectorSupportLibrary* vsl, int64 tile_size_k,
+                         llvm::Value* k_start, llvm::Value* k_end,
+                         llvm::Value* n_start, llvm::Value* n_end);
 
   // This emits the inner reduction loop.  This inner reduction loop multiplies
   // a tile from the LHS of size [tile_size_m,tile_size_k] and a tile from the
@@ -779,9 +775,9 @@ class MatrixMatrixBlockPanelEmitter {
   KernelSupportLibrary ksl_;
 };
 
-void MatrixMatrixBlockPanelEmitter::Emit() { EmitLoopOverN(); }
+void MatrixMatrixBlockPanelEmitter::Emit() { HandleResiduesOnN(); }
 
-void MatrixMatrixBlockPanelEmitter::EmitLoopOverN() {
+void MatrixMatrixBlockPanelEmitter::HandleResiduesOnN() {
   // We can only iterate the `n` dimension for an extent that is divisible by
   // the vectorization width.  So we emit an outer loop that first processes the
   // largest extent in `n` that is divisible by max_vectorization_width, then
@@ -796,7 +792,7 @@ void MatrixMatrixBlockPanelEmitter::EmitLoopOverN() {
     if (n_start != n_end) {
       VectorSupportLibrary vsl(scalar_type(), current_vectorization_width,
                                ir_builder_, "gebp");
-      EmitLoopOverK(&vsl, GetInt64(n_start), GetInt64(n_end));
+      HandleResiduesOnK(&vsl, GetInt64(n_start), GetInt64(n_end));
       n_start = n_end;
     }
     current_vectorization_width /= 2;
@@ -807,29 +803,29 @@ void MatrixMatrixBlockPanelEmitter::EmitLoopOverN() {
     ksl_.For("epi.n", n_start, dims().n(), 1, [&](llvm::Value* n_i) {
       llvm::Value* n_i_next =
           ir_builder_->CreateAdd(n_i, ir_builder_->getInt64(1));
-      EmitLoopOverK(&vsl, n_i, n_i_next);
+      HandleResiduesOnK(&vsl, n_i, n_i_next);
     });
   }
 }
 
-void MatrixMatrixBlockPanelEmitter::EmitLoopOverK(VectorSupportLibrary* vsl,
-                                                  llvm::Value* n_start,
-                                                  llvm::Value* n_end) {
+void MatrixMatrixBlockPanelEmitter::HandleResiduesOnK(VectorSupportLibrary* vsl,
+                                                      llvm::Value* n_start,
+                                                      llvm::Value* n_end) {
   int64 k_start = 0;
   int64 k_end = dims().k() - (dims().k() % tile_size_k());
   if (k_end != k_start) {
-    EmitLoopOverM(vsl, tile_size_k(), GetInt64(k_start), GetInt64(k_end),
-                  n_start, n_end);
+    HandleResiduesOnM(vsl, tile_size_k(), GetInt64(k_start), GetInt64(k_end),
+                      n_start, n_end);
     k_start = k_end;
   }
 
   if (k_start != dims().k()) {
-    EmitLoopOverM(vsl, dims().k() - k_start, GetInt64(k_start),
-                  GetInt64(dims().k()), n_start, n_end);
+    HandleResiduesOnM(vsl, dims().k() - k_start, GetInt64(k_start),
+                      GetInt64(dims().k()), n_start, n_end);
   }
 }
 
-void MatrixMatrixBlockPanelEmitter::EmitLoopOverM(
+void MatrixMatrixBlockPanelEmitter::HandleResiduesOnM(
     VectorSupportLibrary* vsl, int64 tile_size_k, llvm::Value* k_start,
     llvm::Value* k_end, llvm::Value* n_start, llvm::Value* n_end) {
   const int64 m_end = dims().m() - dims().m() % tile_size_m();
-- 
GitLab


From 01870cb183c524e3c0741bdb62c8ca84af93006e Mon Sep 17 00:00:00 2001
From: Amit Patankar <amitpatankar@google.com>
Date: Wed, 6 Jun 2018 11:41:23 -0700
Subject: [PATCH 379/610] Fixing the setuptools issue for pip builds.

---
 tensorflow/tools/ci_build/builds/pip.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/tools/ci_build/builds/pip.sh b/tensorflow/tools/ci_build/builds/pip.sh
index 5fa75e1d61..76210ba463 100755
--- a/tensorflow/tools/ci_build/builds/pip.sh
+++ b/tensorflow/tools/ci_build/builds/pip.sh
@@ -315,6 +315,7 @@ create_activate_virtualenv_and_install_tensorflow() {
   # Upgrade pip so it supports tags such as cp27mu, manylinux1 etc.
   echo "Upgrade pip in virtualenv"
   pip install --upgrade pip==9.0.1
+  pip install --upgrade setuptools==39.1.0
 
   # Force tensorflow reinstallation. Otherwise it may not get installed from
   # last build if it had the same version number as previous build.
-- 
GitLab


From bbe49e75336ea2206a146a4d03614aaeca013079 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 6 Jun 2018 11:42:25 -0700
Subject: [PATCH 380/610] Split out HloBatchNormInstruction as subclasses from
 HloInstruction.

PiperOrigin-RevId: 199500687
---
 tensorflow/compiler/xla/service/BUILD         |   6 +-
 .../compiler/xla/service/hlo_casting_utils.h  |   5 +-
 .../xla/service/hlo_casting_utils_test.cc     |   1 +
 .../compiler/xla/service/hlo_instruction.cc   | 146 +++++++++---------
 .../compiler/xla/service/hlo_instruction.h    |  51 +++---
 .../compiler/xla/service/hlo_instructions.cc  | 118 ++++++++++++++
 .../compiler/xla/service/hlo_instructions.h   | 107 +++++++++++++
 7 files changed, 330 insertions(+), 104 deletions(-)
 create mode 100644 tensorflow/compiler/xla/service/hlo_instructions.cc
 create mode 100644 tensorflow/compiler/xla/service/hlo_instructions.h

diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index 345f5ddeb2..20cc671ba3 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -269,6 +269,7 @@ cc_library(
         "dfs_hlo_visitor.cc",
         "hlo_computation.cc",
         "hlo_instruction.cc",
+        "hlo_instructions.cc",
         "hlo_module.cc",
         "hlo_opcode.cc",
         "hlo_sharding.cc",
@@ -280,11 +281,13 @@ cc_library(
         "hlo_computation.h",
         "hlo_domain_metadata.h",
         "hlo_instruction.h",
+        "hlo_instructions.h",
         "hlo_module.h",
         "hlo_opcode.h",
         "hlo_sharding.h",
     ],
     deps = [
+        ":hlo_casting_utils",
         ":hlo_module_config",
         ":hlo_proto",
         ":hlo_reachability",
@@ -3015,13 +3018,14 @@ cc_library(
 cc_library(
     name = "hlo_casting_utils",
     hdrs = ["hlo_casting_utils.h"],
-    deps = [":hlo"],
+    deps = ["//tensorflow/core:lib"],
 )
 
 tf_cc_test(
     name = "hlo_casting_utils_test",
     srcs = ["hlo_casting_utils_test.cc"],
     deps = [
+        ":hlo",
         ":hlo_casting_utils",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",  # fixdeps: keep
         "//tensorflow/core:test",
diff --git a/tensorflow/compiler/xla/service/hlo_casting_utils.h b/tensorflow/compiler/xla/service/hlo_casting_utils.h
index b15f1f24c6..7f73bba036 100644
--- a/tensorflow/compiler/xla/service/hlo_casting_utils.h
+++ b/tensorflow/compiler/xla/service/hlo_casting_utils.h
@@ -18,10 +18,13 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_CASTING_UTILS_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_CASTING_UTILS_H_
 
-#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include <type_traits>
+#include "tensorflow/core/platform/logging.h"
 
 namespace xla {
 
+class HloInstruction;
+
 template <class T>
 using EnableIfDerivedFromHlo =
     typename std::enable_if<std::is_base_of<HloInstruction, T>::value>::type;
diff --git a/tensorflow/compiler/xla/service/hlo_casting_utils_test.cc b/tensorflow/compiler/xla/service/hlo_casting_utils_test.cc
index 436a922234..a336427540 100644
--- a/tensorflow/compiler/xla/service/hlo_casting_utils_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_casting_utils_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
 
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace xla {
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc
index 06775d6a9a..8d7604fae1 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction.cc
@@ -27,7 +27,9 @@ limitations under the License.
 #include "tensorflow/compiler/xla/protobuf_util.h"
 #include "tensorflow/compiler/xla/ptr_util.h"
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor.h"
+#include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instructions.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/name_uniquer.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -60,17 +62,45 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
   TF_ASSIGN_OR_RETURN(HloOpcode opcode, StringToHloOpcode(proto.opcode()));
   TF_RET_CHECK(proto.has_shape());
 
-  auto instruction = WrapUnique(new HloInstruction(opcode, proto.shape()));
-  for (const int64 operand_id : proto.operand_ids()) {
-    TF_RET_CHECK(ContainsKey(instruction_map, operand_id))
-        << "No instruction with id " << operand_id;
-    instruction->AppendOperand(instruction_map.at(operand_id));
-  }
-  for (const int64 predecessor_id : proto.control_predecessor_ids()) {
-    TF_RET_CHECK(ContainsKey(instruction_map, predecessor_id))
-        << "No instruction with id " << predecessor_id;
-    TF_RETURN_IF_ERROR(instruction_map.at(predecessor_id)
-                           ->AddControlDependencyTo(instruction.get()));
+  std::unique_ptr<HloInstruction> instruction;
+  const auto operands = [&instruction_map, &proto](int index) {
+    return instruction_map.at(proto.operand_ids(index));
+  };
+  switch (opcode) {
+    // Ops migrated to subclasses.
+    case HloOpcode::kBatchNormTraining:
+      CHECK_EQ(proto.operand_ids_size(), 3);
+      instruction = CreateBatchNormTraining(
+          proto.shape(), operands(0), operands(1), operands(2), proto.epsilon(),
+          proto.feature_index());
+      break;
+    case HloOpcode::kBatchNormInference:
+      CHECK_EQ(proto.operand_ids_size(), 5);
+      instruction = CreateBatchNormInference(
+          proto.shape(), operands(0), operands(1), operands(2), operands(3),
+          operands(4), proto.epsilon(), proto.feature_index());
+      break;
+    case HloOpcode::kBatchNormGrad:
+      CHECK_EQ(proto.operand_ids_size(), 5);
+      instruction = CreateBatchNormGrad(proto.shape(), operands(0), operands(1),
+                                        operands(2), operands(3), operands(4),
+                                        proto.epsilon(), proto.feature_index());
+      break;
+    default: {
+      instruction = WrapUnique(new HloInstruction(opcode, proto.shape()));
+      for (const int64 operand_id : proto.operand_ids()) {
+        TF_RET_CHECK(ContainsKey(instruction_map, operand_id))
+            << "No instruction with id " << operand_id;
+        instruction->AppendOperand(instruction_map.at(operand_id));
+      }
+      for (const int64 predecessor_id : proto.control_predecessor_ids()) {
+        TF_RET_CHECK(ContainsKey(instruction_map, predecessor_id))
+            << "No instruction with id " << predecessor_id;
+        TF_RETURN_IF_ERROR(instruction_map.at(predecessor_id)
+                               ->AddControlDependencyTo(instruction.get()));
+      }
+      break;
+    }
   }
 
   // In the proto, fused computations are held exclusively within the
@@ -151,8 +181,6 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
   }
   instruction->outfeed_config_ = proto.outfeed_config();
   instruction->distribution_ = proto.distribution();
-  instruction->epsilon_ = proto.epsilon();
-  instruction->feature_index_ = proto.feature_index();
   instruction->channel_id_ = proto.channel_id();
   instruction->infeed_config_ = proto.infeed_config();
   instruction->custom_call_target_ = proto.custom_call_target();
@@ -646,14 +674,8 @@ HloInstruction::CreateBatchNormTraining(const Shape& shape,
                                         HloInstruction* scale,
                                         HloInstruction* offset, float epsilon,
                                         int64 feature_index) {
-  auto instruction =
-      WrapUnique(new HloInstruction(HloOpcode::kBatchNormTraining, shape));
-  instruction->AppendOperand(operand);
-  instruction->AppendOperand(scale);
-  instruction->AppendOperand(offset);
-  instruction->epsilon_ = epsilon;
-  instruction->feature_index_ = feature_index;
-  return instruction;
+  return WrapUnique<HloInstruction>(new HloBatchNormTrainingInstruction(
+      shape, operand, scale, offset, epsilon, feature_index));
 }
 
 /* static */ std::unique_ptr<HloInstruction>
@@ -661,16 +683,8 @@ HloInstruction::CreateBatchNormInference(
     const Shape& shape, HloInstruction* operand, HloInstruction* scale,
     HloInstruction* offset, HloInstruction* mean, HloInstruction* variance,
     float epsilon, int64 feature_index) {
-  auto instruction =
-      WrapUnique(new HloInstruction(HloOpcode::kBatchNormInference, shape));
-  instruction->AppendOperand(operand);
-  instruction->AppendOperand(scale);
-  instruction->AppendOperand(offset);
-  instruction->AppendOperand(mean);
-  instruction->AppendOperand(variance);
-  instruction->epsilon_ = epsilon;
-  instruction->feature_index_ = feature_index;
-  return instruction;
+  return WrapUnique<HloInstruction>(new HloBatchNormInferenceInstruction(
+      shape, operand, scale, offset, mean, variance, epsilon, feature_index));
 }
 
 /* static */ std::unique_ptr<HloInstruction>
@@ -679,16 +693,9 @@ HloInstruction::CreateBatchNormGrad(const Shape& shape, HloInstruction* operand,
                                     HloInstruction* variance,
                                     HloInstruction* grad_output, float epsilon,
                                     int64 feature_index) {
-  auto instruction =
-      WrapUnique(new HloInstruction(HloOpcode::kBatchNormGrad, shape));
-  instruction->AppendOperand(operand);
-  instruction->AppendOperand(scale);
-  instruction->AppendOperand(mean);
-  instruction->AppendOperand(variance);
-  instruction->AppendOperand(grad_output);
-  instruction->epsilon_ = epsilon;
-  instruction->feature_index_ = feature_index;
-  return instruction;
+  return WrapUnique<HloInstruction>(
+      new HloBatchNormGradInstruction(shape, operand, scale, mean, variance,
+                                      grad_output, epsilon, feature_index));
 }
 
 /* static */ std::unique_ptr<HloInstruction>
@@ -1275,6 +1282,13 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
   // in the face of code changes than copying fields explicitly. This also
   // properly sets the user fields of the operands.
   switch (opcode_) {
+    // Ops migrated to subclasses.
+    // TODO(b/80131774): Remove this switch when migration is complete.
+    case HloOpcode::kBatchNormTraining:
+    case HloOpcode::kBatchNormInference:
+    case HloOpcode::kBatchNormGrad:
+      clone = CloneWithNewOperandsImpl(shape, new_operands, context);
+      break;
     // Unary ops.
     case HloOpcode::kAbs:
     case HloOpcode::kRoundNearestAfz:
@@ -1476,18 +1490,6 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
     case HloOpcode::kParameter:
       clone = CreateParameter(parameter_number_, shape, name_);
       break;
-    case HloOpcode::kBatchNormTraining:
-      CHECK_EQ(new_operands.size(), 3);
-      clone =
-          CreateBatchNormTraining(shape, new_operands[0], new_operands[1],
-                                  new_operands[2], epsilon(), feature_index());
-      break;
-    case HloOpcode::kBatchNormInference:
-      CHECK_EQ(new_operands.size(), 5);
-      clone = CreateBatchNormInference(
-          shape, new_operands[0], new_operands[1], new_operands[2],
-          new_operands[3], new_operands[4], epsilon(), feature_index());
-      break;
     case HloOpcode::kInfeed:
       CHECK_EQ(new_operands.size(), 0);
       clone = CreateInfeed(shape, infeed_config());
@@ -1496,12 +1498,6 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
       CHECK_EQ(new_operands.size(), 1);
       clone = CreateOutfeed(outfeed_shape_, new_operands[0], outfeed_config());
       break;
-    case HloOpcode::kBatchNormGrad:
-      CHECK_EQ(new_operands.size(), 5);
-      clone = CreateBatchNormGrad(shape, new_operands[0], new_operands[1],
-                                  new_operands[2], new_operands[3],
-                                  new_operands[4], epsilon(), feature_index());
-      break;
     case HloOpcode::kConditional:
       CHECK_EQ(new_operands.size(), 3);
       clone = CreateConditional(shape, new_operands[0], new_operands[1],
@@ -1834,12 +1830,6 @@ bool HloInstruction::IdenticalSlowPath(
     case HloOpcode::kParameter:
       return parameter_number() == other.parameter_number();
 
-    case HloOpcode::kBatchNormTraining:
-    case HloOpcode::kBatchNormInference:
-    case HloOpcode::kBatchNormGrad:
-      return feature_index() == other.feature_index() &&
-             epsilon() == other.epsilon();
-
     // A constant is defined by the value in the literal.
     case HloOpcode::kConstant:
       return literal() == other.literal();
@@ -1886,7 +1876,6 @@ bool HloInstruction::IdenticalSlowPath(
              eq_computations(scatter(), other.scatter()) &&
              protobuf_util::ProtobufEquals(window(), other.window());
 
-
     // Remaining instructions with special values.
     case HloOpcode::kGetTupleElement:
       return tuple_index() == other.tuple_index();
@@ -1932,6 +1921,14 @@ bool HloInstruction::IdenticalSlowPath(
     case HloOpcode::kSendDone:
     case HloOpcode::kHostCompute:
       return false;
+
+    // Ops migrated to subclasses should never come to this line.
+    // TODO(b/80131774): Remove this switch when migration is complete.
+    case HloOpcode::kBatchNormTraining:
+    case HloOpcode::kBatchNormInference:
+    case HloOpcode::kBatchNormGrad:
+      LOG(FATAL) << "Base class impl called for opcode with subclass: "
+                 << opcode();
   }
 }
 
@@ -2326,12 +2323,6 @@ std::vector<string> HloInstruction::ExtraAttributesToString(
     extra.push_back(
         StrCat("dynamic_slice_sizes={", Join(dynamic_slice_sizes(), ","), "}"));
   }
-  if (opcode() == HloOpcode::kBatchNormTraining ||
-      opcode() == HloOpcode::kBatchNormInference ||
-      opcode() == HloOpcode::kBatchNormGrad) {
-    extra.push_back(StrCat("epsilon=", epsilon()));
-    extra.push_back(StrCat("feature_index=", feature_index()));
-  }
 
   if (convolution_dimension_numbers_ != nullptr) {
     extra.push_back(StrCat(
@@ -2552,8 +2543,6 @@ HloInstructionProto HloInstruction::ToProto() const {
   if (opcode() == HloOpcode::kRng) {
     proto.set_distribution(distribution_);
   }
-  proto.set_epsilon(epsilon_);
-  proto.set_feature_index(feature_index_);
   proto.set_channel_id(channel_id_);
   proto.set_infeed_config(infeed_config_);
   proto.set_custom_call_target(custom_call_target_);
@@ -3619,4 +3608,13 @@ void HloInstruction::RelayoutConstant(const Layout& new_layout,
   }
 }
 
+// TODO(b/80131774): Remove these temporary methods after transition.
+int64 HloInstruction::feature_index() const {
+  return Cast<HloBatchNormInstruction>(this)->feature_index();
+}
+
+float HloInstruction::epsilon() const {
+  return Cast<HloBatchNormInstruction>(this)->epsilon();
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h
index ef55c6668f..b16837eaec 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.h
+++ b/tensorflow/compiler/xla/service/hlo_instruction.h
@@ -992,14 +992,14 @@ class HloInstruction {
   string OperandsToString(const HloPrintOptions& options) const;
 
   // Returns string representation of op-specific attributes.
-  std::vector<string> ExtraAttributesToString(
+  virtual std::vector<string> ExtraAttributesToString(
       const HloPrintOptions& options) const;
 
   // As ToString, but returns a shorter string.
   string ToShortString() const;
 
   // Returns a serialized representation of this instruction.
-  HloInstructionProto ToProto() const;
+  virtual HloInstructionProto ToProto() const;
 
   // Returns a category for the HLO. This could be something like "convolution"
   // or "elementwise".
@@ -1024,19 +1024,13 @@ class HloInstruction {
   // Precondition: opcode() == HloOpcode::kHostCompute
   string channel_name() const { return channel_name_; }
 
-  // Returns feature_index field associated with the instruction. The index
-  // represents the index of the feature dimension.
-  //
-  // Precondition: opcode() is one of kBatchNormTraining, kBatchNormInference,
-  // or kBatchNormGrad.
-  int64 feature_index() const { return feature_index_; }
+  // Delegates to HloBatchNormInstruction::feature_index.
+  // TODO(b/80131774): Remove this code.
+  int64 feature_index() const;
 
-  // Returns a epsilon value associated with the instruction. The is a small
-  // number added to the variance to avoid divide-by-zero error.
-  //
-  // Precondition: opcode() is one of kBatchNormTraining, kBatchNormInference,
-  // or kBatchNormGrad.
-  float epsilon() const { return epsilon_; }
+  // Delegates to HloBatchNormInstruction::epsilon.
+  // TODO(b/80131774): Remove this code.
+  float epsilon() const;
 
   // Returns the infeed configuration string. The infeed configuration includes
   // any metadata needed for the backend compiler (e.g., infeed buffer address)
@@ -1371,7 +1365,8 @@ class HloInstruction {
 
   // Clones the HLO instruction as above but with new shape and operands.
   std::unique_ptr<HloInstruction> CloneWithNewOperands(
-      const Shape& shape, tensorflow::gtl::ArraySlice<HloInstruction*> operands,
+      const Shape& shape,
+      tensorflow::gtl::ArraySlice<HloInstruction*> new_operands,
       HloCloneContext* context = nullptr) const;
 
   // Returns the computations this instruction directly calls (if any).
@@ -1536,7 +1531,19 @@ class HloInstruction {
   // by factory methods.
   HloInstruction(HloOpcode opcode, const Shape& shape);
 
+  // Appends operand to the list of operands and adds this instruction as a user
+  // of the operand.
+  void AppendOperand(HloInstruction* operand);
+
  private:
+  // Implementation for non-common logic of CloneWithNewOperands.
+  virtual std::unique_ptr<HloInstruction> CloneWithNewOperandsImpl(
+      const Shape& shape,
+      tensorflow::gtl::ArraySlice<HloInstruction*> new_operands,
+      HloCloneContext* context) const {
+    // TODO(b/80131774): This should be pure virtual.
+    LOG(FATAL) << "Unimplemented method.";
+  }
   // Prints an instruction to a string.
   //
   // The canonical string representation needs to name operands and instruction
@@ -1561,7 +1568,7 @@ class HloInstruction {
   class FusionReusesParamElements;
 
   // See comments on Identical().
-  bool IdenticalSlowPath(
+  virtual bool IdenticalSlowPath(
       const HloInstruction& other,
       const std::function<bool(const HloComputation*, const HloComputation*)>&
           eq_computations) const;
@@ -1571,10 +1578,6 @@ class HloInstruction {
       const Shape& shape, HloOpcode opcode,
       tensorflow::gtl::ArraySlice<HloInstruction*> operands);
 
-  // Appends operand to the list of operands and adds this instruction as a user
-  // of the operand.
-  void AppendOperand(HloInstruction* operand);
-
   // Adds a user for this instruction.
   void AddUser(HloInstruction* user);
 
@@ -1752,14 +1755,6 @@ class HloInstruction {
   // Only present for kRng.
   RandomDistribution distribution_;
 
-  // A small float number added to the variance to avoid divide-by-zero error.
-  // Only present for kBatchNormTraining.
-  float epsilon_ = 0.0f;
-
-  // An integer value representing the index of the feature dimension.
-  // Only present for kBatchNormTraining.
-  int64 feature_index_ = -1;
-
   // Represents a unique identifier for each Send/Recv instruction pair.
   // Only present for kSend or kRecv.
   int64 channel_id_ = -1;
diff --git a/tensorflow/compiler/xla/service/hlo_instructions.cc b/tensorflow/compiler/xla/service/hlo_instructions.cc
new file mode 100644
index 0000000000..adbebb135b
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_instructions.cc
@@ -0,0 +1,118 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/hlo_instructions.h"
+
+namespace xla {
+
+using ::tensorflow::strings::StrCat;
+
+HloBatchNormInstruction::HloBatchNormInstruction(
+    HloOpcode opcode, const Shape& shape, HloInstruction* operand,
+    HloInstruction* scale, float epsilon, int64 feature_index)
+    : HloInstruction(opcode, shape),
+      epsilon_(epsilon),
+      feature_index_(feature_index) {
+  AppendOperand(operand);
+  AppendOperand(scale);
+}
+
+bool HloBatchNormInstruction::IdenticalSlowPath(
+    const HloInstruction& other,
+    const std::function<bool(const HloComputation*, const HloComputation*)>&
+        eq_computations) const {
+  const auto& casted_other = static_cast<const HloBatchNormInstruction&>(other);
+  return feature_index() == casted_other.feature_index() &&
+         epsilon() == casted_other.epsilon();
+}
+
+std::vector<string> HloBatchNormInstruction::ExtraAttributesToString(
+    const HloPrintOptions& options) const {
+  std::vector<string> extra = {StrCat("epsilon=", epsilon()),
+                               StrCat("feature_index=", feature_index())};
+  return extra;
+}
+
+HloInstructionProto HloBatchNormInstruction::ToProto() const {
+  HloInstructionProto proto = HloInstruction::ToProto();
+  proto.set_epsilon(epsilon_);
+  proto.set_feature_index(feature_index_);
+  return proto;
+}
+
+HloBatchNormTrainingInstruction::HloBatchNormTrainingInstruction(
+    const Shape& shape, HloInstruction* operand, HloInstruction* scale,
+    HloInstruction* offset, float epsilon, int64 feature_index)
+    : HloBatchNormInstruction(HloOpcode::kBatchNormTraining, shape, operand,
+                              scale, epsilon, feature_index) {
+  AppendOperand(offset);
+}
+
+std::unique_ptr<HloInstruction>
+HloBatchNormTrainingInstruction::CloneWithNewOperandsImpl(
+    const Shape& shape,
+    tensorflow::gtl::ArraySlice<HloInstruction*> new_operands,
+    HloCloneContext* context) const {
+  CHECK_EQ(new_operands.size(), 3);
+  return MakeUnique<HloBatchNormTrainingInstruction>(
+      shape, new_operands[0], new_operands[1], new_operands[2], epsilon(),
+      feature_index());
+}
+
+HloBatchNormInferenceInstruction::HloBatchNormInferenceInstruction(
+    const Shape& shape, HloInstruction* operand, HloInstruction* scale,
+    HloInstruction* offset, HloInstruction* mean, HloInstruction* variance,
+    float epsilon, int64 feature_index)
+    : HloBatchNormInstruction(HloOpcode::kBatchNormInference, shape, operand,
+                              scale, epsilon, feature_index) {
+  AppendOperand(offset);
+  AppendOperand(mean);
+  AppendOperand(variance);
+}
+
+std::unique_ptr<HloInstruction>
+HloBatchNormInferenceInstruction::CloneWithNewOperandsImpl(
+    const Shape& shape,
+    tensorflow::gtl::ArraySlice<HloInstruction*> new_operands,
+    HloCloneContext* context) const {
+  CHECK_EQ(new_operands.size(), 5);
+  return MakeUnique<HloBatchNormInferenceInstruction>(
+      shape, new_operands[0], new_operands[1], new_operands[2], new_operands[3],
+      new_operands[4], epsilon(), feature_index());
+}
+
+HloBatchNormGradInstruction::HloBatchNormGradInstruction(
+    const Shape& shape, HloInstruction* operand, HloInstruction* scale,
+    HloInstruction* mean, HloInstruction* variance, HloInstruction* grad_output,
+    float epsilon, int64 feature_index)
+    : HloBatchNormInstruction(HloOpcode::kBatchNormGrad, shape, operand, scale,
+                              epsilon, feature_index) {
+  AppendOperand(mean);
+  AppendOperand(variance);
+  AppendOperand(grad_output);
+}
+
+std::unique_ptr<HloInstruction>
+HloBatchNormGradInstruction::CloneWithNewOperandsImpl(
+    const Shape& shape,
+    tensorflow::gtl::ArraySlice<HloInstruction*> new_operands,
+    HloCloneContext* context) const {
+  CHECK_EQ(new_operands.size(), 5);
+  return MakeUnique<HloBatchNormGradInstruction>(
+      shape, new_operands[0], new_operands[1], new_operands[2], new_operands[3],
+      new_operands[4], epsilon(), feature_index());
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_instructions.h b/tensorflow/compiler/xla/service/hlo_instructions.h
new file mode 100644
index 0000000000..6fcd96a8c6
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_instructions.h
@@ -0,0 +1,107 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// All HloInstruction subclasses are put in this file.
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_INSTRUCTIONS_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_INSTRUCTIONS_H_
+
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+
+namespace xla {
+
+class HloBatchNormInstruction : public HloInstruction {
+ public:
+  // Returns feature_index field associated with the instruction. The index
+  // represents the index of the feature dimension.
+  int64 feature_index() const { return feature_index_; }
+
+  // Returns the epsilon value associated with the instruction: a small number
+  // added to the variance to avoid a divide-by-zero error.
+  float epsilon() const { return epsilon_; }
+
+  // Returns string representation of op-specific attributes.
+  std::vector<string> ExtraAttributesToString(
+      const HloPrintOptions& options) const override;
+
+  // Returns a serialized representation of this instruction.
+  HloInstructionProto ToProto() const override;
+
+ protected:
+  HloBatchNormInstruction(HloOpcode opcode, const Shape& shape,
+                          HloInstruction* operand, HloInstruction* scale,
+                          float epsilon, int64 feature_index);
+
+ private:
+  bool IdenticalSlowPath(
+      const HloInstruction& other,
+      const std::function<bool(const HloComputation*, const HloComputation*)>&
+          eq_computations) const override;
+  // A small float number added to the variance to avoid divide-by-zero error.
+  float epsilon_ = 0.0f;
+
+  // An integer value representing the index of the feature dimension.
+  int64 feature_index_ = -1;
+};
+
+class HloBatchNormTrainingInstruction : public HloBatchNormInstruction {
+ public:
+  HloBatchNormTrainingInstruction(const Shape& shape, HloInstruction* operand,
+                                  HloInstruction* scale, HloInstruction* offset,
+                                  float epsilon, int64 feature_index);
+
+ private:
+  // Implementation for non-common logic of CloneWithNewOperands.
+  std::unique_ptr<HloInstruction> CloneWithNewOperandsImpl(
+      const Shape& shape,
+      tensorflow::gtl::ArraySlice<HloInstruction*> new_operands,
+      HloCloneContext* context) const override;
+};
+
+class HloBatchNormInferenceInstruction : public HloBatchNormInstruction {
+ public:
+  HloBatchNormInferenceInstruction(const Shape& shape, HloInstruction* operand,
+                                   HloInstruction* scale,
+                                   HloInstruction* offset, HloInstruction* mean,
+                                   HloInstruction* variance, float epsilon,
+                                   int64 feature_index);
+
+ private:
+  // Implementation for non-common logic of CloneWithNewOperands.
+  std::unique_ptr<HloInstruction> CloneWithNewOperandsImpl(
+      const Shape& shape,
+      tensorflow::gtl::ArraySlice<HloInstruction*> new_operands,
+      HloCloneContext* context) const override;
+};
+
+class HloBatchNormGradInstruction : public HloBatchNormInstruction {
+ public:
+  HloBatchNormGradInstruction(const Shape& shape, HloInstruction* operand,
+                              HloInstruction* scale, HloInstruction* mean,
+                              HloInstruction* variance,
+                              HloInstruction* grad_output, float epsilon,
+                              int64 feature_index);
+
+ private:
+  // Implementation for non-common logic of CloneWithNewOperands.
+  std::unique_ptr<HloInstruction> CloneWithNewOperandsImpl(
+      const Shape& shape,
+      tensorflow::gtl::ArraySlice<HloInstruction*> new_operands,
+      HloCloneContext* context) const override;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_INSTRUCTIONS_H_
-- 
GitLab


From 57c68dd580ee605cec0ce9d804ce257120485d50 Mon Sep 17 00:00:00 2001
From: Akshay Modi <nareshmodi@google.com>
Date: Wed, 6 Jun 2018 11:56:32 -0700
Subject: [PATCH 381/610] Limit number of entries in the cache.

Memory usage can grow large when new namedtuple types are repeatedly created
inside a loop, because every new type is added to the cache and kept alive.

PiperOrigin-RevId: 199503489
---
 tensorflow/python/util/util.cc | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/tensorflow/python/util/util.cc b/tensorflow/python/util/util.cc
index 0dd406aa4e..c79d8a8445 100644
--- a/tensorflow/python/util/util.cc
+++ b/tensorflow/python/util/util.cc
@@ -33,6 +33,8 @@ namespace {
 PyObject* CollectionsSequenceType = nullptr;
 PyTypeObject* SparseTensorValueType = nullptr;
 
+const int kMaxItemsInCache = 1024;
+
 bool WarnedThatSetIsNotSequence = false;
 
 bool IsString(PyObject* o) {
@@ -196,11 +198,14 @@ int IsSequenceHelper(PyObject* o) {
   // NOTE: This is never decref'd, but we don't want the type to get deleted
   // as long as it is in the map. This should not be too much of a
   // leak, as there should only be a relatively small number of types in the
-  // map, and an even smaller number that are eligible for decref.
-  Py_INCREF(type);
+  // map, and an even smaller number that are eligible for decref. As a
+  // precaution, we limit the size of the map to 1024.
   {
     mutex_lock l(g_type_to_sequence_map);
-    type_to_sequence_map->insert({type, is_sequence});
+    if (type_to_sequence_map->size() < kMaxItemsInCache) {
+      Py_INCREF(type);
+      type_to_sequence_map->insert({type, is_sequence});
+    }
   }
 
   return is_sequence;
-- 
GitLab


From 20d3228e4efbf55441bf179e668ed52e900dd347 Mon Sep 17 00:00:00 2001
From: Frank Chen <frankchn@google.com>
Date: Wed, 6 Jun 2018 11:56:49 -0700
Subject: [PATCH 382/610] Fix URLs in security/index.md and point SECURITY.md's
 vuln list to security/index.md

PiperOrigin-RevId: 199503532
---
 SECURITY.md                  | 11 +++--------
 tensorflow/security/index.md |  4 ++--
 2 files changed, 5 insertions(+), 10 deletions(-)

diff --git a/SECURITY.md b/SECURITY.md
index 0a4be37cbc..e2f6ff353a 100644
--- a/SECURITY.md
+++ b/SECURITY.md
@@ -242,12 +242,7 @@ v//Fw6ZeY+HmRDFdirjD7wXtIuER4vqCryIqR6Xe9X8oJXz9L/Jhslc=
 -----END PGP PUBLIC KEY BLOCK-----
 ```
 
-### Known vulnerabilities
-
-| Type               | Versions affected | Reported by           | Additional Information      |
-|--------------------|:-----------------:|-----------------------|-----------------------------|
-| TensorFlow Lite TOCO FlatBuffer Parsing Vulnerability | <= 1.7 | Blade Team of Tencent | [security advisory](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/docs_src/security/advisory/tfsa-2018-003.md) |
-| GIF File Parsing Null Pointer Dereference Error | <= 1.5 | Blade Team of Tencent | [security advisory](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/docs_src/security/advisory/tfsa-2018-002.md) |
-| BMP File Parser Out-of-bounds Read | <= 1.6 | Blade Team of Tencent | [security advisory](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/docs_src/security/advisory/tfsa-2018-001.md) |
-| Out Of Bounds Read |             <=1.4 | Blade Team of Tencent | [issue report](https://github.com/tensorflow/tensorflow/issues/14959) |
+### Known Vulnerabilities
 
+For a list of known vulnerabilities and security advisories for TensorFlow,
+[click here](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/security/index.md).
diff --git a/tensorflow/security/index.md b/tensorflow/security/index.md
index 44f51ad07b..ea39e17ab2 100644
--- a/tensorflow/security/index.md
+++ b/tensorflow/security/index.md
@@ -4,7 +4,7 @@ We regularly publish security advisories about using TensorFlow.
 
 *Note*: In conjunction with these security advisories, we strongly encourage
 TensorFlow users to read and understand TensorFlow's security model as outlined
-in [https://github.com/tensorflow/tensorflow/blob/master/SECURITY.md](SECURITY.md).
+in [SECURITY.md](https://github.com/tensorflow/tensorflow/blob/master/SECURITY.md).
 
 | Advisory Number | Type               | Versions affected | Reported by           | Additional Information      |
 |-----------------|--------------------|:-----------------:|-----------------------|-----------------------------|
@@ -14,5 +14,5 @@ in [https://github.com/tensorflow/tensorflow/blob/master/SECURITY.md](SECURITY.m
 | [TFSA-2018-003](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/security/advisory/tfsa-2018-003.md)   | TensorFlow Lite TOCO FlatBuffer Parsing Vulnerability | <= 1.7 | Blade Team of Tencent |  |
 | [TFSA-2018-002](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/security/advisory/tfsa-2018-002.md)   | GIF File Parsing Null Pointer Dereference Error | <= 1.5 | Blade Team of Tencent |  |
 | [TFSA-2018-001](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/security/advisory/tfsa-2018-001.md)   | BMP File Parser Out-of-bounds Read | <= 1.6 | Blade Team of Tencent |  |
-| -               | Out Of Bounds Read |             <=1.4 | Blade Team of Tencent | [issue report](https://github.com/tensorflow/tensorflow/issues/14959) |
+| -               | Out Of Bounds Read |             <= 1.4 | Blade Team of Tencent | [issue report](https://github.com/tensorflow/tensorflow/issues/14959) |
 
-- 
GitLab


From 51f0ff15e20ac5c966aa0e413771a242ba739185 Mon Sep 17 00:00:00 2001
From: Younghee Kwon <youngheek@google.com>
Date: Wed, 6 Jun 2018 12:08:46 -0700
Subject: [PATCH 383/610] boosted_trees: follow up on previous double precision
 commit. Using temporary tensor instead of a vector. PiperOrigin-RevId:
 199506102

---
 .../core/kernels/boosted_trees/stats_ops.cc   | 54 ++++++++-----------
 1 file changed, 21 insertions(+), 33 deletions(-)

diff --git a/tensorflow/core/kernels/boosted_trees/stats_ops.cc b/tensorflow/core/kernels/boosted_trees/stats_ops.cc
index 53bdd482cb..48afd3fbf3 100644
--- a/tensorflow/core/kernels/boosted_trees/stats_ops.cc
+++ b/tensorflow/core/kernels/boosted_trees/stats_ops.cc
@@ -255,7 +255,7 @@ class BoostedTreesMakeStatsSummaryOp : public OpKernel {
     // node_ids
     const Tensor* node_ids_t;
     OP_REQUIRES_OK(context, context->input("node_ids", &node_ids_t));
-    const auto node_ids = node_ids_t->flat<int32>();
+    const auto node_ids = node_ids_t->vec<int32>();
     // gradients
     const Tensor* gradients_t;
     OP_REQUIRES_OK(context, context->input("gradients", &gradients_t));
@@ -270,46 +270,34 @@ class BoostedTreesMakeStatsSummaryOp : public OpKernel {
                                                 &bucketized_features_list));
     // Infer batch size.
     const int64 batch_size = node_ids_t->dim_size(0);
-    // Allocate output stats tensor (Rank 4).
-    Tensor* output_stats_summary_t = nullptr;
-    OP_REQUIRES_OK(context, context->allocate_output(
-                                "stats_summary",
-                                {num_features_, max_splits_, num_buckets_, 2},
-                                &output_stats_summary_t));
-    auto output_stats_summary = output_stats_summary_t->flat<float>();
-    EIGEN_STATIC_ASSERT(
-        (static_cast<int>(decltype(output_stats_summary)::Layout) ==
-         static_cast<int>(Eigen::RowMajor)),
-        THIS_METHOD_IS_ONLY_FOR_ROW_MAJOR_MATRICES);
 
-    const int shift_per_node = num_buckets_ * 2;
-    const int shift_per_feature = shift_per_node * max_splits_;
-    const int32 max_index = num_features_ * shift_per_feature;
-    // We use double to sum the gradients and hessians, due to possible
-    // precision loss when summing small float values.
-    std::vector<double> res(max_index, 0);
+    // Allocate temporary stats tensor (Rank 4).
+    Tensor temp_stats_double_t;
+    OP_REQUIRES_OK(context, context->allocate_temp(
+                                DT_DOUBLE,
+                                {num_features_, max_splits_, num_buckets_, 2},
+                                &temp_stats_double_t));
+    auto temp_stats_double = temp_stats_double_t.tensor<double, 4>();
+    temp_stats_double.setZero();
 
     // Partition by node, and then bucketize.
-    int feature_idx = 0;
-    int feature_shift = 0;
-    for (const Tensor& tensor : bucketized_features_list) {
-      const auto& features = tensor.flat<int32>();
+    for (int feature_idx = 0; feature_idx < num_features_; ++feature_idx) {
+      const auto& features = bucketized_features_list[feature_idx].vec<int32>();
       for (int i = 0; i < batch_size; ++i) {
         const int32 node = node_ids(i);
         const int32 bucket = features(i);
-        // Calculate the index in the flattened vector for
-        // [feature_idx][node][bucket][0].
-        const int index = feature_shift + node * shift_per_node + bucket * 2;
-        res[index] += gradients(i, 0);
-        res[index + 1] += hessians(i, 0);
+        temp_stats_double(feature_idx, node, bucket, 0) += gradients(i, 0);
+        temp_stats_double(feature_idx, node, bucket, 1) += hessians(i, 0);
       }
-      ++feature_idx;
-      feature_shift += shift_per_feature;
-    }
-    // Copy over the results.
-    for (int i = 0; i < max_index; ++i) {
-      output_stats_summary(i) = res[i];
     }
+
+    // Copy temp tensor over to output tensor.
+    Tensor* output_stats_summary_t = nullptr;
+    OP_REQUIRES_OK(context, context->allocate_output(
+                                "stats_summary", temp_stats_double_t.shape(),
+                                &output_stats_summary_t));
+    output_stats_summary_t->tensor<float, 4>() =
+        temp_stats_double.template cast<float>();
   }
 
  private:
-- 
GitLab


From ae2a2ae21b5398616c591d3b01778c6651cecb56 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 6 Jun 2018 12:31:05 -0700
Subject: [PATCH 384/610] enhance Tensorflow GBDT and GBRT model by exposing a
 new two dimensional output in prediction ops (example id, tree leaf node
 index id) for input as other model features

PiperOrigin-RevId: 199510127
---
 .../estimator_batch/estimator.py              |  40 +++++-
 .../estimator_batch/estimator_test.py         |  22 ++++
 .../boosted_trees/estimator_batch/model.py    |   8 +-
 .../boosted_trees/kernels/prediction_ops.cc   |  54 ++++++--
 .../lib/models/multiple_additive_trees.cc     |  14 ++-
 .../lib/models/multiple_additive_trees.h      |   7 +-
 .../models/multiple_additive_trees_test.cc    |  48 +++++--
 .../boosted_trees/ops/prediction_ops.cc       |  70 +++++++++++
 .../python/ops/prediction_ops.py              |   1 +
 .../python/training/functions/gbdt_batch.py   |  87 +++++++++----
 .../training/functions/gbdt_batch_test.py     | 117 +++++++++++++++++-
 11 files changed, 410 insertions(+), 58 deletions(-)

diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/estimator.py b/tensorflow/contrib/boosted_trees/estimator_batch/estimator.py
index 89d0d611d2..9c36c30221 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/estimator.py
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/estimator.py
@@ -41,7 +41,8 @@ class GradientBoostedDecisionTreeClassifier(estimator.Estimator):
                feature_engineering_fn=None,
                logits_modifier_function=None,
                center_bias=True,
-               use_core_libs=False):
+               use_core_libs=False,
+               output_leaf_index=False):
     """Initializes a GradientBoostedDecisionTreeClassifier estimator instance.
 
     Args:
@@ -66,6 +67,16 @@ class GradientBoostedDecisionTreeClassifier(estimator.Estimator):
         the bias.
       use_core_libs: Whether feature columns and loss are from the core (as
         opposed to contrib) version of tensorflow.
+      output_leaf_index: whether to output leaf indices along with predictions
+        during inference. The leaf node indexes are available in predictions
+        dict by the key 'leaf_index'. It is a Tensor of rank 2 and its shape is
+        [batch_size, num_trees].
+        For example,
+        result_iter = classifier.predict(...)
+        for result_dict in result_iter:
+          # access leaf index list by result_dict["leaf_index"]
+          # which contains one leaf index per tree
+
     Raises:
       ValueError: If learner_config is not valid.
     """
@@ -74,7 +85,9 @@ class GradientBoostedDecisionTreeClassifier(estimator.Estimator):
       # supports second order derivative.
       def loss_fn(labels, logits, weights=None):
         result = losses.per_example_maxent_loss(
-            labels=labels, logits=logits, weights=weights,
+            labels=labels,
+            logits=logits,
+            weights=weights,
             num_classes=n_classes)
         return math_ops.reduce_mean(result[0])
     else:
@@ -102,6 +115,7 @@ class GradientBoostedDecisionTreeClassifier(estimator.Estimator):
             'center_bias': center_bias,
             'logits_modifier_function': logits_modifier_function,
             'use_core_libs': use_core_libs,
+            'output_leaf_index': output_leaf_index,
         },
         model_dir=model_dir,
         config=config,
@@ -124,7 +138,8 @@ class GradientBoostedDecisionTreeRegressor(estimator.Estimator):
                feature_engineering_fn=None,
                logits_modifier_function=None,
                center_bias=True,
-               use_core_libs=False):
+               use_core_libs=False,
+               output_leaf_index=False):
     """Initializes a GradientBoostedDecisionTreeRegressor estimator instance.
 
     Args:
@@ -151,6 +166,13 @@ class GradientBoostedDecisionTreeRegressor(estimator.Estimator):
         the bias.
       use_core_libs: Whether feature columns and loss are from the core (as
         opposed to contrib) version of tensorflow.
+      output_leaf_index: whether to output leaf indices along with predictions
+        during inference. The leaf node indexes are available in predictions
+        dict by the key 'leaf_index'. For example,
+        result_iter = regressor.predict(...)
+        for result_dict in result_iter:
+          # access leaf index list by result_dict["leaf_index"]
+          # which contains one leaf index per tree
     """
     head = head_lib.regression_head(
         label_name=label_name,
@@ -173,6 +195,7 @@ class GradientBoostedDecisionTreeRegressor(estimator.Estimator):
             'logits_modifier_function': logits_modifier_function,
             'center_bias': center_bias,
             'use_core_libs': use_core_libs,
+            'output_leaf_index': output_leaf_index,
         },
         model_dir=model_dir,
         config=config,
@@ -197,7 +220,8 @@ class GradientBoostedDecisionTreeEstimator(estimator.Estimator):
                feature_engineering_fn=None,
                logits_modifier_function=None,
                center_bias=True,
-               use_core_libs=False):
+               use_core_libs=False,
+               output_leaf_index=False):
     """Initializes a GradientBoostedDecisionTreeEstimator estimator instance.
 
     Args:
@@ -220,6 +244,13 @@ class GradientBoostedDecisionTreeEstimator(estimator.Estimator):
         the bias.
       use_core_libs: Whether feature columns and loss are from the core (as
         opposed to contrib) version of tensorflow.
+      output_leaf_index: whether to output leaf indices along with predictions
+        during inference. The leaf node indexes are available in predictions
+        dict by the key 'leaf_index'. For example,
+        result_iter = estimator.predict(...)
+        for result_dict in result_iter:
+          # access leaf index list by result_dict["leaf_index"]
+          # which contains one leaf index per tree
     """
     super(GradientBoostedDecisionTreeEstimator, self).__init__(
         model_fn=model.model_builder,
@@ -233,6 +264,7 @@ class GradientBoostedDecisionTreeEstimator(estimator.Estimator):
             'logits_modifier_function': logits_modifier_function,
             'center_bias': center_bias,
             'use_core_libs': use_core_libs,
+            'output_leaf_index': output_leaf_index,
         },
         model_dir=model_dir,
         config=config,
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py b/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py
index 0d58317bd5..75ef1b0500 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py
@@ -68,6 +68,28 @@ class BoostedTreeEstimatorTest(test_util.TensorFlowTestCase):
     classifier.evaluate(input_fn=_eval_input_fn, steps=1)
     classifier.export(self._export_dir_base)
 
+  def testThatLeafIndexIsInPredictions(self):
+    learner_config = learner_pb2.LearnerConfig()
+    learner_config.num_classes = 2
+    learner_config.constraints.max_tree_depth = 1
+    model_dir = tempfile.mkdtemp()
+    config = run_config.RunConfig()
+
+    classifier = estimator.GradientBoostedDecisionTreeClassifier(
+        learner_config=learner_config,
+        num_trees=1,
+        examples_per_layer=3,
+        model_dir=model_dir,
+        config=config,
+        feature_columns=[contrib_feature_column.real_valued_column("x")],
+        output_leaf_index=True)
+
+    classifier.fit(input_fn=_train_input_fn, steps=15)
+    result_iter = classifier.predict(input_fn=_eval_input_fn)
+    for prediction_dict in result_iter:
+      self.assertTrue("leaf_index" in prediction_dict)
+      self.assertTrue("logits" in prediction_dict)
+
   def testFitAndEvaluateDontThrowExceptionWithCoreForEstimator(self):
     learner_config = learner_pb2.LearnerConfig()
     learner_config.num_classes = 2
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/model.py b/tensorflow/contrib/boosted_trees/estimator_batch/model.py
index 15ab6d8145..1ee8911989 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/model.py
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/model.py
@@ -63,6 +63,8 @@ def model_builder(features, labels, mode, params, config):
   num_trees = params["num_trees"]
   use_core_libs = params["use_core_libs"]
   logits_modifier_function = params["logits_modifier_function"]
+  output_leaf_index = params["output_leaf_index"]
+
   if features is None:
     raise ValueError("At least one feature must be specified.")
 
@@ -96,7 +98,8 @@ def model_builder(features, labels, mode, params, config):
       feature_columns=feature_columns,
       logits_dimension=head.logits_dimension,
       features=training_features,
-      use_core_columns=use_core_libs)
+      use_core_columns=use_core_libs,
+      output_leaf_index=output_leaf_index)
   with ops.name_scope("gbdt", "gbdt_optimizer"):
     predictions_dict = gbdt_model.predict(mode)
     logits = predictions_dict["predictions"]
@@ -127,6 +130,9 @@ def model_builder(features, labels, mode, params, config):
         labels=labels,
         train_op_fn=_train_op_fn,
         logits=logits)
+  if output_leaf_index and gbdt_batch.LEAF_INDEX in predictions_dict:
+    model_fn_ops.predictions[gbdt_batch.LEAF_INDEX] = predictions_dict[
+        gbdt_batch.LEAF_INDEX]
   if num_trees:
     if center_bias:
       num_trees += 1
diff --git a/tensorflow/contrib/boosted_trees/kernels/prediction_ops.cc b/tensorflow/contrib/boosted_trees/kernels/prediction_ops.cc
index b3fe38614e..9493c1a139 100644
--- a/tensorflow/contrib/boosted_trees/kernels/prediction_ops.cc
+++ b/tensorflow/contrib/boosted_trees/kernels/prediction_ops.cc
@@ -59,6 +59,7 @@ const char* kApplyDropoutAttributeName = "apply_dropout";
 const char* kApplyAveragingAttributeName = "apply_averaging";
 const char* kDropoutInfoOutputTensorName = "drop_out_tree_indices_weights";
 const char* kPredictionsTensorName = "predictions";
+const char* kLeafIndexTensorName = "leaf_index";
 
 void CalculateTreesToInclude(
     const boosted_trees::trees::DecisionTreeEnsembleConfig& config,
@@ -170,15 +171,22 @@ class GradientTreesPredictionOp : public OpKernel {
     core::ScopedUnref unref_me(ensemble_resource);
     if (use_locking_) {
       tf_shared_lock l(*ensemble_resource->get_mutex());
-      DoCompute(context, ensemble_resource);
+      DoCompute(context, ensemble_resource,
+                /*return_output_leaf_index=*/false);
     } else {
-      DoCompute(context, ensemble_resource);
+      DoCompute(context, ensemble_resource,
+                /*return_output_leaf_index=*/false);
     }
   }
 
- private:
-  void DoCompute(OpKernelContext* context,
-                 DecisionTreeEnsembleResource* ensemble_resource) {
+ protected:
+  // return_output_leaf_index is a boolean variable indicating whether to output
+  // leaf index in prediction. Though this class invokes only with this param
+  // value as false, the subclass GradientTreesPredictionVerboseOp will invoke
+  // with the true value.
+  virtual void DoCompute(OpKernelContext* context,
+                         DecisionTreeEnsembleResource* ensemble_resource,
+                         const bool return_output_leaf_index) {
     // Read dense float features list;
     OpInputList dense_float_features_list;
     OP_REQUIRES_OK(context, TensorUtils::ReadDenseFloatFeatures(
@@ -267,6 +275,14 @@ class GradientTreesPredictionOp : public OpKernel {
                                           &output_predictions_t));
     auto output_predictions = output_predictions_t->matrix<float>();
 
+    // Allocate output leaf index matrix.
+    Tensor* output_leaf_index_t = nullptr;
+    if (return_output_leaf_index) {
+      OP_REQUIRES_OK(context, context->allocate_output(
+                                  kLeafIndexTensorName,
+                                  {batch_size, ensemble_resource->num_trees()},
+                                  &output_leaf_index_t));
+    }
     // Run predictor.
     thread::ThreadPool* const worker_threads =
         context->device()->tensorflow_cpu_worker_threads()->workers;
@@ -288,11 +304,13 @@ class GradientTreesPredictionOp : public OpKernel {
             i, weight * (num_ensembles - i + start_averaging) / num_ensembles);
       }
       MultipleAdditiveTrees::Predict(adjusted, trees_to_include, batch_features,
-                                     worker_threads, output_predictions);
+                                     worker_threads, output_predictions,
+                                     output_leaf_index_t);
     } else {
       MultipleAdditiveTrees::Predict(
           ensemble_resource->decision_tree_ensemble(), trees_to_include,
-          batch_features, worker_threads, output_predictions);
+          batch_features, worker_threads, output_predictions,
+          output_leaf_index_t);
     }
 
     // Output dropped trees and original weights.
@@ -302,7 +320,6 @@ class GradientTreesPredictionOp : public OpKernel {
                                 {2, static_cast<int64>(dropped_trees.size())},
                                 &output_dropout_info_t));
     auto output_dropout_info = output_dropout_info_t->matrix<float>();
-
     for (int32 i = 0; i < dropped_trees.size(); ++i) {
       output_dropout_info(0, i) = dropped_trees[i];
       output_dropout_info(1, i) = original_weights[i];
@@ -326,6 +343,27 @@ class GradientTreesPredictionOp : public OpKernel {
 REGISTER_KERNEL_BUILDER(Name("GradientTreesPrediction").Device(DEVICE_CPU),
                         GradientTreesPredictionOp);
 
+// GradientTreesPredictionVerboseOp is derived from GradientTreesPredictionOp
+// and has an additional output tensor of rank 2 containing, for each tree,
+// the id of the leaf in which an instance ended up.
+class GradientTreesPredictionVerboseOp : public GradientTreesPredictionOp {
+ public:
+  explicit GradientTreesPredictionVerboseOp(OpKernelConstruction* const context)
+      : GradientTreesPredictionOp(context) {}
+
+ protected:
+  void DoCompute(OpKernelContext* context,
+                 DecisionTreeEnsembleResource* ensemble_resource,
+                 bool return_output_leaf_index) override {
+    GradientTreesPredictionOp::DoCompute(context, ensemble_resource,
+                                         /*return_output_leaf_index=*/true);
+  }
+};
+
+REGISTER_KERNEL_BUILDER(
+    Name("GradientTreesPredictionVerbose").Device(DEVICE_CPU),
+    GradientTreesPredictionVerboseOp);
+
 class GradientTreesPartitionExamplesOp : public OpKernel {
  public:
   explicit GradientTreesPartitionExamplesOp(OpKernelConstruction* const context)
diff --git a/tensorflow/contrib/boosted_trees/lib/models/multiple_additive_trees.cc b/tensorflow/contrib/boosted_trees/lib/models/multiple_additive_trees.cc
index 43b00d4c6d..c9223afeab 100644
--- a/tensorflow/contrib/boosted_trees/lib/models/multiple_additive_trees.cc
+++ b/tensorflow/contrib/boosted_trees/lib/models/multiple_additive_trees.cc
@@ -26,7 +26,8 @@ void MultipleAdditiveTrees::Predict(
     const std::vector<int32>& trees_to_include,
     const boosted_trees::utils::BatchFeatures& features,
     tensorflow::thread::ThreadPool* const worker_threads,
-    tensorflow::TTypes<float>::Matrix output_predictions) {
+    tensorflow::TTypes<float>::Matrix output_predictions,
+    Tensor* const output_leaf_index) {
   // Zero out predictions as the model is additive.
   output_predictions.setZero();
 
@@ -38,8 +39,13 @@ void MultipleAdditiveTrees::Predict(
 
   // Lambda for doing a block of work.
   auto update_predictions = [&config, &features, &trees_to_include,
-                             &output_predictions](int64 start, int64 end) {
+                             &output_predictions,
+                             &output_leaf_index](int64 start, int64 end) {
     auto examples_iterable = features.examples_iterable(start, end);
+    Tensor dummy_tensor(DT_INT32, TensorShape({1, 1}));
+    tensorflow::TTypes<int>::Matrix output_leaf_index_mat =
+        output_leaf_index != nullptr ? output_leaf_index->matrix<int>()
+                                     : dummy_tensor.matrix<int>();
     for (const auto& example : examples_iterable) {
       for (const int32 tree_idx : trees_to_include) {
         const boosted_trees::trees::DecisionTreeConfig& tree =
@@ -47,6 +53,10 @@ void MultipleAdditiveTrees::Predict(
         const float tree_weight = config.tree_weights(tree_idx);
         const int leaf_idx = trees::DecisionTree::Traverse(tree, 0, example);
         QCHECK(leaf_idx >= 0) << "Invalid tree: " << tree.DebugString();
+        // Checks if output leaf tree index is required.
+        if (output_leaf_index != nullptr) {
+          output_leaf_index_mat(example.example_idx, tree_idx) = leaf_idx;
+        }
         const auto& leaf_node = tree.nodes(leaf_idx);
         QCHECK(leaf_node.has_leaf())
             << "Invalid leaf node: " << leaf_node.DebugString();
diff --git a/tensorflow/contrib/boosted_trees/lib/models/multiple_additive_trees.h b/tensorflow/contrib/boosted_trees/lib/models/multiple_additive_trees.h
index cc3dc226cd..940531c4ba 100644
--- a/tensorflow/contrib/boosted_trees/lib/models/multiple_additive_trees.h
+++ b/tensorflow/contrib/boosted_trees/lib/models/multiple_additive_trees.h
@@ -33,12 +33,17 @@ class MultipleAdditiveTrees {
  public:
   // Predict runs tree ensemble on the given batch and updates
   // output predictions accordingly, for the given list of trees.
+  // output_leaf_index is a pointer to a 2 dimensional tensor. If it is not
+  // nullptr, this method fills output_leaf_index with the per-tree leaf id
+  // in which each of the instances from 'features' ended up. Its shape is
+  // num examples X num of trees.
   static void Predict(
       const boosted_trees::trees::DecisionTreeEnsembleConfig& config,
       const std::vector<int32>& trees_to_include,
       const boosted_trees::utils::BatchFeatures& features,
       tensorflow::thread::ThreadPool* const worker_threads,
-      tensorflow::TTypes<float>::Matrix output_predictions);
+      tensorflow::TTypes<float>::Matrix output_predictions,
+      Tensor* const output_leaf_index);
 };
 
 }  // namespace models
diff --git a/tensorflow/contrib/boosted_trees/lib/models/multiple_additive_trees_test.cc b/tensorflow/contrib/boosted_trees/lib/models/multiple_additive_trees_test.cc
index 4ca18bedb1..462a9ac86f 100644
--- a/tensorflow/contrib/boosted_trees/lib/models/multiple_additive_trees_test.cc
+++ b/tensorflow/contrib/boosted_trees/lib/models/multiple_additive_trees_test.cc
@@ -62,7 +62,8 @@ TEST_F(MultipleAdditiveTreesTest, Empty) {
   tensorflow::thread::ThreadPool threads(tensorflow::Env::Default(), "test",
                                          kNumThreadsSingleThreaded);
   MultipleAdditiveTrees::Predict(tree_ensemble_config, {}, batch_features_,
-                                 &threads, output_matrix);
+                                 &threads, output_matrix,
+                                 /*output_leaf_index=*/nullptr);
   EXPECT_EQ(0, output_matrix(0, 0));
   EXPECT_EQ(0, output_matrix(1, 0));
 }
@@ -99,17 +100,38 @@ TEST_F(MultipleAdditiveTreesTest, SingleClass) {
   // Normal case.
   {
     MultipleAdditiveTrees::Predict(tree_ensemble_config, {0, 1},
-                                   batch_features_, &threads, output_matrix);
+                                   batch_features_, &threads, output_matrix,
+                                   /*output_leaf_index=*/nullptr);
     EXPECT_FLOAT_EQ(-0.2f, output_matrix(0, 0));  // -0.4 (bias) + 0.2 (leaf 2).
     EXPECT_FLOAT_EQ(0.5f, output_matrix(1, 0));   // -0.4 (bias) + 0.9 (leaf 1).
   }
+  // Normal case with leaf node.
+  {
+    // Allocate the output leaf index tensor. Since there are 2 examples and
+    // 2 trees, its shape is 2 x 2. It is not pre-initialized: Predict is
+    // expected to write every entry because both trees are included.
+    Tensor output_leaf_index_tensor(DT_INT32, TensorShape({2, 2}));
+    MultipleAdditiveTrees::Predict(tree_ensemble_config, {0, 1},
+                                   batch_features_, &threads, output_matrix,
+                                   &output_leaf_index_tensor);
+    EXPECT_FLOAT_EQ(-0.2f, output_matrix(0, 0));  // -0.4 (bias) + 0.2 (leaf 2).
+    EXPECT_FLOAT_EQ(0.5f, output_matrix(1, 0));   // -0.4 (bias) + 0.9 (leaf 1).
+    EXPECT_FLOAT_EQ(0, output_leaf_index_tensor.matrix<int>()(
+                           0, 0));  // 1st leaf for the first example
+    EXPECT_FLOAT_EQ(0, output_leaf_index_tensor.matrix<int>()(
+                           1, 0));  // 1st leaf for the second example
+    EXPECT_FLOAT_EQ(2, output_leaf_index_tensor.matrix<int>()(
+                           0, 1));  // 2nd leaf for the first example
+    EXPECT_FLOAT_EQ(1, output_leaf_index_tensor.matrix<int>()(
+                           1, 1));  // 2nd leaf for the second example
+  }
   // Weighted case
   {
     DecisionTreeEnsembleConfig weighted = tree_ensemble_config;
     weighted.set_tree_weights(0, 6.0);
     weighted.set_tree_weights(1, 3.2);
     MultipleAdditiveTrees::Predict(weighted, {0, 1}, batch_features_, &threads,
-                                   output_matrix);
+                                   output_matrix, nullptr);
     // -0.4 (bias) + 0.2 (leaf 2).
     EXPECT_FLOAT_EQ(-0.4f * 6 + 0.2 * 3.2, output_matrix(0, 0));
     // -0.4 (bias) + 0.9 (leaf 1).
@@ -118,21 +140,21 @@ TEST_F(MultipleAdditiveTreesTest, SingleClass) {
   // Drop first tree.
   {
     MultipleAdditiveTrees::Predict(tree_ensemble_config, {1}, batch_features_,
-                                   &threads, output_matrix);
+                                   &threads, output_matrix, nullptr);
     EXPECT_FLOAT_EQ(0.2f, output_matrix(0, 0));  // 0.2 (leaf 2).
     EXPECT_FLOAT_EQ(0.9f, output_matrix(1, 0));  // 0.9 (leaf 1).
   }
   // Drop second tree.
   {
     MultipleAdditiveTrees::Predict(tree_ensemble_config, {0}, batch_features_,
-                                   &threads, output_matrix);
+                                   &threads, output_matrix, nullptr);
     EXPECT_FLOAT_EQ(-0.4f, output_matrix(0, 0));  // -0.4 (bias).
     EXPECT_FLOAT_EQ(-0.4f, output_matrix(1, 0));  // -0.4 (bias).
   }
   // Drop all trees.
   {
     MultipleAdditiveTrees::Predict(tree_ensemble_config, {}, batch_features_,
-                                   &threads, output_matrix);
+                                   &threads, output_matrix, nullptr);
     EXPECT_FLOAT_EQ(0.0, output_matrix(0, 0));
     EXPECT_FLOAT_EQ(0.0, output_matrix(1, 0));
   }
@@ -172,7 +194,8 @@ TEST_F(MultipleAdditiveTreesTest, MultiClass) {
   // Normal case.
   {
     MultipleAdditiveTrees::Predict(tree_ensemble_config, {0, 1},
-                                   batch_features_, &threads, output_matrix);
+                                   batch_features_, &threads, output_matrix,
+                                   nullptr);
     EXPECT_FLOAT_EQ(-0.4f, output_matrix(0, 0));  // -0.4 (bias)
     EXPECT_FLOAT_EQ(-0.5f, output_matrix(0, 1));  // -0.7 (bias) + 0.2 (leaf 2)
     EXPECT_FLOAT_EQ(0.5f, output_matrix(1, 0));   // -0.4 (bias) + 0.9 (leaf 1)
@@ -184,7 +207,7 @@ TEST_F(MultipleAdditiveTreesTest, MultiClass) {
     weighted.set_tree_weights(0, 6.0);
     weighted.set_tree_weights(1, 3.2);
     MultipleAdditiveTrees::Predict(weighted, {0, 1}, batch_features_, &threads,
-                                   output_matrix);
+                                   output_matrix, nullptr);
     // bias
     EXPECT_FLOAT_EQ(-0.4f * 6, output_matrix(0, 0));
     // bias + leaf 2
@@ -197,7 +220,7 @@ TEST_F(MultipleAdditiveTreesTest, MultiClass) {
   // Dropout first tree.
   {
     MultipleAdditiveTrees::Predict(tree_ensemble_config, {1}, batch_features_,
-                                   &threads, output_matrix);
+                                   &threads, output_matrix, nullptr);
     EXPECT_FLOAT_EQ(0.0, output_matrix(0, 0));
     EXPECT_FLOAT_EQ(0.2f, output_matrix(0, 1));  // 0.2 (leaf 2)
     EXPECT_FLOAT_EQ(0.9f, output_matrix(1, 0));  // 0.9 (leaf 2)
@@ -206,7 +229,7 @@ TEST_F(MultipleAdditiveTreesTest, MultiClass) {
   // Dropout second tree.
   {
     MultipleAdditiveTrees::Predict(tree_ensemble_config, {0}, batch_features_,
-                                   &threads, output_matrix);
+                                   &threads, output_matrix, nullptr);
     EXPECT_FLOAT_EQ(-0.4f, output_matrix(0, 0));  // -0.4 (bias)
     EXPECT_FLOAT_EQ(-0.7f, output_matrix(0, 1));  // -0.7 (bias)
     EXPECT_FLOAT_EQ(-0.4f, output_matrix(1, 0));  // -0.4 (bias)
@@ -215,7 +238,7 @@ TEST_F(MultipleAdditiveTreesTest, MultiClass) {
   // Drop both trees.
   {
     MultipleAdditiveTrees::Predict(tree_ensemble_config, {}, batch_features_,
-                                   &threads, output_matrix);
+                                   &threads, output_matrix, nullptr);
     EXPECT_FLOAT_EQ(0.0f, output_matrix(0, 0));
     EXPECT_FLOAT_EQ(0.0f, output_matrix(0, 1));
     EXPECT_FLOAT_EQ(0.0f, output_matrix(1, 0));
@@ -258,7 +281,8 @@ TEST_F(MultipleAdditiveTreesTest, DenseLeaves) {
   // Normal case.
   {
     MultipleAdditiveTrees::Predict(tree_ensemble_config, {0, 1},
-                                   batch_features_, &threads, output_matrix);
+                                   batch_features_, &threads, output_matrix,
+                                   nullptr);
     EXPECT_FLOAT_EQ(-0.2f, output_matrix(0, 0));  // -0.4 (tree1) + 0.2 (leaf 2)
     EXPECT_FLOAT_EQ(-0.4f, output_matrix(0, 1));  // -0.7 (tree1) + 0.3 (leaf 2)
     EXPECT_FLOAT_EQ(3.4f, output_matrix(0, 2));   // 3.0 -(tree1) + 0.4 (leaf 2)
diff --git a/tensorflow/contrib/boosted_trees/ops/prediction_ops.cc b/tensorflow/contrib/boosted_trees/ops/prediction_ops.cc
index d66f645f62..6491d58794 100644
--- a/tensorflow/contrib/boosted_trees/ops/prediction_ops.cc
+++ b/tensorflow/contrib/boosted_trees/ops/prediction_ops.cc
@@ -40,6 +40,24 @@ static Status ApplyGradientTreesPredictionShapeFn(InferenceContext* c) {
   return Status::OK();
 }
 
+static Status ApplyGradientTreesPredictionVerboseShapeFn(InferenceContext* c) {
+  string learner_config_str;
+  c->GetAttr("learner_config", &learner_config_str).IgnoreError();
+  LearnerConfig learner_config;
+  ParseProtoUnlimited(&learner_config, learner_config_str);
+
+  bool reduce_dim;
+  c->GetAttr("reduce_dim", &reduce_dim).IgnoreError();
+  // Sets the shape of the output as a matrix.
+  c->set_output(0, {c->Matrix(InferenceContext::kUnknownDim,
+                              reduce_dim ? learner_config.num_classes() - 1
+                                         : learner_config.num_classes())});
+  c->set_output(1, {c->UnknownShape()});
+  c->set_output(2, {c->Matrix(InferenceContext::kUnknownDim,
+                              InferenceContext::kUnknownDim)});
+  return Status::OK();
+}
+
 REGISTER_OP("GradientTreesPrediction")
     .Attr("learner_config: string")
     .Attr("num_dense_float_features: int >= 0")
@@ -90,6 +108,58 @@ drop_out_tree_indices_weights: Tensor of Rank 2 containing dropped trees indices
 and original weights of those trees during prediction.
 )doc");
 
+REGISTER_OP("GradientTreesPredictionVerbose")
+    .Attr("learner_config: string")
+    .Attr("num_dense_float_features: int >= 0")
+    .Attr("num_sparse_float_features: int >= 0")
+    .Attr("num_sparse_int_features: int >= 0")
+    .Attr("use_locking: bool = false")
+    .Attr("apply_dropout: bool")
+    .Attr("apply_averaging: bool")
+    .Attr("center_bias: bool")
+    .Attr("reduce_dim: bool")
+    .Input("tree_ensemble_handle: resource")
+    .Input("seed: int64")
+    .Input("dense_float_features: num_dense_float_features * float")
+    .Input("sparse_float_feature_indices: num_sparse_float_features * int64")
+    .Input("sparse_float_feature_values: num_sparse_float_features * float")
+    .Input("sparse_float_feature_shapes: num_sparse_float_features * int64")
+    .Input("sparse_int_feature_indices: num_sparse_int_features * int64")
+    .Input("sparse_int_feature_values: num_sparse_int_features * int64")
+    .Input("sparse_int_feature_shapes: num_sparse_int_features * int64")
+    .Output("predictions: float")
+    .Output("drop_out_tree_indices_weights: float")
+    .Output("leaf_index: int32")
+    .SetShapeFn(ApplyGradientTreesPredictionVerboseShapeFn)
+    .Doc(R"doc(
+Runs multiple additive regression forests predictors on input instances
+and computes the final prediction for each class, and outputs a matrix of
+leaf ids per each tree in an ensemble.
+
+learner_config: Config for the learner of type LearnerConfig proto. Prediction
+ops for now uses only LearningRateDropoutDrivenConfig config from the learner.
+num_dense_float_features: Number of dense float features.
+num_sparse_float_features: Number of sparse float features.
+num_sparse_int_features: Number of sparse int features.
+use_locking: Whether to use locking.
+seed: random seed to be used for dropout.
+reduce_dim: whether to reduce the dimension (legacy impl) or not.
+apply_dropout: whether to apply dropout during prediction.
+apply_averaging: whether averaging of tree ensembles should take place. If set
+to true, will be based on AveragingConfig from learner_config.
+tree_ensemble_handle: The handle to the tree ensemble.
+dense_float_features: Rank 2 Tensors containing dense float feature values.
+sparse_float_feature_indices: Rank 2 Tensors containing sparse float indices.
+sparse_float_feature_values: Rank 1 Tensors containing sparse float values.
+sparse_float_feature_shapes: Rank 1 Tensors containing sparse float shapes.
+sparse_int_feature_indices: Rank 2 Tensors containing sparse int indices.
+sparse_int_feature_values: Rank 1 Tensors containing sparse int values.
+sparse_int_feature_shapes: Rank 1 Tensors containing sparse int shapes.
+predictions: Rank 2 Tensor containing predictions per example per class.
+drop_out_tree_indices_weights: Tensor of Rank 2 containing dropped trees indices and original weights of those trees during prediction.
+leaf_index: tensor of rank 2 containing leaf ids for each tree where an instance ended up.
+)doc");
+
 REGISTER_OP("GradientTreesPartitionExamples")
     .Attr("num_dense_float_features: int >= 0")
     .Attr("num_sparse_float_features: int >= 0")
diff --git a/tensorflow/contrib/boosted_trees/python/ops/prediction_ops.py b/tensorflow/contrib/boosted_trees/python/ops/prediction_ops.py
index 58f0d36b0f..7f6e55ae58 100644
--- a/tensorflow/contrib/boosted_trees/python/ops/prediction_ops.py
+++ b/tensorflow/contrib/boosted_trees/python/ops/prediction_ops.py
@@ -21,4 +21,5 @@ from __future__ import print_function
 from tensorflow.contrib.boosted_trees.python.ops import boosted_trees_ops_loader
 from tensorflow.contrib.boosted_trees.python.ops.gen_prediction_ops import gradient_trees_partition_examples
 from tensorflow.contrib.boosted_trees.python.ops.gen_prediction_ops import gradient_trees_prediction
+from tensorflow.contrib.boosted_trees.python.ops.gen_prediction_ops import gradient_trees_prediction_verbose
 # pylint: enable=unused-import
diff --git a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py
index 5dd2e0c7f2..47698d45c8 100644
--- a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py
+++ b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py
@@ -58,6 +58,7 @@ NUM_LAYERS_ATTEMPTED = "num_layers"
 NUM_TREES_ATTEMPTED = "num_trees"
 NUM_USED_HANDLERS = "num_used_handlers"
 USED_HANDLERS_MASK = "used_handlers_mask"
+LEAF_INDEX = "leaf_index"
 _FEATURE_NAME_TEMPLATE = "%s_%d"
 
 
@@ -71,18 +72,24 @@ def _get_column_by_index(tensor, indices):
   return array_ops.reshape(array_ops.gather(p_flat, i_flat), [shape[0], -1])
 
 
-def _make_predictions_dict(stamp, logits, partition_ids, ensemble_stats,
-                           used_handlers):
+def _make_predictions_dict(stamp,
+                           logits,
+                           partition_ids,
+                           ensemble_stats,
+                           used_handlers,
+                           leaf_index=None):
   """Returns predictions for the given logits and n_classes.
 
   Args:
     stamp: The ensemble stamp.
-    logits: A rank 2 `Tensor` with shape [batch_size, n_classes - 1].
-        that contains predictions when no dropout was applied.
+    logits: A rank 2 `Tensor` with shape [batch_size, n_classes - 1] that
+      contains predictions when no dropout was applied.
     partition_ids: A rank 1 `Tensor` with shape [batch_size].
     ensemble_stats: A TreeEnsembleStatsOp result tuple.
     used_handlers: A TreeEnsembleUsedHandlerOp result tuple of an int and a
-        boolean mask..
+      boolean mask.
+    leaf_index: A rank 2 `Tensor` with shape [batch_size, number of trees] that
+      contains the leaf id for each example prediction.
 
   Returns:
     A dict of predictions.
@@ -95,6 +102,8 @@ def _make_predictions_dict(stamp, logits, partition_ids, ensemble_stats,
   result[NUM_TREES_ATTEMPTED] = ensemble_stats.attempted_trees
   result[NUM_USED_HANDLERS] = used_handlers.num_used_handlers
   result[USED_HANDLERS_MASK] = used_handlers.used_handlers_mask
+  if leaf_index is not None:
+    result[LEAF_INDEX] = leaf_index
   return result
 
 
@@ -268,7 +277,8 @@ class GradientBoostedDecisionTreeModel(object):
                features,
                logits_dimension,
                feature_columns=None,
-               use_core_columns=False):
+               use_core_columns=False,
+               output_leaf_index=False):
     """Construct a new GradientBoostedDecisionTreeModel function.
 
     Args:
@@ -276,13 +286,15 @@ class GradientBoostedDecisionTreeModel(object):
       num_ps_replicas: Number of parameter server replicas, can be 0.
       ensemble_handle: A handle to the ensemble variable.
       center_bias: Whether to center the bias before growing trees.
-      examples_per_layer: Number of examples to accumulate before growing
-        a tree layer. It can also be a function that computes the number of
-        examples based on the depth of the layer that's being built.
+      examples_per_layer: Number of examples to accumulate before growing a tree
+        layer. It can also be a function that computes the number of examples
+        based on the depth of the layer that's being built.
       learner_config: A learner config.
       features: `dict` of `Tensor` objects.
       logits_dimension: An int, the dimension of logits.
       feature_columns: A list of feature columns.
+      output_leaf_index: A boolean variable indicating whether to output leaf
+        index into predictions dictionary.
 
     Raises:
       ValueError: if inputs are not valid.
@@ -359,6 +371,7 @@ class GradientBoostedDecisionTreeModel(object):
         self._learner_config.multi_class_strategy ==
         learner_pb2.LearnerConfig.TREE_PER_CLASS and
         learner_config.num_classes == 2)
+    self._output_leaf_index = output_leaf_index
 
   def _predict_and_return_dict(self, ensemble_handle, ensemble_stamp, mode):
     """Runs prediction and returns a dictionary of the prediction results.
@@ -388,22 +401,44 @@ class GradientBoostedDecisionTreeModel(object):
     # Make sure ensemble stats run. This will check that the ensemble has
     # the right stamp.
     with ops.control_dependencies(ensemble_stats):
-      predictions, _ = prediction_ops.gradient_trees_prediction(
-          ensemble_handle,
-          seed,
-          self._dense_floats,
-          self._sparse_float_indices,
-          self._sparse_float_values,
-          self._sparse_float_shapes,
-          self._sparse_int_indices,
-          self._sparse_int_values,
-          self._sparse_int_shapes,
-          learner_config=self._learner_config_serialized,
-          apply_dropout=apply_dropout,
-          apply_averaging=mode != learn.ModeKeys.TRAIN,
-          use_locking=True,
-          center_bias=self._center_bias,
-          reduce_dim=self._reduce_dim)
+      leaf_index = None
+      # Only used in infer (predict), not used in train and eval.
+      if self._output_leaf_index and mode == learn.ModeKeys.INFER:
+        predictions, _, leaf_index = (
+            prediction_ops).gradient_trees_prediction_verbose(
+                ensemble_handle,
+                seed,
+                self._dense_floats,
+                self._sparse_float_indices,
+                self._sparse_float_values,
+                self._sparse_float_shapes,
+                self._sparse_int_indices,
+                self._sparse_int_values,
+                self._sparse_int_shapes,
+                learner_config=self._learner_config_serialized,
+                apply_dropout=apply_dropout,
+                apply_averaging=mode != learn.ModeKeys.TRAIN,
+                use_locking=True,
+                center_bias=self._center_bias,
+                reduce_dim=self._reduce_dim)
+      else:
+        leaf_index = None
+        predictions, _ = prediction_ops.gradient_trees_prediction(
+            ensemble_handle,
+            seed,
+            self._dense_floats,
+            self._sparse_float_indices,
+            self._sparse_float_values,
+            self._sparse_float_shapes,
+            self._sparse_int_indices,
+            self._sparse_int_values,
+            self._sparse_int_shapes,
+            learner_config=self._learner_config_serialized,
+            apply_dropout=apply_dropout,
+            apply_averaging=mode != learn.ModeKeys.TRAIN,
+            use_locking=True,
+            center_bias=self._center_bias,
+            reduce_dim=self._reduce_dim)
       partition_ids = prediction_ops.gradient_trees_partition_examples(
           ensemble_handle,
           self._dense_floats,
@@ -416,7 +451,7 @@ class GradientBoostedDecisionTreeModel(object):
           use_locking=True)
 
     return _make_predictions_dict(ensemble_stamp, predictions, partition_ids,
-                                  ensemble_stats, used_handlers)
+                                  ensemble_stats, used_handlers, leaf_index)
 
   def predict(self, mode):
     """Returns predictions given the features and mode.
diff --git a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch_test.py b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch_test.py
index 289fb195db..e3d4397fad 100644
--- a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch_test.py
+++ b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch_test.py
@@ -19,18 +19,15 @@ from __future__ import division
 from __future__ import print_function
 
 from google.protobuf import text_format
-
 from tensorflow.contrib import layers
 from tensorflow.contrib.boosted_trees.proto import learner_pb2
 from tensorflow.contrib.boosted_trees.proto import tree_config_pb2
 from tensorflow.contrib.boosted_trees.python.ops import model_ops
 from tensorflow.contrib.boosted_trees.python.training.functions import gbdt_batch
 from tensorflow.contrib.boosted_trees.python.utils import losses
-
-from tensorflow.python.feature_column import feature_column_lib as core_feature_column
 from tensorflow.contrib.layers.python.layers import feature_column as feature_column_lib
 from tensorflow.contrib.learn.python.learn.estimators import model_fn
-
+from tensorflow.python.feature_column import feature_column_lib as core_feature_column
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import test_util
@@ -782,6 +779,118 @@ class GbdtTest(test_util.TensorFlowTestCase):
                           [[0.25], [0.25], [0.25], [0.25]])
       self.assertAllClose(predictions_dict["partition_ids"], [0, 0, 0, 0])
 
+  def testPredictFnWithLeafIndexAdvancedLeft(self):
+    """Tests the predict function with output leaf ids."""
+    with self.test_session() as sess:
+      # Create ensemble with two trees, each with one split and two leaves.
+      ensemble_config = tree_config_pb2.DecisionTreeEnsembleConfig()
+      text_format.Merge(
+          """
+          trees {
+            nodes {
+                dense_float_binary_split {
+                  threshold: 1.0
+                  left_id: 1
+                  right_id: 2
+                }
+                node_metadata {
+                  gain: 0
+                }
+              }
+              nodes {
+                leaf {
+                  vector {
+                    value: 0.25
+                  }
+                }
+              }
+              nodes {
+                leaf {
+                  vector {
+                    value: 0.15
+                  }
+                }
+              }
+          }
+          trees {
+            nodes {
+                dense_float_binary_split {
+                  threshold: 0.99
+                  left_id: 1
+                  right_id: 2
+                }
+                node_metadata {
+                  gain: 00
+                }
+              }
+              nodes {
+                leaf {
+                  vector {
+                    value: 0.25
+                  }
+                }
+              }
+              nodes {
+                leaf {
+                  vector {
+                    value: 0.23
+                  }
+                }
+              }
+          }
+          tree_weights: 1.0
+          tree_weights: 1.0
+          tree_metadata {
+            num_tree_weight_updates: 1
+            num_layers_grown: 1
+            is_finalized: true
+          }
+          tree_metadata {
+            num_tree_weight_updates: 1
+            num_layers_grown: 1
+            is_finalized: true
+          }""", ensemble_config)
+      ensemble_handle = model_ops.tree_ensemble_variable(
+          stamp_token=3,
+          tree_ensemble_config=ensemble_config.SerializeToString(),
+          name="tree_ensemble")
+      resources.initialize_resources(resources.shared_resources()).run()
+      learner_config = learner_pb2.LearnerConfig()
+      learner_config.learning_rate_tuner.fixed.learning_rate = 0.1
+      learner_config.num_classes = 2
+      learner_config.regularization.l1 = 0
+      learner_config.regularization.l2 = 0
+      learner_config.constraints.max_tree_depth = 1
+      learner_config.constraints.min_node_weight = 0
+      features = {}
+      features["dense_float"] = array_ops.constant(
+          [[0.0], [1.0], [1.1], [2.0]], dtype=dtypes.float32)
+      gbdt_model = gbdt_batch.GradientBoostedDecisionTreeModel(
+          is_chief=False,
+          num_ps_replicas=0,
+          center_bias=True,
+          ensemble_handle=ensemble_handle,
+          examples_per_layer=1,
+          learner_config=learner_config,
+          logits_dimension=1,
+          features=features,
+          output_leaf_index=True)
+
+      # Create predict op.
+      mode = model_fn.ModeKeys.INFER
+      predictions_dict = sess.run(gbdt_model.predict(mode))
+      self.assertEquals(predictions_dict["ensemble_stamp"], 3)
+      # Here is how the numbers in the expected results are calculated:
+      # 0.5 = 0.25 + 0.25
+      # 0.48 = 0.25 + 0.23
+      # 0.38 = 0.15 + 0.23
+      # 0.38 = 0.15 + 0.23
+      self.assertAllClose(predictions_dict["predictions"],
+                          [[0.5], [0.48], [0.38], [0.38]])
+      self.assertAllClose(predictions_dict["partition_ids"], [0, 0, 0, 0])
+      self.assertAllClose(predictions_dict["leaf_index"],
+                          [[1, 1], [1, 2], [2, 2], [2, 2]])
+
   def testTrainFnMulticlassFullHessian(self):
     """Tests the GBDT train for multiclass full hessian."""
     with self.test_session() as sess:
-- 
GitLab


From 8b460629e51356485d4da80d81f22e5911a64788 Mon Sep 17 00:00:00 2001
From: Alexandre Passos <apassos@google.com>
Date: Wed, 6 Jun 2018 12:37:18 -0700
Subject: [PATCH 385/610] Fixes eager safety problems with tf.contrib.lookup

PiperOrigin-RevId: 199511303
---
 tensorflow/contrib/lookup/lookup_ops_test.py | 20 ++++++++++++++------
 tensorflow/python/ops/lookup_ops.py          |  8 ++++++++
 2 files changed, 22 insertions(+), 6 deletions(-)

diff --git a/tensorflow/contrib/lookup/lookup_ops_test.py b/tensorflow/contrib/lookup/lookup_ops_test.py
index 5d4682ec9f..5a080cceab 100644
--- a/tensorflow/contrib/lookup/lookup_ops_test.py
+++ b/tensorflow/contrib/lookup/lookup_ops_test.py
@@ -24,6 +24,7 @@ import six
 
 from tensorflow.contrib import lookup
 from tensorflow.python.client import session
+from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
@@ -1396,15 +1397,22 @@ class KeyValueTensorInitializerTest(test.TestCase):
 
 class IndexTableFromTensor(test.TestCase):
 
+  @test_util.run_in_graph_and_eager_modes()
   def test_index_table_from_tensor_with_tensor_init(self):
-    with self.test_session():
+    table = lookup.index_table_from_tensor(
+        mapping=("brain", "salad", "surgery"), num_oov_buckets=1)
+
+    if not context.executing_eagerly():
+      with self.assertRaises(errors_impl.OpError):
+        self.evaluate(table.lookup(
+            constant_op.constant(("salad", "surgery", "tarkus"))))
+    else:
+      # Reinitializing a table in eager should work.
       table = lookup.index_table_from_tensor(
           mapping=("brain", "salad", "surgery"), num_oov_buckets=1)
-      ids = table.lookup(constant_op.constant(("salad", "surgery", "tarkus")))
-
-      self.assertRaises(errors_impl.OpError, ids.eval)
-      lookup_ops.tables_initializer().run()
-      self.assertAllEqual((1, 2, 3), ids.eval())
+    self.evaluate(lookup_ops.tables_initializer())
+    ids = table.lookup(constant_op.constant(("salad", "surgery", "tarkus")))
+    self.assertAllEqual((1, 2, 3), self.evaluate(ids))
 
   def test_int32_index_table_from_tensor_with_tensor_init(self):
     with self.test_session():
diff --git a/tensorflow/python/ops/lookup_ops.py b/tensorflow/python/ops/lookup_ops.py
index 0e547689cc..fb51fbc626 100644
--- a/tensorflow/python/ops/lookup_ops.py
+++ b/tensorflow/python/ops/lookup_ops.py
@@ -366,6 +366,10 @@ class KeyValueTensorInitializer(TableInitializerBase):
     with ops.name_scope(
         self._name, values=(table.table_ref, self._keys,
                             self._values)) as scope:
+      if context.executing_eagerly():
+        # Ensure a unique name when eager execution is enabled to avoid spurious
+        # sharing issues.
+        scope += str(ops.uid())
       init_op = gen_lookup_ops.initialize_table_v2(
           table.table_ref, self._keys, self._values, name=scope)
     ops.add_to_collection(ops.GraphKeys.TABLE_INITIALIZERS, init_op)
@@ -1108,6 +1112,10 @@ def index_table_from_tensor(vocabulary_list,
 
     shared_name = ""
     with ops.name_scope(None, "hash_table") as hash_table_scope:
+      if context.executing_eagerly():
+        # Ensure a unique name when eager execution is enabled to avoid spurious
+        # sharing issues.
+        shared_name += str(ops.uid())
       table_keys = math_ops.to_int64(keys) if keys.dtype.is_integer else keys
       init = KeyValueTensorInitializer(
           table_keys,
-- 
GitLab


From 8f2e5f0b4a0221ca1573a40a68077326a32c9bc0 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 6 Jun 2018 12:39:44 -0700
Subject: [PATCH 386/610] [TF:XLA] Add a implementation of RandomShuffle.

PiperOrigin-RevId: 199511721
---
 tensorflow/compiler/tests/BUILD               |  2 +
 tensorflow/compiler/tests/random_ops_test.py  | 38 ++++++--
 .../compiler/tf2xla/kernels/random_ops.cc     | 92 +++++++++++++++++++
 3 files changed, 126 insertions(+), 6 deletions(-)

diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD
index b51c11bf6e..e6c92f9720 100644
--- a/tensorflow/compiler/tests/BUILD
+++ b/tensorflow/compiler/tests/BUILD
@@ -545,7 +545,9 @@ tf_xla_py_test(
     ],
     deps = [
         ":xla_test",
+        "//tensorflow/python:array_ops",
         "//tensorflow/python:framework",
+        "//tensorflow/python:math_ops",
         "//tensorflow/python:platform_test",
         "//tensorflow/python:random_ops",
     ],
diff --git a/tensorflow/compiler/tests/random_ops_test.py b/tensorflow/compiler/tests/random_ops_test.py
index 70be22936a..f13dff9620 100644
--- a/tensorflow/compiler/tests/random_ops_test.py
+++ b/tensorflow/compiler/tests/random_ops_test.py
@@ -22,6 +22,8 @@ import numpy as np
 
 from tensorflow.compiler.tests.xla_test import XLATestCase
 from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.platform import googletest
 
@@ -47,18 +49,18 @@ class RandomOpsTest(XLATestCase):
       # We use exact equality here. If the random-number generator is producing
       # deterministic output, all three outputs will be bitwise identical.
       self.assertTrue((not np.array_equal(y, z)) or
-                      (not np.array_equal(z, w)) or
-                      (not np.array_equal(y, w)))
+                      (not np.array_equal(z, w)) or (not np.array_equal(y, w)))
 
   def testRandomUniformIsNotConstant(self):
+
     def rng(dtype):
-      return random_ops.random_uniform(shape=[2], dtype=dtype,
-                                       maxval=1000000)
+      return random_ops.random_uniform(shape=[2], dtype=dtype, maxval=1000000)
 
     for dtype in self._random_types():
       self._testRngIsNotConstant(rng, dtype)
 
   def testRandomNormalIsNotConstant(self):
+
     def rng(dtype):
       return random_ops.random_normal(shape=[2], dtype=dtype)
 
@@ -70,13 +72,14 @@ class RandomOpsTest(XLATestCase):
     for dtype in self._random_types():
       with self.test_session() as sess:
         with self.test_scope():
-          x = random_ops.random_uniform(shape=[1000], dtype=dtype, minval=-2,
-                                        maxval=33)
+          x = random_ops.random_uniform(
+              shape=[1000], dtype=dtype, minval=-2, maxval=33)
         y = sess.run(x)
         self.assertTrue((y >= -2).sum() == 1000)
         self.assertTrue((y < 33).sum() == 1000)
 
   def testTruncatedNormalIsNotConstant(self):
+
     def rng(dtype):
       return random_ops.truncated_normal(shape=[2], dtype=dtype)
 
@@ -94,6 +97,29 @@ class RandomOpsTest(XLATestCase):
         self.assertTrue((y >= -2).sum() == count)
         self.assertTrue((y <= 2).sum() == count)
 
+  def testShuffle1d(self):
+    with self.test_session() as sess:
+      with self.test_scope():
+        x = math_ops.range(20)
+        shuffle = random_ops.random_shuffle(x)
+      result = sess.run(shuffle)
+      expected = range(20)
+      # Compare as sets so the test does not depend on the shuffle order but
+      # still checks that all the values are present.
+      self.assertAllEqual(set(result), set(expected))
+
+  def testShuffle2d(self):
+    with self.test_session() as sess:
+      with self.test_scope():
+        x = array_ops.diag(math_ops.range(20))
+        shuffle = random_ops.random_shuffle(x)
+      result = sess.run(shuffle)
+      expected = np.diag(range(20)).flatten()
+      # Compare as sets so the test does not depend on the shuffle order but
+      # still checks that all the values are present.
+      self.assertAllEqual(len(result.flatten()), len(expected))
+      self.assertAllEqual(set(result.flatten()), set(expected))
+
 
 if __name__ == '__main__':
   googletest.main()
diff --git a/tensorflow/compiler/tf2xla/kernels/random_ops.cc b/tensorflow/compiler/tf2xla/kernels/random_ops.cc
index 39149d56ad..ebac5c4396 100644
--- a/tensorflow/compiler/tf2xla/kernels/random_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/random_ops.cc
@@ -17,6 +17,8 @@ limitations under the License.
 // TODO(misard,phawkins): handle random number generator seeds/states correctly.
 // TODO(misard,phawkins): add tests.
 
+#include "tensorflow/compiler/tf2xla/kernels/gather_op_helpers.h"
+#include "tensorflow/compiler/tf2xla/lib/util.h"
 #include "tensorflow/compiler/tf2xla/lib/while_loop.h"
 #include "tensorflow/compiler/tf2xla/shape_util.h"
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
@@ -56,6 +58,96 @@ class RandomUniformOp : public XlaOpKernel {
 REGISTER_XLA_OP(Name("RandomUniform").CompileTimeConstInput("shape"),
                 RandomUniformOp);
 
+class RandomShuffleOp : public XlaOpKernel {
+ public:
+  explicit RandomShuffleOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    auto builder = ctx->builder();
+    xla::XlaOp input = ctx->Input(0);
+    TensorShape input_shape = ctx->InputShape(0);
+    const int64 n = input_shape.dim_size(0);
+    int64 num_elements = 1;
+    for (tensorflow::TensorShapeDim dimension : input_shape) {
+      num_elements *= dimension.size;
+    }
+    if (num_elements <= 1 || n <= 1) {
+      // No shuffling is required, so copy input directly to output
+      ctx->SetOutput(0, input);
+    } else {
+      // Generate the random swaps for the indices.
+      auto zero = builder->Broadcast(
+          builder->ConstantLiteral(xla::Literal::Zero(xla::S32)),
+          gtl::ArraySlice<int64>({n}));
+      auto n_maxval = builder->Broadcast(builder->ConstantR0<int32>(n),
+                                         gtl::ArraySlice<int64>({n}));
+      auto swaps_shape = xla::ShapeUtil::MakeShape(xla::S32, {n});
+      auto swaps = builder->RngUniform(zero, n_maxval, swaps_shape);
+
+      // Generate range(n) as the initial value for the indices to be swapped.
+      auto index_init_body_fn = [&](xla::XlaOp i,
+                                    gtl::ArraySlice<xla::XlaOp> loop_vars,
+                                    xla::XlaBuilder* builder)
+          -> xla::StatusOr<std::vector<xla::XlaOp>> {
+        auto indices = loop_vars[0];
+        i = builder->Reshape(i, {}, {1});
+        // indices[i] = i
+        indices = builder->DynamicUpdateSlice(indices, i, i);
+        return std::vector<xla::XlaOp>{indices};
+      };
+      // for i in range(n):
+      xla::XlaOp index_zeros = Zeros(builder, swaps_shape);
+      auto index_init_loop_result =
+          XlaForEachIndex(n, xla::S32, index_init_body_fn, {index_zeros},
+                          "index_init_loop", builder)
+              .ValueOrDie();
+      auto indices = index_init_loop_result[0];
+
+      // Swap the indices at i and swaps[i].
+      auto swap_body_fn = [&](xla::XlaOp i,
+                              gtl::ArraySlice<xla::XlaOp> loop_vars,
+                              xla::XlaBuilder* builder)
+          -> xla::StatusOr<std::vector<xla::XlaOp>> {
+        auto swaps = loop_vars[0];
+        auto indices = loop_vars[1];
+        i = builder->Reshape(i, {}, {1});
+        // temp = indices[i]
+        auto temp = builder->DynamicSlice(indices, i, {1});
+        // swap_index = swaps[i]
+        auto swap_index = builder->DynamicSlice(swaps, i, {1});
+        // swap_value = indices[swaps[i]]
+        auto swap_value = builder->DynamicSlice(indices, swap_index, {1});
+        // indices[i] = indices[swaps[i]]
+        indices = builder->DynamicUpdateSlice(indices, swap_value, i);
+        // indices[swaps[i]] = temp
+        indices = builder->DynamicUpdateSlice(indices, temp, swap_index);
+        return std::vector<xla::XlaOp>{swaps, indices};
+      };
+      // for i in range(n):
+      auto swap_loop_result =
+          XlaForEachIndex(n, xla::S32, swap_body_fn, {swaps, indices},
+                          "indices_swap_loop", builder)
+              .ValueOrDie();
+      auto swapped_indices = swap_loop_result[1];
+
+      // Gather the data using the swapped indices as the shuffled order.
+      auto indices_tensor_shape = TensorShape({n});
+      DataType type = ctx->expected_output_dtype(0);
+      xla::XlaOp gather;
+      OP_REQUIRES_OK(ctx, XlaGather(input, input_shape, swapped_indices,
+                                    indices_tensor_shape,
+                                    /*axis=*/0, /*indices_are_nd=*/false, type,
+                                    DT_INT32, builder, &gather));
+      ctx->SetOutput(0, gather);
+    }
+  }
+
+ private:
+  TF_DISALLOW_COPY_AND_ASSIGN(RandomShuffleOp);
+};
+
+REGISTER_XLA_OP(Name("RandomShuffle"), RandomShuffleOp);
+
 class RandomUniformIntOp : public XlaOpKernel {
  public:
   explicit RandomUniformIntOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {}
-- 
GitLab


From 9f1e508eab90262cf932d7ec0bfdf67cc8d69278 Mon Sep 17 00:00:00 2001
From: Amit Patankar <amitpatankar@google.com>
Date: Wed, 6 Jun 2018 13:35:35 -0700
Subject: [PATCH 387/610] Force downgrade setuptools for tests after tf whl is
 installed.

---
 tensorflow/tools/ci_build/builds/pip.sh | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tensorflow/tools/ci_build/builds/pip.sh b/tensorflow/tools/ci_build/builds/pip.sh
index 76210ba463..883bb93647 100755
--- a/tensorflow/tools/ci_build/builds/pip.sh
+++ b/tensorflow/tools/ci_build/builds/pip.sh
@@ -315,7 +315,6 @@ create_activate_virtualenv_and_install_tensorflow() {
   # Upgrade pip so it supports tags such as cp27mu, manylinux1 etc.
   echo "Upgrade pip in virtualenv"
   pip install --upgrade pip==9.0.1
-  pip install --upgrade setuptools==39.1.0
 
   # Force tensorflow reinstallation. Otherwise it may not get installed from
   # last build if it had the same version number as previous build.
@@ -323,6 +322,10 @@ create_activate_virtualenv_and_install_tensorflow() {
   pip install -v ${PIP_FLAGS} ${WHL_PATH} || \
     die "pip install (forcing to reinstall tensorflow) FAILED"
   echo "Successfully installed pip package ${TF_WHEEL_PATH}"
+
+  # Force downgrade setuptools.
+  pip install --upgrade setuptools==39.1.0
+
 }
 
 ################################################################################
-- 
GitLab


From 9dc20c7c2a43caeb75143f089a5da44c3fa5dfe0 Mon Sep 17 00:00:00 2001
From: Saurabh Saxena <srbs@google.com>
Date: Wed, 6 Jun 2018 14:03:10 -0700
Subject: [PATCH 388/610] Support taking gradients of de-serialized cond.
 Instead of relying on the _FuncGraphs attached to the op we instead
 reconstruct the _FuncGraph from the FunctionDef using function_def_to_graph.

PiperOrigin-RevId: 199525030
---
 tensorflow/contrib/control_flow/BUILD         |  5 ++
 .../contrib/control_flow/python/cond_v2.py    | 51 +++++++++++++++----
 .../control_flow/python/cond_v2_test.py       | 43 ++++++++++++++++
 3 files changed, 88 insertions(+), 11 deletions(-)

diff --git a/tensorflow/contrib/control_flow/BUILD b/tensorflow/contrib/control_flow/BUILD
index 746b5b5b5e..e8036d63ae 100644
--- a/tensorflow/contrib/control_flow/BUILD
+++ b/tensorflow/contrib/control_flow/BUILD
@@ -20,13 +20,16 @@ py_library(
     srcs = ["python/cond_v2.py"],
     srcs_version = "PY2AND3",
     deps = [
+        "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:c_api_util",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:function",
+        "//tensorflow/python:function_def_to_graph",
         "//tensorflow/python:functional_ops_gen",
         "//tensorflow/python:gradients",
         "//tensorflow/python:pywrap_tensorflow",
+        "//tensorflow/python:util",
     ],
 )
 
@@ -42,7 +45,9 @@ tf_py_test(
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework",
+        "//tensorflow/python:framework_ops",
         "//tensorflow/python:gradients",
+        "//tensorflow/python:training",
     ],
     grpc_enabled = True,
 )
diff --git a/tensorflow/contrib/control_flow/python/cond_v2.py b/tensorflow/contrib/control_flow/python/cond_v2.py
index 90c678d0f6..70a9af43a5 100644
--- a/tensorflow/contrib/control_flow/python/cond_v2.py
+++ b/tensorflow/contrib/control_flow/python/cond_v2.py
@@ -23,13 +23,16 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.core.framework import function_pb2
 from tensorflow.python import pywrap_tensorflow as c_api
 from tensorflow.python.framework import c_api_util
 from tensorflow.python.framework import function
+from tensorflow.python.framework import function_def_to_graph
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_functional_ops
 from tensorflow.python.ops import gradients_impl
+from tensorflow.python.util import compat
 
 
 # NOTE(skyewm): TensorFlow uses protected class methods and fields to signify
@@ -78,20 +81,13 @@ def cond_v2(pred, true_fn, false_fn, name="cond"):
         _create_new_tf_function(false_graph),
         name=scope)
 
-    # TODO(b/79883549): if we could make Graphs from FunctionDefs, we wouldn't
-    # need this extra state. Requiring extra state also prevents the ability to
-    # take the gradient of deserialized If ops.
-    tensors[0].op._true_graph = true_graph
-    tensors[0].op._false_graph = false_graph
-
     return tensors[:num_cond_outputs]
 
 
 @ops.RegisterGradient("If")
 def _IfGrad(op, *grads):  # pylint: disable=invalid-name
   """The gradient of an If op produced by cond_v2."""
-  true_graph = op._true_graph
-  false_graph = op._false_graph
+  true_graph, false_graph = _get_func_graphs(op)
 
   # Create grad functions that compute the gradient of the true/false forward
   # graphs. These functions will capture tensors from the forward pass
@@ -136,13 +132,35 @@ def _IfGrad(op, *grads):  # pylint: disable=invalid-name
       op.inputs[0], grad_inputs, [t.dtype for t in true_grad_graph.outputs],
       _create_new_tf_function(true_grad_graph),
       _create_new_tf_function(false_grad_graph))
-  tensors[0].op._true_graph = true_grad_graph
-  tensors[0].op._false_graph = false_grad_graph
 
   # The predicate has no gradient.
   return [None] + tensors[:num_grad_outputs]
 
 
+def _get_func_graphs(if_op):
+  """Returns `_FuncGraph`s for the input op branches.
+
+  Args:
+    if_op: The _If Operation.
+
+  Returns:
+    A 2-tuple of the `_FuncGraph`s of the then_branch and else_branch.
+  """
+  def _get_func_graph_for_branch(branch_name):
+    extra_inputs = if_op.inputs[1:]  # First input is pred.
+    input_shapes = [t.shape for t in extra_inputs]
+    func_name = if_op.get_attr(branch_name).name
+    fdef = if_op.graph._get_function(func_name).definition
+    func_graph = function_def_to_graph.function_def_to_graph(fdef, input_shapes)
+    func_graph.extra_inputs = extra_inputs
+    func_graph.extra_args = func_graph.inputs
+    func_graph._captured = dict(zip(extra_inputs, func_graph.inputs))
+    return func_graph
+
+  return (_get_func_graph_for_branch("then_branch"),
+          _get_func_graph_for_branch("else_branch"))
+
+
 def _grad_fn(func_graph, grads):
   """The gradient function for each conditional branch.
 
@@ -245,7 +263,7 @@ def _create_new_tf_function(func_graph):
   func_graph.name = "%s_" % func_graph.name
   c_func = c_api.TF_GraphToFunction_wrapper(
       func_graph._c_graph,
-      func_graph.name,
+      compat.as_str(func_graph.name),
       False,  # append_hash_to_fn_name
       None,  # opers
       [t._as_tf_output() for t in func_graph.inputs],
@@ -256,6 +274,17 @@ def _create_new_tf_function(func_graph):
   c_func = c_api_util.ScopedTFFunction(c_func)
   c_api.TF_GraphCopyFunction(
       ops.get_default_graph()._c_graph, c_func.func, None)
+
+  # Add a _DefinedFunction to `Graph._functions` of the outer graph so that
+  # we can access it using `Graph._get_function` later.
+  # TODO(srbs): Consider adding a C API that can return a FunctionDef by name.
+  with c_api_util.tf_buffer() as buffer_:
+    c_api.TF_FunctionToFunctionDef(c_func.func, buffer_)
+    proto_data = c_api.TF_GetBuffer(buffer_)
+  function_def = function_pb2.FunctionDef()
+  function_def.ParseFromString(compat.as_bytes(proto_data))
+  func_graph._outer_graph._functions[
+      func_graph.name] = function._from_definition(function_def)
   return func_graph.name
 
 
diff --git a/tensorflow/contrib/control_flow/python/cond_v2_test.py b/tensorflow/contrib/control_flow/python/cond_v2_test.py
index 166002ca7f..7e299d1ad6 100644
--- a/tensorflow/contrib/control_flow/python/cond_v2_test.py
+++ b/tensorflow/contrib/control_flow/python/cond_v2_test.py
@@ -22,11 +22,13 @@ from __future__ import print_function
 from tensorflow.contrib.control_flow.python import cond_v2
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
+from tensorflow.python.training import saver
 
 
 class NewCondTest(test.TestCase):
@@ -109,6 +111,47 @@ class NewCondTest(test.TestCase):
       # d2[x]/dx2 = 0
       self.assertEqual(false_val, [0.0])
 
+  def testGradientOfDeserializedCond(self):
+    with ops.Graph().as_default():
+      pred = array_ops.placeholder(dtypes.bool, name="pred")
+      x = constant_op.constant(3.0, name="x")
+      ops.add_to_collection("x", x)
+
+      def true_fn():
+        return math_ops.pow(x, 3)
+
+      def false_fn():
+        return x
+
+      ops.add_to_collection("pred", pred)
+      cond = cond_v2.cond_v2(pred, true_fn, false_fn, name="cond")
+      for c in cond:
+        ops.add_to_collection("cond", c)
+      meta_graph = saver.export_meta_graph()
+
+    with ops.Graph().as_default() as g:
+      saver.import_meta_graph(meta_graph)
+      x = ops.get_collection("x")[0]
+      pred = ops.get_collection("pred")[0]
+      cond = ops.get_collection("cond")
+      cond_grad = gradients_impl.gradients(cond, [x], name="cond_grad")
+      cond_grad_grad = gradients_impl.gradients(
+          cond_grad, [x], name="cond_grad_grad")
+      with self.test_session(graph=g) as sess:
+        # d[x^3]/dx = 3x^2
+        true_val = sess.run(cond_grad, {pred: True})
+        self.assertEqual(true_val, [27.0])
+        # d[x]/dx = 1
+        false_val = sess.run(cond_grad, {pred: False})
+        self.assertEqual(false_val, [1.0])
+
+        true_val = sess.run(cond_grad_grad, {pred: True})
+        # d2[x^3]/dx2 = 6x
+        self.assertEqual(true_val, [18.0])
+        false_val = sess.run(cond_grad_grad, {pred: False})
+        # d2[x]/dx2 = 0
+        self.assertEqual(false_val, [0.0])
+
 
 if __name__ == "__main__":
   test.main()
-- 
GitLab


From eccec6b44228a654a33aee656837c320c3d6a2f5 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 6 Jun 2018 14:10:08 -0700
Subject: [PATCH 389/610] Adding gradients for the LogMatrixDeterminant op +
 tests.

PiperOrigin-RevId: 199526349
---
 tensorflow/python/kernel_tests/linalg_grad_test.py |  6 ++++++
 tensorflow/python/ops/linalg_grad.py               | 11 +++++++++++
 2 files changed, 17 insertions(+)

diff --git a/tensorflow/python/kernel_tests/linalg_grad_test.py b/tensorflow/python/kernel_tests/linalg_grad_test.py
index 7d367a9275..6f401358a2 100644
--- a/tensorflow/python/kernel_tests/linalg_grad_test.py
+++ b/tensorflow/python/kernel_tests/linalg_grad_test.py
@@ -177,6 +177,12 @@ if __name__ == '__main__':
             MatrixUnaryFunctorGradientTest, 'MatrixDeterminantGradient', name,
             _GetMatrixUnaryFunctorGradientTest(linalg_ops.matrix_determinant,
                                                dtype, shape))
+        _AddTest(
+            MatrixUnaryFunctorGradientTest, 'LogMatrixDeterminantGradient',
+            name,
+            _GetMatrixUnaryFunctorGradientTest(
+                lambda x: linalg_ops.log_matrix_determinant(x)[1],
+                dtype, shape))
 
   # Tests for gradients of matrix_solve_ls
   for dtype in np.float32, np.float64:
diff --git a/tensorflow/python/ops/linalg_grad.py b/tensorflow/python/ops/linalg_grad.py
index 3cbbf3412a..b6b98d5c86 100644
--- a/tensorflow/python/ops/linalg_grad.py
+++ b/tensorflow/python/ops/linalg_grad.py
@@ -55,6 +55,17 @@ def _MatrixDeterminantGrad(op, grad):
   return multipliers * a_adj_inv
 
 
+@ops.RegisterGradient("LogMatrixDeterminant")
+def _LogMatrixDeterminantGrad(op, _, grad_b):
+  """Gradient for LogMatrixDeterminant."""
+  a = op.inputs[0]
+  c = op.outputs[1]
+  a_adj_inv = linalg_ops.matrix_inverse(a, adjoint=True)
+  multipliers = array_ops.reshape(
+      grad_b, array_ops.concat([array_ops.shape(c), [1, 1]], 0))
+  return multipliers * a_adj_inv
+
+
 @ops.RegisterGradient("Cholesky")
 def _CholeskyGrad(op, grad):
   """Gradient for Cholesky."""
-- 
GitLab


From b6aeb3257fc4e9b1189c17517335bb7968557c30 Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Wed, 6 Jun 2018 14:29:26 -0700
Subject: [PATCH 390/610] Fix runtime failure in executor_benchmark.

PiperOrigin-RevId: 199529330
---
 tensorflow/core/common_runtime/executor_test.cc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tensorflow/core/common_runtime/executor_test.cc b/tensorflow/core/common_runtime/executor_test.cc
index 8cb1567852..b24969613c 100644
--- a/tensorflow/core/common_runtime/executor_test.cc
+++ b/tensorflow/core/common_runtime/executor_test.cc
@@ -466,10 +466,10 @@ static void BM_FeedInputFetchOutput(int iters) {
   // z = x + y: x and y are provided as benchmark inputs.  z is the
   // output of the benchmark.  Conceptually, the caller is "a", the
   // benchmark is "b".
-  Node* x = test::graph::Recv(g, "x", "float", "a", 1, "b");
-  Node* y = test::graph::Recv(g, "y", "float", "a", 1, "b");
+  Node* x = test::graph::Recv(g, "x", "float", ALICE, 1, BOB);
+  Node* y = test::graph::Recv(g, "y", "float", ALICE, 1, BOB);
   Node* sum = test::graph::Add(g, x, y);
-  Node* z = test::graph::Send(g, sum, "z", "b", 1, "a");
+  Node* z = test::graph::Send(g, sum, "z", BOB, 1, ALICE);
   Tensor val(DT_FLOAT, TensorShape({}));
   val.scalar<float>()() = 3.14;
 #ifdef PLATFORM_GOOGLE
-- 
GitLab


From 2cce1a8504f53a5d8bdc08b6d0b5c036b672ca0e Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 6 Jun 2018 14:33:08 -0700
Subject: [PATCH 391/610] Use get*ArrayRegion instead of get*ArrayElements in
 TFlite JNI code.

Prefer get*ArrayRegion to avoid a JNI hop and (potentially) an extra
copy when copying Java inputs during interpreter execution.

PiperOrigin-RevId: 199530084
---
 .../lite/java/src/main/native/tensor_jni.cc   | 28 ++++++++-----------
 1 file changed, 12 insertions(+), 16 deletions(-)

diff --git a/tensorflow/contrib/lite/java/src/main/native/tensor_jni.cc b/tensorflow/contrib/lite/java/src/main/native/tensor_jni.cc
index 005dca0253..9e9387da86 100644
--- a/tensorflow/contrib/lite/java/src/main/native/tensor_jni.cc
+++ b/tensorflow/contrib/lite/java/src/main/native/tensor_jni.cc
@@ -43,31 +43,27 @@ size_t writeOneDimensionalArray(JNIEnv* env, jobject object, TfLiteType type,
   }
   switch (type) {
     case kTfLiteFloat32: {
-      jfloatArray a = static_cast<jfloatArray>(array);
-      jfloat* values = env->GetFloatArrayElements(a, nullptr);
-      memcpy(dst, values, to_copy);
-      env->ReleaseFloatArrayElements(a, values, JNI_ABORT);
+      jfloatArray float_array = static_cast<jfloatArray>(array);
+      jfloat* float_dst = static_cast<jfloat*>(dst);
+      env->GetFloatArrayRegion(float_array, 0, num_elements, float_dst);
       return to_copy;
     }
     case kTfLiteInt32: {
-      jintArray a = static_cast<jintArray>(array);
-      jint* values = env->GetIntArrayElements(a, nullptr);
-      memcpy(dst, values, to_copy);
-      env->ReleaseIntArrayElements(a, values, JNI_ABORT);
+      jintArray int_array = static_cast<jintArray>(array);
+      jint* int_dst = static_cast<jint*>(dst);
+      env->GetIntArrayRegion(int_array, 0, num_elements, int_dst);
       return to_copy;
     }
     case kTfLiteInt64: {
-      jlongArray a = static_cast<jlongArray>(array);
-      jlong* values = env->GetLongArrayElements(a, nullptr);
-      memcpy(dst, values, to_copy);
-      env->ReleaseLongArrayElements(a, values, JNI_ABORT);
+      jlongArray long_array = static_cast<jlongArray>(array);
+      jlong* long_dst = static_cast<jlong*>(dst);
+      env->GetLongArrayRegion(long_array, 0, num_elements, long_dst);
       return to_copy;
     }
     case kTfLiteUInt8: {
-      jbyteArray a = static_cast<jbyteArray>(array);
-      jbyte* values = env->GetByteArrayElements(a, nullptr);
-      memcpy(dst, values, to_copy);
-      env->ReleaseByteArrayElements(a, values, JNI_ABORT);
+      jbyteArray byte_array = static_cast<jbyteArray>(array);
+      jbyte* byte_dst = static_cast<jbyte*>(dst);
+      env->GetByteArrayRegion(byte_array, 0, num_elements, byte_dst);
       return to_copy;
     }
     default: {
-- 
GitLab


From 4a2104ce30cd2a931ca3bae260d7394815f5dcae Mon Sep 17 00:00:00 2001
From: Max Galkin <maxgalkin@google.com>
Date: Wed, 6 Jun 2018 14:38:48 -0700
Subject: [PATCH 392/610] Estimate Squeeze cost in the same way as Reshape.

PiperOrigin-RevId: 199531069
---
 tensorflow/core/grappler/costs/op_level_cost_estimator.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
index b8e337582c..b994d26397 100644
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator.cc
@@ -45,6 +45,7 @@ constexpr char kIdentityN[] = "IdentityN";
 constexpr char kRefIdentity[] = "RefIdentity";
 constexpr char kNoOp[] = "NoOp";
 constexpr char kReshape[] = "Reshape";
+constexpr char kSqueeze[] = "Squeeze";
 constexpr char kRecv[] = "_Recv";
 constexpr char kSend[] = "_Send";
 constexpr char kBatchMatMul[] = "BatchMatMul";
@@ -232,6 +233,7 @@ OpLevelCostEstimator::OpLevelCostEstimator() {
       {kStopGradient, wrap(&OpLevelCostEstimator::PredictIdentity)},
       {kPreventGradient, wrap(&OpLevelCostEstimator::PredictIdentity)},
       {kReshape, wrap(&OpLevelCostEstimator::PredictIdentity)},
+      {kSqueeze, wrap(&OpLevelCostEstimator::PredictIdentity)},
       {kRecv, wrap(&OpLevelCostEstimator::PredictIdentity)},
       {kSend, wrap(&OpLevelCostEstimator::PredictIdentity)},
 
-- 
GitLab


From 65c05bc2ac19f51f7027e66350bc71652662125c Mon Sep 17 00:00:00 2001
From: Pete Warden <pete@petewarden.com>
Date: Wed, 6 Jun 2018 14:49:41 -0700
Subject: [PATCH 393/610] Removed unneeded file copy that was causing failure
 in Pi builds (#19789)

* Removed unneeded file copy that was causing failure in Pi builds

* Added back in Raspberry Pi targets lost during merge
---
 tensorflow/tools/ci_build/pi/build_raspberry_pi.sh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh b/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh
index 4d1a30601e..b8bce57c87 100755
--- a/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh
+++ b/tensorflow/tools/ci_build/pi/build_raspberry_pi.sh
@@ -102,6 +102,8 @@ bazel build -c opt ${PI_COPTS} \
   --copt=-fomit-frame-pointer --cpu=armeabi \
   --crosstool_top=@local_config_arm_compiler//:toolchain \
   --verbose_failures \
+  //tensorflow:libtensorflow.so \
+  //tensorflow:libtensorflow_framework.so \
   //tensorflow/tools/benchmark:benchmark_model \
   //tensorflow/tools/pip_package:build_pip_package
 
-- 
GitLab


From 7cb4b129543eb67b54a0c9373f904a699c338a1f Mon Sep 17 00:00:00 2001
From: Rachel Lim <rachelim@google.com>
Date: Wed, 6 Jun 2018 14:51:37 -0700
Subject: [PATCH 394/610] Removed parts of numbers_test that caused
 asan/msan/tsan failure

PiperOrigin-RevId: 199533243
---
 tensorflow/core/lib/strings/numbers_test.cc | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/tensorflow/core/lib/strings/numbers_test.cc b/tensorflow/core/lib/strings/numbers_test.cc
index 0f22dac262..5b595f9847 100644
--- a/tensorflow/core/lib/strings/numbers_test.cc
+++ b/tensorflow/core/lib/strings/numbers_test.cc
@@ -289,12 +289,9 @@ TEST(safe_strtof, Float) {
 
   EXPECT_FALSE(safe_strtof("-infinity is awesome", &result));
 
-  // Make sure we exit cleanly if the string is not terminated
+  // Make sure we exit cleanly if the string is too long
   char test_str[2 * kFastToBufferSize];
   for (int i = 0; i < 2 * kFastToBufferSize; ++i) test_str[i] = 'a';
-  EXPECT_FALSE(safe_strtof(test_str, &result));
-
-  // Make sure we exit cleanly if the string is too long
   test_str[kFastToBufferSize + 1] = '\0';
   EXPECT_FALSE(safe_strtof(test_str, &result));
 
@@ -330,12 +327,9 @@ TEST(safe_strtod, Double) {
   EXPECT_EQ(0.1234567890123, result);
   EXPECT_FALSE(safe_strtod("0.1234567890123abc", &result));
 
-  // Make sure we exit cleanly if the string is not terminated
+  // Make sure we exit cleanly if the string is too long
   char test_str[2 * kFastToBufferSize];
   for (int i = 0; i < 2 * kFastToBufferSize; ++i) test_str[i] = 'a';
-  EXPECT_FALSE(safe_strtod(test_str, &result));
-
-  // Make sure we exit cleanly if the string is too long
   test_str[kFastToBufferSize + 1] = '\0';
   EXPECT_FALSE(safe_strtod(test_str, &result));
 
-- 
GitLab


From b1e5c6e0a1cb131d64cd3b35c744693c0099f349 Mon Sep 17 00:00:00 2001
From: Skye Wanderman-Milne <skyewm@google.com>
Date: Wed, 6 Jun 2018 15:07:21 -0700
Subject: [PATCH 395/610] Remove _USE_C_API staging in tests now that the C API
 is enabled by default.

This is in preparation for removing the _USE_C_API toggle altogether.

PiperOrigin-RevId: 199536151
---
 tensorflow/python/ops/gradients_test.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tensorflow/python/ops/gradients_test.py b/tensorflow/python/ops/gradients_test.py
index 6891501ae1..d81c756f1c 100644
--- a/tensorflow/python/ops/gradients_test.py
+++ b/tensorflow/python/ops/gradients_test.py
@@ -83,7 +83,6 @@ def _OpsBetween(to_ops, from_ops):
   return between_ops
 
 
-@test_util.with_c_api
 class GradientsTest(test_util.TensorFlowTestCase):
 
   def _OpNames(self, op_list):
-- 
GitLab


From 617405d989a13839a585c82f9d09f03cbd080d0e Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 6 Jun 2018 15:24:19 -0700
Subject: [PATCH 396/610] [TF:XLA] Fix the control edges for ops without
 inputs/outputs passed to CompileSingleOp. Validate that all nodes of the graph
 are reachable from the source node at the beginning of
 FunctionalizeControlFlow.

PiperOrigin-RevId: 199539348
---
 .../tf2xla/functionalize_control_flow.cc      |  8 +++-
 tensorflow/compiler/tf2xla/xla_compiler.cc    |  1 +
 .../compiler/tf2xla/xla_compiler_test.cc      | 38 +++++++++++++++++++
 tensorflow/core/graph/control_flow.cc         | 11 +++++-
 tensorflow/core/graph/control_flow.h          |  6 ++-
 5 files changed, 60 insertions(+), 4 deletions(-)

diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
index 42585ad4d8..1438f6b48c 100644
--- a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
+++ b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
@@ -1438,7 +1438,13 @@ Status FunctionalizeControlFlow(const FunctionLibraryDefinition* lookup_library,
   // connected to all source nodes in the graph. Many graphs violate this
   // invariant.
   std::vector<ControlFlowInfo> cf_info;
-  TF_RETURN_IF_ERROR(BuildControlFlowInfo(graph, &cf_info));
+  std::vector<string> unreachable_nodes;
+  TF_RETURN_IF_ERROR(BuildControlFlowInfo(graph, &cf_info, &unreachable_nodes));
+  if (!unreachable_nodes.empty()) {
+    return errors::InvalidArgument(
+        "The following nodes are unreachable from the source in the graph: ",
+        tensorflow::str_util::Join(unreachable_nodes, ", "));
+  }
 
   // Builds Frames, indexed by name.
   std::unordered_map<string, Frame> frames;
diff --git a/tensorflow/compiler/tf2xla/xla_compiler.cc b/tensorflow/compiler/tf2xla/xla_compiler.cc
index a8bd199675..9c8e56a17e 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler.cc
+++ b/tensorflow/compiler/tf2xla/xla_compiler.cc
@@ -652,6 +652,7 @@ Status XlaCompiler::CompileSingleOp(
                         .Finalize(graph.get(), &node);
     TF_RETURN_IF_ERROR(status);
   }
+  FixupSourceAndSinkEdges(graph.get());
 
   return CompileGraph(options, name, std::move(graph), args, result);
 }
diff --git a/tensorflow/compiler/tf2xla/xla_compiler_test.cc b/tensorflow/compiler/tf2xla/xla_compiler_test.cc
index 5fbf4b952c..613230452b 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler_test.cc
+++ b/tensorflow/compiler/tf2xla/xla_compiler_test.cc
@@ -34,6 +34,7 @@ limitations under the License.
 #include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/graph/graph_constructor.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
@@ -1049,5 +1050,42 @@ TEST_F(XlaCompilerTest, NodeWithInvalidDataType) {
       << status.error_message();
 }
 
+TEST_F(XlaCompilerTest, SingleOpWithoutInputs) {
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  NodeDef no_op;
+  no_op.set_name("NoOp");
+  no_op.set_op("NoOp");
+  Status status;
+  graph->AddNode(no_op, &status);
+  TF_ASSERT_OK(status);
+
+  std::vector<XlaCompiler::Argument> args;
+  XlaCompiler compiler(DefaultOptions());
+  // No control edge linking NoOp with source/sink.
+  {
+    std::unique_ptr<Graph> graph_copy(new Graph(OpRegistry::Global()));
+    CopyGraph(*graph, graph_copy.get());
+    XlaCompiler::CompilationResult result;
+    status = compiler.CompileGraph(XlaCompiler::CompileOptions(), "NoOp",
+                                   std::move(graph_copy), args, &result);
+    ASSERT_FALSE(status.ok());
+    EXPECT_TRUE(str_util::StrContains(status.error_message(),
+                                      "The following nodes are unreachable "
+                                      "from the source in the graph: NoOp"))
+        << status.error_message();
+  }
+
+  // Fix control edges for NoOp.
+  {
+    std::unique_ptr<Graph> graph_copy(new Graph(OpRegistry::Global()));
+    CopyGraph(*graph, graph_copy.get());
+    EXPECT_TRUE(FixupSourceAndSinkEdges(graph_copy.get()));
+    XlaCompiler::CompilationResult result;
+    TF_ASSERT_OK(compiler.CompileGraph(XlaCompiler::CompileOptions(), "NoOp",
+                                       std::move(graph_copy), args, &result));
+    EXPECT_EQ(0, result.resource_updates.size());
+  }
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/graph/control_flow.cc b/tensorflow/core/graph/control_flow.cc
index 30ff19cd7e..fea25560d8 100644
--- a/tensorflow/core/graph/control_flow.cc
+++ b/tensorflow/core/graph/control_flow.cc
@@ -24,8 +24,8 @@ limitations under the License.
 
 namespace tensorflow {
 
-Status BuildControlFlowInfo(const Graph* g,
-                            std::vector<ControlFlowInfo>* info) {
+Status BuildControlFlowInfo(const Graph* g, std::vector<ControlFlowInfo>* info,
+                            std::vector<string>* unreachable_nodes) {
   info->clear();
   info->resize(g->num_node_ids());
 
@@ -114,6 +114,13 @@ Status BuildControlFlowInfo(const Graph* g,
       }
     }
   }
+  if (unreachable_nodes) {
+    for (const Node* node : g->op_nodes()) {
+      if (!parent_nodes[node->id()]) {
+        unreachable_nodes->push_back(node->name());
+      }
+    }
+  }
   return Status::OK();
 }
 
diff --git a/tensorflow/core/graph/control_flow.h b/tensorflow/core/graph/control_flow.h
index 79e2be0d4b..8605d57c14 100644
--- a/tensorflow/core/graph/control_flow.h
+++ b/tensorflow/core/graph/control_flow.h
@@ -33,11 +33,15 @@ struct ControlFlowInfo {
 // Clear and populate `info` with each node's frame and the level it belongs to.
 // We check the well-formedness of the graph: All inputs to a node must come
 // from the same frame and have the same "static" iteration level.
+// If `unreachable_nodes` is set, return names of nodes unreachable from the
+// source node. We cannot build ControlFlowInfo for such nodes. They might be
+// pruned later.
 //
 // NOTE(yuanbyu): For now, we require all sends/recvs have iteration level 0.
 // This essentially means there can't be multiple serial Nexts in an iteration,
 // which all sane front-ends should satisfy.
-Status BuildControlFlowInfo(const Graph* g, std::vector<ControlFlowInfo>* info);
+Status BuildControlFlowInfo(const Graph* g, std::vector<ControlFlowInfo>* info,
+                            std::vector<string>* unreachable_nodes = nullptr);
 
 }  // namespace tensorflow
 
-- 
GitLab


From 64204dd0addea52368400eea6c67616c312b594d Mon Sep 17 00:00:00 2001
From: Karmel Allison <karmel@google.com>
Date: Wed, 6 Jun 2018 16:06:06 -0700
Subject: [PATCH 397/610] Allow SavedModelBuilder to use custom Savers, and
 pass custom Savers included in Estimator model functions through to the
 Builder when saving.

PiperOrigin-RevId: 199546645
---
 tensorflow/python/estimator/estimator.py      | 12 ++-
 tensorflow/python/estimator/estimator_test.py | 42 +++++++----
 tensorflow/python/saved_model/BUILD           |  1 +
 tensorflow/python/saved_model/builder_impl.py | 46 +++++++-----
 .../python/saved_model/saved_model_test.py    | 75 +++++++++++++++++++
 ...d_model.builder.-saved-model-builder.pbtxt |  4 +-
 6 files changed, 138 insertions(+), 42 deletions(-)

diff --git a/tensorflow/python/estimator/estimator.py b/tensorflow/python/estimator/estimator.py
index 4f57a4ef79..4be1af1e66 100644
--- a/tensorflow/python/estimator/estimator.py
+++ b/tensorflow/python/estimator/estimator.py
@@ -893,11 +893,14 @@ class Estimator(object):
             estimator_spec.scaffold.local_init_op or
             monitored_session.Scaffold.default_local_init_op())
 
-        saver_for_restore = estimator_spec.scaffold.saver or saver.Saver(
-            sharded=True)
+        # This saver will be used both for restoring variables now,
+        # and in saving out the metagraph below. This ensures that any
+        # Custom Savers stored with the Scaffold are passed through to the
+        # SavedModel for restore later.
+        graph_saver = estimator_spec.scaffold.saver or saver.Saver(sharded=True)
 
         try:
-          saver_for_restore.restore(session, checkpoint_path)
+          graph_saver.restore(session, checkpoint_path)
         except errors.NotFoundError as e:
           msg = ('Could not load all requested variables from the checkpoint. '
                  'Please make sure your model_fn does not expect variables '
@@ -918,7 +921,8 @@ class Estimator(object):
             assets_collection=ops.get_collection(
                 ops.GraphKeys.ASSET_FILEPATHS),
             strip_default_attrs=strip_default_attrs,
-            legacy_init_op=local_init_op)
+            legacy_init_op=local_init_op,
+            saver=graph_saver)
 
         if save_variables:
           builder.add_meta_graph_and_variables(
diff --git a/tensorflow/python/estimator/estimator_test.py b/tensorflow/python/estimator/estimator_test.py
index 9c0d0f7390..a43b820f32 100644
--- a/tensorflow/python/estimator/estimator_test.py
+++ b/tensorflow/python/estimator/estimator_test.py
@@ -100,6 +100,11 @@ def check_eventfile_for_keyword(keyword, dir_):
   return any(summaries_with_matching_keyword(keyword, dir_))
 
 
+def get_mock_saver():
+  real_saver = saver.Saver()
+  return test.mock.Mock(wraps=real_saver, saver_def=real_saver.saver_def)
+
+
 class EstimatorInheritanceConstraintTest(test.TestCase):
   """Tests that sub classes cannot override methods of Estimator."""
 
@@ -1295,9 +1300,7 @@ class EstimatorEvaluateTest(test.TestCase):
     def _model_fn_scaffold(features, labels, mode):
       _, _ = features, labels
       variables.Variable(1., name='weight')
-      real_saver = saver.Saver()
-      self.mock_saver = test.mock.Mock(
-          wraps=real_saver, saver_def=real_saver.saver_def)
+      self.mock_saver = get_mock_saver()
       return model_fn_lib.EstimatorSpec(
           mode=mode,
           predictions=constant_op.constant([[1.]]),
@@ -1819,9 +1822,7 @@ class EstimatorPredictTest(test.TestCase):
     def _model_fn_scaffold(features, labels, mode):
       _, _ = features, labels
       variables.Variable(1., name='weight')
-      real_saver = saver.Saver()
-      self.mock_saver = test.mock.Mock(
-          wraps=real_saver, saver_def=real_saver.saver_def)
+      self.mock_saver = get_mock_saver()
       return model_fn_lib.EstimatorSpec(
           mode=mode,
           predictions=constant_op.constant([[1.]]),
@@ -2315,8 +2316,8 @@ class EstimatorExportTest(test.TestCase):
         graph_ops = [x.name for x in graph.get_operations()]
         self.assertTrue('input_example_tensor' in graph_ops)
         self.assertTrue('ParseExample/ParseExample' in graph_ops)
-        # Note that the SavedModel builder replaced the Saver with a new one
-        self.assertTrue('save_1/LookupTableImportV2' in graph_ops)
+        # The original saver is used to restore variables
+        self.assertTrue('save/LookupTableImportV2' in graph_ops)
 
     # Clean up.
     gfile.DeleteRecursively(tmpdir)
@@ -2481,9 +2482,7 @@ class EstimatorExportTest(test.TestCase):
     def _model_fn_scaffold(features, labels, mode):
       _, _ = features, labels
       variables.Variable(1., name='weight')
-      real_saver = saver.Saver()
-      self.mock_saver = test.mock.Mock(
-          wraps=real_saver, saver_def=real_saver.saver_def)
+      self.mock_saver = get_mock_saver()
       scores = constant_op.constant([3.])
       return model_fn_lib.EstimatorSpec(
           mode=mode,
@@ -2506,19 +2505,24 @@ class EstimatorExportTest(test.TestCase):
     est.export_savedmodel(export_dir_base, serving_input_receiver_fn)
 
     self.assertTrue(self.mock_saver.restore.called)
+    self.assertTrue(self.mock_saver.export_meta_graph.called)
+    self.assertTrue(self.mock_saver.save.called)
 
   def test_scaffold_is_used_for_saver_multiple_modes(self):
     tmpdir = tempfile.mkdtemp()
+    savers = {'predict_saver': None, 'train_saver': None}
 
     def _model_fn_scaffold(features, labels, mode):
       _, _ = features, labels
       variables.Variable(1., name='weight')
-      real_saver = saver.Saver()
-      self.mock_saver = test.mock.Mock(
-          wraps=real_saver, saver_def=real_saver.saver_def)
+
       scores = constant_op.constant([3.])
       if mode == model_fn_lib.ModeKeys.PREDICT:
-        scaffold = training.Scaffold(saver=self.mock_saver)
+        savers['predict_saver'] = get_mock_saver()
+        scaffold = training.Scaffold(saver=savers['predict_saver'])
+      elif mode == model_fn_lib.ModeKeys.TRAIN:
+        savers['train_saver'] = get_mock_saver()
+        scaffold = training.Scaffold(saver=savers['train_saver'])
       else:
         scaffold = training.Scaffold()
       return model_fn_lib.EstimatorSpec(
@@ -2542,7 +2546,13 @@ class EstimatorExportTest(test.TestCase):
         compat.as_bytes(tmpdir), compat.as_bytes('export'))
     est._export_all_saved_models(export_dir_base, input_receiver_fn_map)
 
-    self.assertTrue(self.mock_saver.restore.called)
+    self.assertTrue(savers['train_saver'].restore.called)
+    self.assertEqual(savers['train_saver'].export_meta_graph.call_count, 1)
+    self.assertEqual(savers['train_saver'].save.call_count, 1)
+
+    self.assertTrue(savers['predict_saver'].restore.called)
+    self.assertEqual(savers['predict_saver'].export_meta_graph.call_count, 1)
+    self.assertEqual(savers['predict_saver'].save.call_count, 0)
 
   def test_scaffold_is_used_for_local_init(self):
     tmpdir = tempfile.mkdtemp()
diff --git a/tensorflow/python/saved_model/BUILD b/tensorflow/python/saved_model/BUILD
index 2609a5d222..81786fbf43 100644
--- a/tensorflow/python/saved_model/BUILD
+++ b/tensorflow/python/saved_model/BUILD
@@ -149,6 +149,7 @@ py_test(
         "//tensorflow/python:saver_test_utils",
         "//tensorflow/python:state_ops",
         "//tensorflow/python:test_ops",
+        "//tensorflow/python:training",
         "//tensorflow/python:util",
         "//tensorflow/python:variables",
     ],
diff --git a/tensorflow/python/saved_model/builder_impl.py b/tensorflow/python/saved_model/builder_impl.py
index 24a13c0f33..e58be804c2 100644
--- a/tensorflow/python/saved_model/builder_impl.py
+++ b/tensorflow/python/saved_model/builder_impl.py
@@ -270,6 +270,18 @@ class SavedModelBuilder(object):
 
     self._add_train_op(train_op)
 
+  def _maybe_create_saver(self, saver=None):
+    """Creates a sharded saver if one does not already exist."""
+    if not saver:
+      # Initialize a saver to generate a sharded output for all saveables in the
+      # current scope.
+      saver = tf_saver.Saver(
+          variables._all_saveable_objects(),  # pylint: disable=protected-access
+          sharded=True,
+          write_version=saver_pb2.SaverDef.V2,
+          allow_empty=True)
+    return saver
+
   def add_meta_graph(self,
                      tags,
                      signature_def_map=None,
@@ -277,7 +289,8 @@ class SavedModelBuilder(object):
                      legacy_init_op=None,
                      clear_devices=False,
                      main_op=None,
-                     strip_default_attrs=False):
+                     strip_default_attrs=False,
+                     saver=None):
     # pylint: disable=line-too-long
     """Adds the current meta graph to the SavedModel.
 
@@ -302,6 +315,9 @@ class SavedModelBuilder(object):
       strip_default_attrs: Boolean. If `True`, default-valued attributes will be
         removed from the NodeDefs. For a detailed guide, see
         [Stripping Default-Valued Attributes](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md#stripping-default-valued-attributes).
+      saver: An instance of tf.train.Saver that will be used to export the
+        metagraph. If None, a sharded Saver that restores all variables will
+        be used.
 
     Raises:
       AssertionError: If the variables for the SavedModel have not been saved
@@ -320,18 +336,11 @@ class SavedModelBuilder(object):
     # Add assets and ops
     self._add_collections(assets_collection, legacy_init_op, main_op, None)
 
-    # Initialize a saver to generate a sharded output for all saveables in the
-    # current scope.
-    saver = tf_saver.Saver(
-        variables._all_saveable_objects(),  # pylint: disable=protected-access
-        sharded=True,
-        write_version=saver_pb2.SaverDef.V2,
-        allow_empty=True)
+    saver = self._maybe_create_saver(saver)
 
     # The graph almost certainly previously contained at least one Saver, and
     # possibly several (e.g. one for loading a pretrained embedding, and another
-    # for the model weights).  However, a *new* Saver was just created that
-    # includes all of the variables.  Removing the preexisting ones was the
+    # for the model weights).  Removing the preexisting ones was the
     # motivation for the clear_extraneous_savers option, but it turns out that
     # there are edge cases where that option breaks the graph.  Until that is
     # resolved, we just leave the option set to False for now.
@@ -350,7 +359,8 @@ class SavedModelBuilder(object):
                                    legacy_init_op=None,
                                    clear_devices=False,
                                    main_op=None,
-                                   strip_default_attrs=False):
+                                   strip_default_attrs=False,
+                                   saver=None):
     # pylint: disable=line-too-long
     """Adds the current meta graph to the SavedModel and saves variables.
 
@@ -377,6 +387,9 @@ class SavedModelBuilder(object):
       strip_default_attrs: Boolean. If `True`, default-valued attributes will be
         removed from the NodeDefs. For a detailed guide, see
         [Stripping Default-Valued Attributes](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md#stripping-default-valued-attributes).
+      saver: An instance of tf.train.Saver that will be used to export the
+        metagraph and save variables. If None, a sharded Saver that restores
+        all variables will be used.
 
     """
     # pylint: enable=line-too-long
@@ -403,13 +416,7 @@ class SavedModelBuilder(object):
         compat.as_text(variables_dir),
         compat.as_text(constants.VARIABLES_FILENAME))
 
-    # Initialize a saver to generate a sharded output for all saveables in the
-    # current scope.
-    saver = tf_saver.Saver(
-        variables._all_saveable_objects(),  # pylint: disable=protected-access
-        sharded=True,
-        write_version=saver_pb2.SaverDef.V2,
-        allow_empty=True)
+    saver = self._maybe_create_saver(saver)
 
     # Save the variables. Also, disable writing the checkpoint state proto. The
     # file is not used during SavedModel loading. In addition, since a
@@ -421,8 +428,7 @@ class SavedModelBuilder(object):
 
     # The graph almost certainly previously contained at least one Saver, and
     # possibly several (e.g. one for loading a pretrained embedding, and another
-    # for the model weights).  However, a *new* Saver was just created that
-    # includes all of the variables.  Removing the preexisting ones was the
+    # for the model weights).  Removing the preexisting ones was the
     # motivation for the clear_extraneous_savers option, but it turns out that
     # there are edge cases where that option breaks the graph.  Until that is
     # resolved, we just leave the option set to False for now.
diff --git a/tensorflow/python/saved_model/saved_model_test.py b/tensorflow/python/saved_model/saved_model_test.py
index 7302c77ad5..effb38283b 100644
--- a/tensorflow/python/saved_model/saved_model_test.py
+++ b/tensorflow/python/saved_model/saved_model_test.py
@@ -44,6 +44,7 @@ from tensorflow.python.saved_model import main_op
 from tensorflow.python.saved_model import signature_def_utils
 from tensorflow.python.saved_model import tag_constants
 from tensorflow.python.training import saver_test_utils
+from tensorflow.python.training import training
 from tensorflow.python.util import compat
 
 SAVED_MODEL_PATH = ("cc/saved_model/testdata/half_plus_two/00000123")
@@ -1122,6 +1123,80 @@ class SavedModelTest(test.TestCase):
       self.assertEqual(b"k1", v1.keys().eval())
       self.assertEqual(3.0, v1.values().eval())
 
+  def testCustomSaver(self):
+    export_dir = self._get_export_dir("test_custom_saver")
+    builder = saved_model_builder.SavedModelBuilder(export_dir)
+
+    with self.test_session(graph=ops.Graph()) as sess:
+      variables.Variable(1, name="v1")
+      sess.run(variables.global_variables_initializer())
+      custom_saver = training.Saver(name="my_saver")
+      builder.add_meta_graph_and_variables(sess, ["tag"], saver=custom_saver)
+
+    # Save the SavedModel to disk.
+    builder.save()
+
+    with ops.Graph().as_default() as graph:
+      with self.test_session(graph=graph) as sess:
+        saved_graph = loader.load(sess, ["tag"], export_dir)
+        graph_ops = [x.name for x in graph.get_operations()]
+        self.assertTrue("my_saver/restore_all" in graph_ops)
+        self.assertFalse("save/restore_all" in graph_ops)
+        self.assertEqual(
+            saved_graph.saver_def.restore_op_name, "my_saver/restore_all")
+
+  def testNoCustomSaver(self):
+    export_dir = self._get_export_dir("test_no_custom_saver")
+    builder = saved_model_builder.SavedModelBuilder(export_dir)
+
+    with self.test_session(graph=ops.Graph()) as sess:
+      variables.Variable(1, name="v1")
+      sess.run(variables.global_variables_initializer())
+      training.Saver(name="my_saver")
+      builder.add_meta_graph_and_variables(sess, ["tag"])
+
+    # Save the SavedModel to disk.
+    builder.save()
+
+    with ops.Graph().as_default() as graph:
+      with self.test_session(graph=graph) as sess:
+        saved_graph = loader.load(sess, ["tag"], export_dir)
+        graph_ops = [x.name for x in graph.get_operations()]
+        self.assertTrue("my_saver/restore_all" in graph_ops)
+        self.assertTrue("save/restore_all" in graph_ops)
+        self.assertEqual(
+            saved_graph.saver_def.restore_op_name, "save/restore_all")
+
+  def testMultipleCustomSavers(self):
+    export_dir = self._get_export_dir("test_multiple_custom_savers")
+    builder = saved_model_builder.SavedModelBuilder(export_dir)
+
+    with self.test_session(graph=ops.Graph()) as sess:
+      variables.Variable(1, name="v1")
+      sess.run(variables.global_variables_initializer())
+      builder.add_meta_graph_and_variables(sess, ["tag_0"])
+
+      saver_1 = training.Saver()
+      builder.add_meta_graph(["tag_1"], saver=saver_1)
+
+      saver_2 = training.Saver()
+      builder.add_meta_graph(["tag_2"], saver=saver_2)
+
+    # Save the SavedModel to disk.
+    builder.save()
+
+    def _validate_custom_saver(tag_name, saver_name):
+      with ops.Graph().as_default() as graph:
+        with self.test_session(graph=graph) as sess:
+          saved_graph = loader.load(sess, [tag_name], export_dir)
+          self.assertEqual(
+              saved_graph.saver_def.restore_op_name,
+              saver_name)
+
+    _validate_custom_saver("tag_0", "save/restore_all")
+    _validate_custom_saver("tag_1", "save_1/restore_all")
+    _validate_custom_saver("tag_2", "save_2/restore_all")
+
   def testClearDevices(self):
     export_dir = self._get_export_dir("test_clear_devices")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
diff --git a/tensorflow/tools/api/golden/tensorflow.saved_model.builder.-saved-model-builder.pbtxt b/tensorflow/tools/api/golden/tensorflow.saved_model.builder.-saved-model-builder.pbtxt
index ca8e5884b1..83bd703540 100644
--- a/tensorflow/tools/api/golden/tensorflow.saved_model.builder.-saved-model-builder.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.saved_model.builder.-saved-model-builder.pbtxt
@@ -8,11 +8,11 @@ tf_class {
   }
   member_method {
     name: "add_meta_graph"
-    argspec: "args=[\'self\', \'tags\', \'signature_def_map\', \'assets_collection\', \'legacy_init_op\', \'clear_devices\', \'main_op\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'False\', \'None\', \'False\'], "
+    argspec: "args=[\'self\', \'tags\', \'signature_def_map\', \'assets_collection\', \'legacy_init_op\', \'clear_devices\', \'main_op\', \'strip_default_attrs\', \'saver\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'False\', \'None\', \'False\', \'None\'], "
   }
   member_method {
     name: "add_meta_graph_and_variables"
-    argspec: "args=[\'self\', \'sess\', \'tags\', \'signature_def_map\', \'assets_collection\', \'legacy_init_op\', \'clear_devices\', \'main_op\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'False\', \'None\', \'False\'], "
+    argspec: "args=[\'self\', \'sess\', \'tags\', \'signature_def_map\', \'assets_collection\', \'legacy_init_op\', \'clear_devices\', \'main_op\', \'strip_default_attrs\', \'saver\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'False\', \'None\', \'False\', \'None\'], "
   }
   member_method {
     name: "save"
-- 
GitLab


From c4a3763539dbdb2ee08cca99074d78ce3b6d54de Mon Sep 17 00:00:00 2001
From: Suharsh Sivakumar <suharshs@google.com>
Date: Wed, 6 Jun 2018 16:18:44 -0700
Subject: [PATCH 398/610] quantize_weights flag for tflite_convert.

PiperOrigin-RevId: 199549093
---
 tensorflow/contrib/lite/python/convert.py     | 14 +++++---
 tensorflow/contrib/lite/python/lite.py        |  8 ++++-
 tensorflow/contrib/lite/python/lite_test.py   | 32 +++++++++++++++++++
 .../contrib/lite/python/tflite_convert.py     |  8 +++++
 4 files changed, 57 insertions(+), 5 deletions(-)

diff --git a/tensorflow/contrib/lite/python/convert.py b/tensorflow/contrib/lite/python/convert.py
index 0819475240..63c6105b3b 100644
--- a/tensorflow/contrib/lite/python/convert.py
+++ b/tensorflow/contrib/lite/python/convert.py
@@ -123,7 +123,8 @@ def toco_convert(input_data,
                  drop_control_dependency=True,
                  reorder_across_fake_quant=False,
                  allow_custom_ops=False,
-                 change_concat_input_ranges=False):
+                 change_concat_input_ranges=False,
+                 quantize_weights=False):
   """Convert a model using TOCO from `input_format` to `output_format`.
 
   Typically this is to convert from TensorFlow GraphDef to TFLite, in which
@@ -158,14 +159,18 @@ def toco_convert(input_data,
       nodes is preventing graph transformations necessary to convert the graph.
       Results in a graph that differs from the quantized training graph,
       potentially causing differing arithmetic behavior. (default False)
-    change_concat_input_ranges: Boolean to change behavior of min/max ranges for
-      inputs and outputs of the concat operator for quantized models. Changes
-      the ranges of concat operator overlap when true. (default False)
     allow_custom_ops: Boolean indicating whether to allow custom operations.
       When false any unknown operation is an error. When true, custom ops are
       created for any op that is unknown. The developer will need to provide
       these to the TensorFlow Lite runtime with a custom resolver.
       (default False)
+    change_concat_input_ranges: Boolean to change behavior of min/max ranges for
+      inputs and outputs of the concat operator for quantized models. Changes
+      the ranges of concat operator overlap when true. (default False)
+    quantize_weights: Boolean indicating whether to store weights as quantized
+      weights followed by dequantize operations. Computation is still done in
+      float, but reduces model size (at the cost of accuracy and latency).
+      (default False)
 
   Returns:
     The converted data. For example if TFLite was the destination, then
@@ -185,6 +190,7 @@ def toco_convert(input_data,
   toco.drop_control_dependency = drop_control_dependency
   toco.reorder_across_fake_quant = reorder_across_fake_quant
   toco.allow_custom_ops = allow_custom_ops
+  toco.quantize_weights = quantize_weights
   if default_ranges_stats:
     toco.default_ranges_min = default_ranges_stats[0]
     toco.default_ranges_max = default_ranges_stats[1]
diff --git a/tensorflow/contrib/lite/python/lite.py b/tensorflow/contrib/lite/python/lite.py
index 0ccd6675db..253e3f72b1 100644
--- a/tensorflow/contrib/lite/python/lite.py
+++ b/tensorflow/contrib/lite/python/lite.py
@@ -92,6 +92,10 @@ class TocoConverter(object):
       created for any op that is unknown. The developer will need to provide
       these to the TensorFlow Lite runtime with a custom resolver.
       (default False)
+    quantize_weights: Boolean indicating whether to store weights as quantized
+      weights followed by dequantize operations. Computation is still done in
+      float, but reduces model size (at the cost of accuracy and latency).
+      (default False)
 
   Example usage:
 
@@ -133,6 +137,7 @@ class TocoConverter(object):
     self.reorder_across_fake_quant = False
     self.change_concat_input_ranges = False
     self.allow_custom_ops = False
+    self.quantize_weights = False
 
   @classmethod
   def from_session(cls, sess, input_tensors, output_tensors):
@@ -302,7 +307,8 @@ class TocoConverter(object):
         drop_control_dependency=self.drop_control_dependency,
         reorder_across_fake_quant=self.reorder_across_fake_quant,
         change_concat_input_ranges=self.change_concat_input_ranges,
-        allow_custom_ops=self.allow_custom_ops)
+        allow_custom_ops=self.allow_custom_ops,
+        quantize_weights=self.quantize_weights)
     return result
 
   def get_input_arrays(self):
diff --git a/tensorflow/contrib/lite/python/lite_test.py b/tensorflow/contrib/lite/python/lite_test.py
index 019a3a5f69..bbb00021f9 100644
--- a/tensorflow/contrib/lite/python/lite_test.py
+++ b/tensorflow/contrib/lite/python/lite_test.py
@@ -25,9 +25,11 @@ from tensorflow.contrib.lite.python import lite
 from tensorflow.contrib.lite.python import lite_constants
 from tensorflow.contrib.lite.python.interpreter import Interpreter
 from tensorflow.python.client import session
+from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
@@ -291,6 +293,36 @@ class FromSessionTest(test_util.TensorFlowTestCase):
     self.assertTrue(([1, 16, 16, 3] == output_details[0]['shape']).all())
     self.assertTrue(output_details[0]['quantization'][0] > 0)  # scale
 
+  def testQuantizeWeights(self):
+    np.random.seed(0)
+    # We need the tensor to have more than 1024 elements for quantize_weights
+    # to kick in. Thus, the [33, 33] shape.
+    in_tensor_1 = array_ops.placeholder(
+        shape=[33, 33], dtype=dtypes.float32, name='inputA')
+    in_tensor_2 = constant_op.constant(
+        np.random.uniform(low=-10., high=10., size=(33, 33)),
+        shape=[33, 33],
+        dtype=dtypes.float32,
+        name='inputB')
+    out_tensor = math_ops.matmul(in_tensor_1, in_tensor_2, name='output')
+    sess = session.Session()
+
+    # Convert float model.
+    float_converter = lite.TocoConverter.from_session(sess, [in_tensor_1],
+                                                      [out_tensor])
+    float_tflite = float_converter.convert()
+    self.assertTrue(float_tflite)
+
+    # Convert quantized weights model.
+    quantized_weights_converter = lite.TocoConverter.from_session(
+        sess, [in_tensor_1], [out_tensor])
+    quantized_weights_converter.quantize_weights = True
+    quantized_weights_tflite = quantized_weights_converter.convert()
+    self.assertTrue(quantized_weights_tflite)
+
+    # Ensure that the quantized weights tflite model is smaller.
+    self.assertTrue(len(quantized_weights_tflite) < len(float_tflite))
+
 
 class FromFrozenGraphFile(test_util.TensorFlowTestCase):
 
diff --git a/tensorflow/contrib/lite/python/tflite_convert.py b/tensorflow/contrib/lite/python/tflite_convert.py
index 6d77626a4b..2b7ad29a27 100644
--- a/tensorflow/contrib/lite/python/tflite_convert.py
+++ b/tensorflow/contrib/lite/python/tflite_convert.py
@@ -128,6 +128,8 @@ def _convert_model(flags):
     converter.change_concat_input_ranges = flags.change_concat_input_ranges
   if flags.allow_custom_ops:
     converter.allow_custom_ops = flags.allow_custom_ops
+  if flags.quantize_weights:
+    converter.quantize_weights = flags.quantize_weights
 
   # Convert model.
   output_data = converter.convert()
@@ -282,6 +284,12 @@ def run_main(_):
       help=("Default value for max bound of min/max range values used for all "
             "arrays without a specified range, Intended for experimenting with "
             "quantization via \"dummy quantization\". (default None)"))
+  parser.add_argument(
+      "--quantize_weights",
+      type=bool,
+      help=("Store float weights as quantized weights followed by dequantize "
+            "operations. Inference is still done in FLOAT, but reduces model "
+            "size (at the cost of accuracy and latency)."))
 
   # Graph manipulation flags.
   parser.add_argument(
-- 
GitLab


From 032f804a2feca8995185a5fbb9dbc62d5d8df48e Mon Sep 17 00:00:00 2001
From: Yao Zhang <yaozhang@google.com>
Date: Wed, 6 Jun 2018 16:45:55 -0700
Subject: [PATCH 399/610] Add support for dilation. This was previously missed
 and would result in incorrect dilation values for dilated convolutions.

PiperOrigin-RevId: 199554005
---
 .../core/grappler/optimizers/layout_optimizer.cc  |  8 ++++++++
 .../grappler/optimizers/layout_optimizer_test.cc  | 15 ++++++++++-----
 2 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/layout_optimizer.cc b/tensorflow/core/grappler/optimizers/layout_optimizer.cc
index e08ab1eb67..3251e7cb10 100644
--- a/tensorflow/core/grappler/optimizers/layout_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/layout_optimizer.cc
@@ -499,6 +499,7 @@ class NodeProcessor : public GraphProcessor {
       UpdateAttrDataFormat();
       UpdateAttrKSize();
       UpdateAttrStrides();
+      UpdateAttrDilations();
       UpdateAttrShape();
       TF_RETURN_IF_ERROR(AddLayoutTransposeToInputs());
       TF_RETURN_IF_ERROR(AddLayoutTransposeToOutputs());
@@ -742,6 +743,13 @@ class NodeProcessor : public GraphProcessor {
     }
   }
 
+  void UpdateAttrDilations() {
+    if (node_->attr().find("dilations") != node_->attr().end()) {
+      auto list = node_->mutable_attr()->at("dilations").mutable_list();
+      UpdateTuple(list);
+    }
+  }
+
   void UpdateAttrDataFormat() {
     if (node_->attr().find("data_format") != node_->attr().end()) {
       if (node_->attr().at("data_format").s().compare("NHWC") == 0) {
diff --git a/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc b/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc
index dad49cd74f..20e47c1b26 100644
--- a/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/layout_optimizer_test.cc
@@ -87,12 +87,13 @@ class LayoutOptimizerTest : public GrapplerTest {
 
   Output SimpleConv2DBackpropInput(tensorflow::Scope* s, int input_size,
                                    int filter_size, const string& padding) {
-    return SimpleConv2DBackpropInput(s, input_size, filter_size, padding, true);
+    return SimpleConv2DBackpropInput(s, input_size, filter_size, padding, true,
+                                     true);
   }
 
   Output SimpleConv2DBackpropInput(tensorflow::Scope* s, int input_size,
                                    int filter_size, const string& padding,
-                                   bool const_input_size) {
+                                   bool const_input_size, bool dilated) {
     int batch_size = 128;
     int input_height = input_size;
     int input_width = input_size;
@@ -123,14 +124,18 @@ class LayoutOptimizerTest : public GrapplerTest {
     Output conv_backprop_input;
     Output input_sizes_i =
         ops::Identity(s->WithOpName("InputSizesIdentity"), input_sizes);
+    ops::Conv2DBackpropInput::Attrs attrs;
+    if (dilated) {
+      attrs = attrs.Dilations({1, 2, 2, 1});
+    }
     if (const_input_size) {
       conv_backprop_input = ops::Conv2DBackpropInput(
           s->WithOpName("Conv2DBackpropInput"), input_sizes, filter, output,
-          {1, stride, stride, 1}, padding);
+          {1, stride, stride, 1}, padding, attrs);
     } else {
       conv_backprop_input = ops::Conv2DBackpropInput(
           s->WithOpName("Conv2DBackpropInput"), input_sizes_i, filter, output,
-          {1, stride, stride, 1}, padding);
+          {1, stride, stride, 1}, padding, attrs);
     }
     return conv_backprop_input;
   }
@@ -216,7 +221,7 @@ TEST_F(LayoutOptimizerTest, Conv2DBackpropInput) {
 
 TEST_F(LayoutOptimizerTest, Conv2DBackpropInputNonConstInputSizes) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
-  auto conv = SimpleConv2DBackpropInput(&s, 7, 2, "SAME", false);
+  auto conv = SimpleConv2DBackpropInput(&s, 7, 2, "SAME", false, false);
   Output fetch = ops::Identity(s.WithOpName("Fetch"), {conv});
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
-- 
GitLab


From 40a5601d20e2acd2e1301d7a2db376e66ff959ef Mon Sep 17 00:00:00 2001
From: Nupur Garg <nupurgarg@google.com>
Date: Wed, 6 Jun 2018 16:56:37 -0700
Subject: [PATCH 400/610] Updated documentation relating to quantized input
 stats.

PiperOrigin-RevId: 199556088
---
 tensorflow/contrib/lite/python/convert.py        | 7 +++----
 tensorflow/contrib/lite/toco/g3doc/python_api.md | 3 ++-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/tensorflow/contrib/lite/python/convert.py b/tensorflow/contrib/lite/python/convert.py
index 63c6105b3b..08f3f8bf32 100644
--- a/tensorflow/contrib/lite/python/convert.py
+++ b/tensorflow/contrib/lite/python/convert.py
@@ -144,10 +144,9 @@ def toco_convert(input_data,
       `{TENSORFLOW_GRAPHDEF}`. (default TENSORFLOW_GRAPHDEF)
     output_format: Output file format. Currently must be `{TFLITE,
       GRAPHVIZ_DOT}`. (default TFLITE)
-    quantized_input_stats: Dict of strings representing input tensor names
-      mapped to tuple of integers representing the mean and standard deviation
-      of the training data (e.g., {"foo" : (0., 1.)}). Only need if
-      `inference_type` is `QUANTIZED_UINT8`. (default None)
+    quantized_input_stats: List of tuples of integers representing the mean and
+      standard deviation. Each tuple maps to the corresponding input tensor.
+      Only need if `inference_type` is `QUANTIZED_UINT8`. (default None)
     default_ranges_stats: Tuple of integers representing (min, max) range values
       for all arrays without a specified range. Intended for experimenting with
       quantization via "dummy quantization". (default None)
diff --git a/tensorflow/contrib/lite/toco/g3doc/python_api.md b/tensorflow/contrib/lite/toco/g3doc/python_api.md
index 5071361bfd..a7841a6855 100644
--- a/tensorflow/contrib/lite/toco/g3doc/python_api.md
+++ b/tensorflow/contrib/lite/toco/g3doc/python_api.md
@@ -138,7 +138,8 @@ out = tf.fake_quant_with_min_max_args(val, min=0., max=1., name="output")
 with tf.Session() as sess:
   converter = tf.contrib.lite.TocoConverter.from_session(sess, [img], [out])
   converter.inference_type = tf.contrib.lite.constants.QUANTIZED_UINT8
-  converter.quantized_input_stats = {"img" : (0., 1.)}  # mean, std_dev
+  input_arrays = converter.get_input_arrays()
+  converter.quantized_input_stats = {input_arrays[0] : (0., 1.)}  # mean, std_dev
   tflite_model = converter.convert()
   open("converted_model.tflite", "wb").write(tflite_model)
 ```
-- 
GitLab


From 60bd73a0228fc025bc9868d2a8d2404a0676dfd2 Mon Sep 17 00:00:00 2001
From: Sanjoy Das <sanjoy@google.com>
Date: Wed, 6 Jun 2018 16:56:59 -0700
Subject: [PATCH 401/610] Make the LLVM IR GEMM tile size configurable; NFC

PiperOrigin-RevId: 199556158
---
 .../compiler/xla/service/cpu/cpu_options.cc   | 39 +++++++++++++++++++
 .../compiler/xla/service/cpu/cpu_options.h    |  2 +
 .../xla/service/cpu/dot_op_emitter.cc         | 38 ++++++++++++++----
 .../compiler/xla/service/cpu/dot_op_emitter.h |  7 ++++
 4 files changed, 79 insertions(+), 7 deletions(-)

diff --git a/tensorflow/compiler/xla/service/cpu/cpu_options.cc b/tensorflow/compiler/xla/service/cpu/cpu_options.cc
index e75fcb6bc9..3ed7876715 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_options.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_options.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/cpu/cpu_options.h"
 
 #include "tensorflow/core/lib/strings/numbers.h"
+#include "tensorflow/core/lib/strings/str_util.h"
 
 namespace {
 
@@ -24,6 +25,7 @@ const char* const kXlaDisableVectorizedReduce = "xla_disable_vectorized_reduce";
 const char* const kLlvmIrDotTilingFactor = "xla_llvm_dot_tiling_factor";
 const char* const kXlaEnableExperimentalLlvmIrGemm =
     "xla_enable_experimental_llvm_ir_gemm";
+const char* const kLlvmIrGemmTileSize = "xla_llvm_ir_gemm_tile_size";
 
 }  // namespace
 
@@ -62,6 +64,43 @@ bool EnableExperimentalLlvmIrGemm(const HloModuleConfig& config) {
   return extra_options_map.count(kXlaEnableExperimentalLlvmIrGemm) > 0;
 }
 
+static tensorflow::StringPiece RemoveSuffix(tensorflow::StringPiece str,
+                                            tensorflow::StringPiece suffix) {
+  CHECK_GE(str.size(), suffix.size());  // Fatal if `str` is too short.
+  CHECK_EQ(str.substr(str.size() - suffix.size()), suffix);  // Tail must match.
+  return str.substr(0, str.size() - suffix.size());  // `str` minus `suffix`.
+}
+
+tensorflow::gtl::optional<std::tuple<int64, int64, int64>> LlvmIrGemmTileSize(
+    const HloModuleConfig& config) {
+  const auto& extra_options_map =
+      config.debug_options().xla_backend_extra_options();
+  auto it = extra_options_map.find(kLlvmIrGemmTileSize);
+  if (it == extra_options_map.end()) {
+    return tensorflow::gtl::nullopt;
+  }
+  // The value is split on ':' into exactly three components: m, k and n.
+  std::vector<string> tile_components =
+      tensorflow::str_util::Split(it->second, ':');
+  CHECK_EQ(tile_components.size(), 3);
+
+  int64 tile_size_m;
+  int64 tile_size_k;
+  int64 tile_size_n_in_vector_width;
+  // A malformed component is a fatal error (CHECK), not a silent default.
+  CHECK(tensorflow::strings::safe_strto64(tile_components[0], &tile_size_m));
+  CHECK(tensorflow::strings::safe_strto64(tile_components[1], &tile_size_k));
+  // The n component must carry a "*vectwidth" suffix, stripped before parsing.
+  tensorflow::StringPiece tile_size_n_in_vector_width_str =
+      RemoveSuffix(tile_components[2], "*vectwidth");
+
+  CHECK(tensorflow::strings::safe_strto64(tile_size_n_in_vector_width_str,
+                                          &tile_size_n_in_vector_width));
+
+  return std::tuple<int64, int64, int64>(tile_size_m, tile_size_k,
+                                         tile_size_n_in_vector_width);
+}
+
 }  // namespace options
 }  // namespace cpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_options.h b/tensorflow/compiler/xla/service/cpu/cpu_options.h
index 106dfbbc62..429b9e16cb 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_options.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_options.h
@@ -29,6 +29,8 @@ bool VectorizedReduceDisabled(const HloModuleConfig& config);
 bool EnableExperimentalLlvmIrGemm(const HloModuleConfig& config);
 tensorflow::gtl::optional<int64> LlvmIrGemvTilingFactor(
     const HloModuleConfig& config);
+tensorflow::gtl::optional<std::tuple<int64, int64, int64>> LlvmIrGemmTileSize(
+    const HloModuleConfig& config);
 
 }  // namespace options
 }  // namespace cpu
diff --git a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
index c5c95a3c2c..cda623f8e8 100644
--- a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
@@ -665,6 +665,10 @@ class MatrixMatrixBlockPanelEmitter {
   // the largest vector register we will use).  This can be larger than the
   // largest vector register supported by the machine -- LLVM will legalize
   // these large vector widths into legally sized vectors.
+  //
+  // `max_vector_count` is the maximum number of vectors of size
+  // `max_vectorization_width` that we will attempt to process at once.
+  //
   // `min_vectorization_width` is the smallest vector width the emitter will use
   // -- below that it will devolve to using a scalar loop.
   //
@@ -674,12 +678,13 @@ class MatrixMatrixBlockPanelEmitter {
   class Config {
    public:
     explicit Config(PrimitiveType scalar_type, Dimensions dims,
-                    int64 max_vectorization_width,
+                    int64 max_vectorization_width, int64 max_vector_count,
                     int64 min_vectorization_width, int64 tile_size_m,
                     int64 tile_size_k)
         : scalar_type_(scalar_type),
           dims_(dims),
           max_vectorization_width_(max_vectorization_width),
+          max_vector_count_(max_vector_count),
           min_vectorization_width_(min_vectorization_width),
           tile_size_m_(tile_size_m),
           tile_size_k_(tile_size_k) {}
@@ -694,6 +699,7 @@ class MatrixMatrixBlockPanelEmitter {
     PrimitiveType scalar_type() const { return scalar_type_; }
     Dimensions dims() const { return dims_; }
     int64 max_vectorization_width() const { return max_vectorization_width_; }
+    int64 max_vector_count() const { return max_vector_count_; }
     int64 min_vectorization_width() const { return min_vectorization_width_; }
 
     int64 tile_size_m() const { return tile_size_m_; }
@@ -703,6 +709,7 @@ class MatrixMatrixBlockPanelEmitter {
     PrimitiveType scalar_type_;
     Dimensions dims_;
     int64 max_vectorization_width_;
+    int64 max_vector_count_;
     int64 min_vectorization_width_;
     int64 tile_size_m_;
     int64 tile_size_k_;
@@ -721,8 +728,10 @@ class MatrixMatrixBlockPanelEmitter {
         ksl_(ir_builder_) {
     CHECK(max_vectorization_width() > 0 &&
           IsPowerOfTwo(static_cast<uint64>(max_vectorization_width())));
+    CHECK_GT(max_vector_count(), 0);
     CHECK(min_vectorization_width() > 0 &&
           IsPowerOfTwo(static_cast<uint64>(min_vectorization_width())));
+    CHECK_GE(max_vectorization_width(), min_vectorization_width());
     CHECK_GT(tile_size_k(), 0);
   }
 
@@ -759,6 +768,7 @@ class MatrixMatrixBlockPanelEmitter {
   int64 max_vectorization_width() const {
     return config().max_vectorization_width();
   }
+  int64 max_vector_count() const { return config().max_vector_count(); }
   int64 min_vectorization_width() const {
     return config().min_vectorization_width();
   }
@@ -784,7 +794,10 @@ void MatrixMatrixBlockPanelEmitter::HandleResiduesOnN() {
   // the largest remaining extent that is divisible by max_vectorization_width /
   // 2 etc.
 
-  int64 current_vectorization_width = max_vectorization_width();
+  int64 current_vectorization_width =
+      max_vector_count() * max_vectorization_width();
+  int64 current_vector_count = max_vector_count();
+
   int64 n_start = 0;
   while (n_start != dims().n() &&
          current_vectorization_width >= min_vectorization_width()) {
@@ -795,7 +808,13 @@ void MatrixMatrixBlockPanelEmitter::HandleResiduesOnN() {
       HandleResiduesOnK(&vsl, GetInt64(n_start), GetInt64(n_end));
       n_start = n_end;
     }
-    current_vectorization_width /= 2;
+    if (current_vector_count == 1) {
+      current_vectorization_width /= 2;
+    } else {
+      current_vector_count--;
+      current_vectorization_width =
+          current_vector_count * max_vectorization_width();
+    }
   }
 
   if (n_start != dims().n()) {
@@ -1019,16 +1038,21 @@ bool DotOpEmitter::EmitExperimentalGebpDotIfEnabled(
       target, ir_builder_->getInt8(0), size_bytes,
       target_machine_features_.minimum_alignment_for_allocation(size_bytes));
 
-  int64 max_vector_width =
+  int64 max_target_vector_width =
       target_machine_features_.vector_register_num_elements(
           *ir_builder_->GetInsertBlock()->getParent(), primitive_type);
 
+  int64 tile_size_m, tile_size_k, tile_size_n_in_vector_width;
+  std::tie(tile_size_m, tile_size_k, tile_size_n_in_vector_width) =
+      GetGemmTileSize();
+
   MatrixMatrixBlockPanelEmitter::Config config(
       /*scalar_type=*/primitive_type,
       MatrixMatrixBlockPanelEmitter::Dimensions{/*m=*/m, /*k=*/k, /*n=*/n},
-      /*max_vectorization_width=*/max_vector_width,
-      /*min_vectorization_width=*/std::min<int64>(4, max_vector_width),
-      /*tile_size_m=*/3, /*tile_size_k=*/5);
+      /*max_vectorization_width=*/max_target_vector_width,
+      /*max_vector_count=*/tile_size_n_in_vector_width,
+      /*min_vectorization_width=*/std::min<int64>(4, max_target_vector_width),
+      /*tile_size_m=*/tile_size_m, /*tile_size_k=*/tile_size_k);
 
   VLOG(2) << "Emitting GEBP kernel in LLVM IR with config "
           << config.GetCacheKey();
diff --git a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.h b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.h
index d88ccea0db..2effb7fc36 100644
--- a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.h
+++ b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.h
@@ -143,6 +143,13 @@ class DotOpEmitter {
         .value_or(kDefaultTilingFactor);
   }
 
+  std::tuple<int64, int64, int64> GetGemmTileSize() const {
+    const std::tuple<int64, int64, int64> kDefaultTileSize =
+        std::tuple<int64, int64, int64>(3, 5, 1);  // (m, k, n_in_vector_width)
+    return options::LlvmIrGemmTileSize(hlo_module_config_)
+        .value_or(kDefaultTileSize);  // Default applies when option is unset.
+  }
+
   // Returns true if we should use an experimental implementation of GEMM
   // (general matrix matrix multiplication) if possible.
   bool EnableExperimentalLlvmIrGemm() const {
-- 
GitLab


From 4a1889c0da16132da78805c3ea6790b18efe8f6d Mon Sep 17 00:00:00 2001
From: Tatiana Shpeisman <shpeisman@google.com>
Date: Wed, 6 Jun 2018 17:20:23 -0700
Subject: [PATCH 402/610] Code cleanup: use absl::string_view to pass
 string-like objects.

PiperOrigin-RevId: 199559525
---
 tensorflow/core/kernels/mkl_input_conversion_op.cc | 10 +++++-----
 tensorflow/core/util/mkl_util.h                    | 11 ++++-------
 2 files changed, 9 insertions(+), 12 deletions(-)

diff --git a/tensorflow/core/kernels/mkl_input_conversion_op.cc b/tensorflow/core/kernels/mkl_input_conversion_op.cc
index cda1402b03..663228722b 100644
--- a/tensorflow/core/kernels/mkl_input_conversion_op.cc
+++ b/tensorflow/core/kernels/mkl_input_conversion_op.cc
@@ -439,11 +439,11 @@ class MklInputConversionOp : public OpKernel {
                    tensor_out, &net);
       if(!reordered) {
         // This is the case that the TF tensor has the same shape and format of
-        // mkl tensor. However, tf_tensor can not be simply forwarded to the output
-        // tensor since mkl data tensor is always one dimensional tensor. 
-        // Tensor::CopyFrom shares the buffer of the other tensor while set its shape
-        // to the other tensor. 
-        tensor_out->CopyFrom(*tf_tensor, tensor_out->shape());
+        // mkl tensor. However, tf_tensor can not be simply forwarded to the
+        // output tensor since mkl data tensor is always one dimensional tensor.
+        // Tensor::CopyFrom shares the buffer of the other tensor while set its
+        // shape to the other tensor.
+        CHECK(tensor_out->CopyFrom(*tf_tensor, tensor_out->shape()));
       }
       else  
         stream(stream::kind::eager).submit(net).wait();
diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h
index 8105121e7c..8a3ece7b8c 100644
--- a/tensorflow/core/util/mkl_util.h
+++ b/tensorflow/core/util/mkl_util.h
@@ -1813,10 +1813,7 @@ class FactoryKeyCreator {
 
   ~FactoryKeyCreator() {}
 
-  void AddAsKey(const string &str) {
-    auto buffer = reinterpret_cast<const char *>(str.c_str());
-    Append(buffer, str.length());
-  }
+  void AddAsKey(const string& str) { Append(str); }
 
   void AddAsKey(const mkldnn::memory::dims &dims) {
     for (unsigned int i = 0; i < dims.size(); i++) {
@@ -1827,7 +1824,7 @@ class FactoryKeyCreator {
   template <typename T>
   void AddAsKey(const T data) {
     auto buffer = reinterpret_cast<const char *>(&data);
-    Append(buffer, sizeof(T));
+    Append(absl::string_view(buffer, sizeof(T)));
   }
 
   std::string GetKey() {
@@ -1838,8 +1835,8 @@ class FactoryKeyCreator {
   string key_;
   const char delimiter = 'x';
   const int kMaxKeyLength = 256;
-  void Append(const char* data, int len) {
-    key_.append(data, len);
+  void Append(absl::string_view s) {
+    key_.append(string(s));
     key_.append(1, delimiter);
   }
 };
-- 
GitLab


From 068255cc07be8edd6b2b0d36b5dfa2f7959e19bc Mon Sep 17 00:00:00 2001
From: Yuefeng Zhou <yuefengz@google.com>
Date: Wed, 6 Jun 2018 17:21:12 -0700
Subject: [PATCH 403/610] Run cross_tower_ops_test with test sharding.

PiperOrigin-RevId: 199559611
---
 tensorflow/contrib/distribute/python/BUILD | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/contrib/distribute/python/BUILD b/tensorflow/contrib/distribute/python/BUILD
index 1f43a6eed5..1e1d503744 100644
--- a/tensorflow/contrib/distribute/python/BUILD
+++ b/tensorflow/contrib/distribute/python/BUILD
@@ -508,6 +508,7 @@ cuda_py_test(
         "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:test",
     ],
+    shard_count = 15,
     tags = [
         "multi_and_single_gpu",
         "no_pip",
-- 
GitLab


From 86cfb0b27ea07f08d43e9a622da2baf14aa387a0 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 6 Jun 2018 17:26:41 -0700
Subject: [PATCH 404/610] Make the noop returned by tpu.replicate() trigger TPU
 computations.

PiperOrigin-RevId: 199560313
---
 tensorflow/contrib/tpu/python/tpu/tpu.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/tensorflow/contrib/tpu/python/tpu/tpu.py b/tensorflow/contrib/tpu/python/tpu/tpu.py
index 71a5012691..1c482950e6 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu.py
@@ -623,6 +623,11 @@ def split_compile_and_replicate(computation,
 
       vscope.set_use_resource(saved_use_resource)
 
+    # If the computation returns `None`, add `no_op` here so that when user
+    # fetches `no_op` returned by this function, the TPUExecute node will be
+    # triggered.
+    if outputs is None:
+      outputs = (control_flow_ops.no_op(),)
     # If the computation only returned one value, makes it a tuple.
     if not isinstance(outputs, (list, tuple)):
       outputs = (outputs,)
-- 
GitLab


From 68d1fc41f9c9908fc8f849cfa0ffa56d9f651f6a Mon Sep 17 00:00:00 2001
From: Skye Wanderman-Milne <skyewm@google.com>
Date: Wed, 6 Jun 2018 17:31:39 -0700
Subject: [PATCH 405/610] Fix taking higher-order derivatives of cond_v2.

The problem:
When we build the N-th derivative of an op or set of ops, we will
likely end up reconstructing the previous (N-1)-th derivatives (we
could theoretically avoid this by cleverly finding and reusing
previously-constructed gradients as we traverse the forward pass).

In the case of the If op, this means that we end up constructing the
same gradient functions multiple times when taking higher-order
derivatives. Prior to this change, we would always generate the same
function name for the same grad function.

This usually worked because the two functions would be identical, and
we already silently dedup identical functions (this is to ease
importing graphs with functions). However, it occasionally didn't work
because we ended up generating two different FunctionDefs with the
same name (I'm not sure why the FunctionDefs were different, but I'm
guessing it's the unordered_map in the TF_GraphToFunction
implementation).

The solution:
Rather than depend on the subtle deduping behavior, I made the cond_v2
implementation find unique names for all grad functions. This will
result in more functions being generated, but I think it makes the
behavior more obvious.

In addition, this change properly adds the If branch functions to the graph.

PiperOrigin-RevId: 199560887
---
 .../contrib/control_flow/python/cond_v2.py    | 42 +++++++++++--------
 .../control_flow/python/cond_v2_test.py       |  1 -
 tensorflow/python/framework/function.py       | 10 +++++
 3 files changed, 34 insertions(+), 19 deletions(-)

diff --git a/tensorflow/contrib/control_flow/python/cond_v2.py b/tensorflow/contrib/control_flow/python/cond_v2.py
index 70a9af43a5..9ffad9caa9 100644
--- a/tensorflow/contrib/control_flow/python/cond_v2.py
+++ b/tensorflow/contrib/control_flow/python/cond_v2.py
@@ -23,7 +23,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.core.framework import function_pb2
 from tensorflow.python import pywrap_tensorflow as c_api
 from tensorflow.python.framework import c_api_util
 from tensorflow.python.framework import function
@@ -93,9 +92,9 @@ def _IfGrad(op, *grads):  # pylint: disable=invalid-name
   # graphs. These functions will capture tensors from the forward pass
   # functions.
   true_grad_graph = _create_grad_func(
-      true_graph, grads, "%sgrad" % true_graph.name)
+      true_graph, grads, _get_grad_fn_name(true_graph))
   false_grad_graph = _create_grad_func(
-      false_graph, grads, "%sgrad" % false_graph.name)
+      false_graph, grads, _get_grad_fn_name(false_graph))
 
   assert ([t.dtype for t in true_grad_graph.outputs] ==
           [t.dtype for t in false_grad_graph.outputs])
@@ -260,7 +259,6 @@ def _create_new_tf_function(func_graph):
   Returns:
     The name of the new TF_Function.
   """
-  func_graph.name = "%s_" % func_graph.name
   c_func = c_api.TF_GraphToFunction_wrapper(
       func_graph._c_graph,
       compat.as_str(func_graph.name),
@@ -271,20 +269,15 @@ def _create_new_tf_function(func_graph):
       [],
       None,  # opts
       None)  # description
-  c_func = c_api_util.ScopedTFFunction(c_func)
-  c_api.TF_GraphCopyFunction(
-      ops.get_default_graph()._c_graph, c_func.func, None)
-
-  # Add a _DefinedFunction to `Graph._functions` of the outer graph so that
-  # we can access it using `Graph._get_function` later.
-  # TODO(srbs): Consider adding a C API that can return a FunctionDef by name.
-  with c_api_util.tf_buffer() as buffer_:
-    c_api.TF_FunctionToFunctionDef(c_func.func, buffer_)
-    proto_data = c_api.TF_GetBuffer(buffer_)
-  function_def = function_pb2.FunctionDef()
-  function_def.ParseFromString(compat.as_bytes(proto_data))
-  func_graph._outer_graph._functions[
-      func_graph.name] = function._from_definition(function_def)
+  _ = c_api_util.ScopedTFFunction(c_func)
+
+  # TODO(b/109833212): this sucks, we're serializing the TF_Function*,
+  # deserializing it into a Python FunctionDef, then reserializing it to create
+  # a new TF_Function that we add to the graph.
+  fdef = function.function_def_from_tf_function(c_func)
+  defined_func = function._from_definition(fdef)
+  defined_func.add_to_graph(ops.get_default_graph())
+
   return func_graph.name
 
 
@@ -410,6 +403,19 @@ def _create_dummy_params(func_graph, template_tensors):
             for t in template_tensors]
 
 
+def _get_grad_fn_name(func_graph):
+  """Returns a unique name to use for the grad function of `func_graph`."""
+  # Higher-order derivatives rebuild the same grad function several times, so
+  # keep bumping the counter until the name is unused in the default graph.
+  base_name = "%s_grad" % func_graph.name
+  name = base_name
+  counter = 1
+  while ops.get_default_graph()._is_function(name):
+    name = "%s_%s" % (base_name, counter)
+    counter += 1
+  return name
+
+
 def _check_same_outputs(true_graph, false_graph):
   """Raises an error if true_graph and false_graph have different outputs."""
   true_output_types = [t.dtype for t in true_graph.outputs]
diff --git a/tensorflow/contrib/control_flow/python/cond_v2_test.py b/tensorflow/contrib/control_flow/python/cond_v2_test.py
index 7e299d1ad6..dcecefb520 100644
--- a/tensorflow/contrib/control_flow/python/cond_v2_test.py
+++ b/tensorflow/contrib/control_flow/python/cond_v2_test.py
@@ -82,7 +82,6 @@ class NewCondTest(test.TestCase):
     self._testCond(true_fn, false_fn, [y])
 
   def testSecondDerivative(self):
-    self.skipTest("b/109758172")
     pred = array_ops.placeholder(dtypes.bool, name="pred")
     x = constant_op.constant(3.0, name="x")
 
diff --git a/tensorflow/python/framework/function.py b/tensorflow/python/framework/function.py
index 79ee57355d..82ecba310b 100644
--- a/tensorflow/python/framework/function.py
+++ b/tensorflow/python/framework/function.py
@@ -1172,3 +1172,13 @@ _DTYPE_TO_STR = {
     dtypes.qint32: "qi32",
     dtypes.bfloat16: "b16"
 }
+
+
+def function_def_from_tf_function(c_func):
+  """Converts a SWIG-wrapped TF_Function* to a FunctionDef proto."""
+  with c_api_util.tf_buffer() as buf:
+    c_api.TF_FunctionToFunctionDef(c_func, buf)  # serialize `c_func` into `buf`
+    data = c_api.TF_GetBuffer(buf)  # read the bytes inside the `with` block
+  fdef = function_pb2.FunctionDef()
+  fdef.ParseFromString(compat.as_bytes(data))  # rebuild the proto in Python
+  return fdef
-- 
GitLab


From cf6e7096f5ffab77418ffd2e084972d99801d4f2 Mon Sep 17 00:00:00 2001
From: Skye Wanderman-Milne <skyewm@google.com>
Date: Wed, 6 Jun 2018 17:34:06 -0700
Subject: [PATCH 406/610] Remove _USE_C_API test_util methods now that the C
 API is enabled by default.

This is in preparation for removing the _USE_C_API toggle altogether.

PiperOrigin-RevId: 199561250
---
 tensorflow/python/framework/test_util.py | 99 +-----------------------
 1 file changed, 3 insertions(+), 96 deletions(-)

diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py
index 0c06d9aa41..4a6146e0a6 100644
--- a/tensorflow/python/framework/test_util.py
+++ b/tensorflow/python/framework/test_util.py
@@ -321,32 +321,6 @@ def NCHWToNHWC(input_tensor):
     return [input_tensor[a] for a in new_axes[ndims]]
 
 
-# TODO(skyewm): remove this eventually
-# pylint: disable=protected-access
-def _use_c_api_wrapper(fn, use_c_api, *args, **kwargs):
-  prev_value = ops._USE_C_API
-  ops._USE_C_API = use_c_api
-  try:
-    # Reset the default graph so it has the C API enabled. We call
-    # reset_default_graph() instead of creating a new default Graph context to
-    # make this robust to tests that call reset_default_graph(), which requires
-    # that the current default graph isn't nested.
-    ops.reset_default_graph()
-    fn(*args, **kwargs)
-  finally:
-    ops._USE_C_API = prev_value
-    # Make sure default graph reflects prev_value in case next test doesn't call
-    # reset_default_graph().
-    ops.reset_default_graph()
-
-
-# pylint: disable=protected-access
-
-
-def c_api_and_cuda_enabled():
-  return ops._USE_C_API and IsGoogleCudaEnabled()
-
-
 def skip_if(condition):
   """Skips the decorated function if condition is or evaluates to True.
 
@@ -372,46 +346,6 @@ def skip_if(condition):
   return real_skip_if
 
 
-# TODO(skyewm): remove this eventually
-def disable_c_api(fn):
-  """Decorator for disabling the C API on a test.
-
-  Note this disables the C API after running the test class's setup/teardown
-  methods.
-
-  Args:
-    fn: the function to be wrapped
-
-  Returns:
-    The wrapped function
-  """
-
-  def wrapper(*args, **kwargs):
-    _use_c_api_wrapper(fn, False, *args, **kwargs)
-
-  return wrapper
-
-
-# TODO(skyewm): remove this eventually
-def enable_c_api(fn):
-  """Decorator for enabling the C API on a test.
-
-  Note this enables the C API after running the test class's setup/teardown
-  methods.
-
-  Args:
-    fn: the function to be wrapped
-
-  Returns:
-    The wrapped function
-  """
-
-  def wrapper(*args, **kwargs):
-    _use_c_api_wrapper(fn, True, *args, **kwargs)
-
-  return wrapper
-
-
 def enable_c_shapes(fn):
   """Decorator for enabling C shapes on a test.
 
@@ -425,46 +359,19 @@ def enable_c_shapes(fn):
     The wrapped function
   """
 
+  # pylint: disable=protected-access
   def wrapper(*args, **kwargs):
     prev_value = ops._USE_C_SHAPES
-    # Only use C shapes if the C API is already enabled.
-    ops._USE_C_SHAPES = ops._USE_C_API
+    ops._USE_C_SHAPES = True
     try:
       fn(*args, **kwargs)
     finally:
       ops._USE_C_SHAPES = prev_value
+  # pylint: enable=protected-access
 
   return wrapper
 
 
-# This decorator is a hacky way to run all the test methods in a decorated
-# class with and without C API enabled.
-# TODO(iga): Remove this and its uses once we switch to using C API by default.
-def with_c_api(cls):
-  """Adds methods that call original methods but with C API enabled.
-
-  Note this enables the C API in new methods after running the test class's
-  setup method. This can be a problem if some objects are created in it
-  before the C API is enabled.
-
-  Args:
-    cls: class to decorate
-
-  Returns:
-    cls with new test methods added
-  """
-  # If the C API is already enabled, don't do anything. Some tests break if the
-  # same test is run twice, so this allows us to turn on the C API by default
-  # without breaking these tests.
-  if ops._USE_C_API:
-    return cls
-
-  for name, value in cls.__dict__.copy().items():
-    if callable(value) and name.startswith("test"):
-      setattr(cls, name + "WithCApi", enable_c_api(value))
-  return cls
-
-
 def with_c_shapes(cls):
   """Adds methods that call original methods but with C API shapes enabled.
 
-- 
GitLab


From f6ead2178d920dcc4876b4e154900b218056555f Mon Sep 17 00:00:00 2001
From: Francois Chollet <fchollet@google.com>
Date: Wed, 6 Jun 2018 18:11:33 -0700
Subject: [PATCH 407/610] Download tf.keras datasets from GCS and add license
 information.

PiperOrigin-RevId: 199565413
---
 tensorflow/python/keras/datasets/boston_housing.py |  3 ++-
 tensorflow/python/keras/datasets/fashion_mnist.py  |  8 +++++++-
 tensorflow/python/keras/datasets/imdb.py           |  6 ++++--
 tensorflow/python/keras/datasets/mnist.py          | 10 +++++++++-
 tensorflow/python/keras/datasets/reuters.py        |  6 ++++--
 5 files changed, 26 insertions(+), 7 deletions(-)

diff --git a/tensorflow/python/keras/datasets/boston_housing.py b/tensorflow/python/keras/datasets/boston_housing.py
index 8c043638c0..4c4cab8c08 100644
--- a/tensorflow/python/keras/datasets/boston_housing.py
+++ b/tensorflow/python/keras/datasets/boston_housing.py
@@ -39,9 +39,10 @@ def load_data(path='boston_housing.npz', test_split=0.2, seed=113):
       Tuple of Numpy arrays: `(x_train, y_train), (x_test, y_test)`.
   """
   assert 0 <= test_split < 1
+  origin_folder = 'https://storage.googleapis.com/tensorflow/tf-keras-datasets/'
   path = get_file(
       path,
-      origin='https://s3.amazonaws.com/keras-datasets/boston_housing.npz',
+      origin=origin_folder + 'boston_housing.npz',
       file_hash=
       'f553886a1f8d56431e820c5b82552d9d95cfcb96d1e678153f8839538947dff5')
   f = np.load(path)
diff --git a/tensorflow/python/keras/datasets/fashion_mnist.py b/tensorflow/python/keras/datasets/fashion_mnist.py
index 45e27aad34..3f4c6c7413 100644
--- a/tensorflow/python/keras/datasets/fashion_mnist.py
+++ b/tensorflow/python/keras/datasets/fashion_mnist.py
@@ -33,9 +33,15 @@ def load_data():
 
   Returns:
       Tuple of Numpy arrays: `(x_train, y_train), (x_test, y_test)`.
+
+  License:
+      The copyright for Fashion-MNIST is held by Zalando SE.
+      Fashion-MNIST is licensed under the [MIT license](
+      https://github.com/zalandoresearch/fashion-mnist/blob/master/LICENSE).
+
   """
   dirname = os.path.join('datasets', 'fashion-mnist')
-  base = 'http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/'
+  base = 'https://storage.googleapis.com/tensorflow/tf-keras-datasets/'
   files = [
       'train-labels-idx1-ubyte.gz', 'train-images-idx3-ubyte.gz',
       't10k-labels-idx1-ubyte.gz', 't10k-images-idx3-ubyte.gz'
diff --git a/tensorflow/python/keras/datasets/imdb.py b/tensorflow/python/keras/datasets/imdb.py
index 411b3e8635..b73b024162 100644
--- a/tensorflow/python/keras/datasets/imdb.py
+++ b/tensorflow/python/keras/datasets/imdb.py
@@ -77,9 +77,10 @@ def load_data(path='imdb.npz',
   if kwargs:
     raise TypeError('Unrecognized keyword arguments: ' + str(kwargs))
 
+  origin_folder = 'https://storage.googleapis.com/tensorflow/tf-keras-datasets/'
   path = get_file(
       path,
-      origin='https://s3.amazonaws.com/text-datasets/imdb.npz',
+      origin=origin_folder + 'imdb.npz',
       file_hash='599dadb1135973df5b59232a0e9a887c')
   with np.load(path) as f:
     x_train, labels_train = f['x_train'], f['y_train']
@@ -140,9 +141,10 @@ def get_word_index(path='imdb_word_index.json'):
   Returns:
       The word index dictionary.
   """
+  origin_folder = 'https://storage.googleapis.com/tensorflow/tf-keras-datasets/'
   path = get_file(
       path,
-      origin='https://s3.amazonaws.com/text-datasets/imdb_word_index.json',
+      origin=origin_folder + 'imdb_word_index.json',
       file_hash='bfafd718b763782e994055a2d397834f')
   with open(path) as f:
     return json.load(f)
diff --git a/tensorflow/python/keras/datasets/mnist.py b/tensorflow/python/keras/datasets/mnist.py
index 631189731a..03564accc7 100644
--- a/tensorflow/python/keras/datasets/mnist.py
+++ b/tensorflow/python/keras/datasets/mnist.py
@@ -34,10 +34,18 @@ def load_data(path='mnist.npz'):
 
   Returns:
       Tuple of Numpy arrays: `(x_train, y_train), (x_test, y_test)`.
+
+  License:
+      Yann LeCun and Corinna Cortes hold the copyright of the MNIST dataset,
+      which is a derivative work from the original NIST datasets.
+      The MNIST dataset is made available under the terms of the
+      [Creative Commons Attribution-Share Alike 3.0 license.](
+      https://creativecommons.org/licenses/by-sa/3.0/)
   """
+  origin_folder = 'https://storage.googleapis.com/tensorflow/tf-keras-datasets/'
   path = get_file(
       path,
-      origin='https://s3.amazonaws.com/img-datasets/mnist.npz',
+      origin=origin_folder + 'mnist.npz',
       file_hash='8a61469f7ea1b51cbae51d4f78837e45')
   f = np.load(path)
   x_train, y_train = f['x_train'], f['y_train']
diff --git a/tensorflow/python/keras/datasets/reuters.py b/tensorflow/python/keras/datasets/reuters.py
index b070ba8d12..2120b4b242 100644
--- a/tensorflow/python/keras/datasets/reuters.py
+++ b/tensorflow/python/keras/datasets/reuters.py
@@ -75,9 +75,10 @@ def load_data(path='reuters.npz',
   if kwargs:
     raise TypeError('Unrecognized keyword arguments: ' + str(kwargs))
 
+  origin_folder = 'https://storage.googleapis.com/tensorflow/tf-keras-datasets/'
   path = get_file(
       path,
-      origin='https://s3.amazonaws.com/text-datasets/reuters.npz',
+      origin=origin_folder + 'reuters.npz',
       file_hash='87aedbeb0cb229e378797a632c1997b6')
   with np.load(path) as f:
     xs, labels = f['x'], f['y']
@@ -124,9 +125,10 @@ def get_word_index(path='reuters_word_index.json'):
   Returns:
       The word index dictionary.
   """
+  origin_folder = 'https://storage.googleapis.com/tensorflow/tf-keras-datasets/'
   path = get_file(
       path,
-      origin='https://s3.amazonaws.com/text-datasets/reuters_word_index.json',
+      origin=origin_folder + 'reuters_word_index.json',
       file_hash='4d44cc38712099c9e383dc6e5f11a921')
   f = open(path)
   data = json.load(f)
-- 
GitLab


From 8c649dd05d97c015150abcffc2641076668966e5 Mon Sep 17 00:00:00 2001
From: Akshay Modi <nareshmodi@google.com>
Date: Wed, 6 Jun 2018 18:12:02 -0700
Subject: [PATCH 408/610] Automated g4 rollback of changelist 199476694

PiperOrigin-RevId: 199565455
---
 tensorflow/contrib/distribute/python/BUILD | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/contrib/distribute/python/BUILD b/tensorflow/contrib/distribute/python/BUILD
index 1e1d503744..19ec2965fb 100644
--- a/tensorflow/contrib/distribute/python/BUILD
+++ b/tensorflow/contrib/distribute/python/BUILD
@@ -312,6 +312,7 @@ cuda_py_test(
     tags = [
         "multi_and_single_gpu",
         "no_pip",
+        "noguitar",  # TODO(b/109653107): test is flaky.
     ],
 )
 
-- 
GitLab


From 74fd9ce659c959a322598d5c64f1c4f3f6e871a5 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 6 Jun 2018 18:31:41 -0700
Subject: [PATCH 409/610] Update variable recording and add benchmark with
 defun.

PiperOrigin-RevId: 199567244
---
 .../eager/python/examples/l2hmc/l2hmc.py      |  98 ++-------
 .../eager/python/examples/l2hmc/l2hmc_test.py | 202 +++++++++++++-----
 .../python/examples/l2hmc/neural_nets.py      |   2 -
 3 files changed, 173 insertions(+), 129 deletions(-)

diff --git a/tensorflow/contrib/eager/python/examples/l2hmc/l2hmc.py b/tensorflow/contrib/eager/python/examples/l2hmc/l2hmc.py
index 98b4ce1b26..729d8525fa 100644
--- a/tensorflow/contrib/eager/python/examples/l2hmc/l2hmc.py
+++ b/tensorflow/contrib/eager/python/examples/l2hmc/l2hmc.py
@@ -57,11 +57,6 @@ class Dynamics(tf.keras.Model):
     self.eps = tfe.Variable(
         initial_value=eps, name="eps", dtype=tf.float32, trainable=True)
 
-    # TODO(lxuechen): Remove this after model.add_weight is in place
-    self.vars_not_in_layers = [self.eps]
-    self.vars_not_in_layers += self.position_fn.vars_not_in_layers
-    self.vars_not_in_layers += self.momentum_fn.vars_not_in_layers
-
   def apply_transition(self, position):
     """Propose a new state and perform the accept or reject step."""
 
@@ -290,86 +285,35 @@ class Dynamics(tf.keras.Model):
     return grad
 
 
-# Defining loss and grads for training
-def compute_loss(x, dynamics, scale=.1, eps=1e-4):
-  """Compute loss defined in equation (8)."""
-
-  z = tf.random_normal(tf.shape(x))
-  x_, _, x_accept_prob, x_out = dynamics.apply_transition(x)
-  z_, _, z_accept_prob, _ = dynamics.apply_transition(z)
-
-  # Add eps for numerical stability; following released impl
-  x_loss = tf.reduce_sum((x - x_)**2, axis=1) * x_accept_prob + eps
-  z_loss = tf.reduce_sum((z - z_)**2, axis=1) * z_accept_prob + eps
-
-  loss = tf.reduce_mean(
-      (1. / x_loss + 1. / z_loss) * scale - (x_loss + z_loss) / scale, axis=0)
-
-  return loss, x_out
-
-
-def loss_and_grads(x, dynamics):
-  """Obtain loss value and gradients."""
-
-  with tf.GradientTape() as tape:
-    loss_val, x_out = compute_loss(x, dynamics)
-
-  vars_ = dynamics.variables + dynamics.vars_not_in_layers
-  grads = tape.gradient(loss_val, vars_)
-
-  return loss_val, grads, x_out
-
-
-def warmup(dynamics, optimizer, n_iters=1, n_samples=200):
-  """Warmup optimization to reduce overhead."""
-
-  samples = tf.random_normal(
-      shape=[n_samples, dynamics.x_dim], dtype=tf.float32)
-
-  for _ in range(n_iters):
-    _, grads, samples = loss_and_grads(samples, dynamics)
-    vars_ = dynamics.variables + dynamics.vars_not_in_layers
-    optimizer.apply_gradients(zip(grads, vars_))
-
-
-def fit(dynamics,
-        optimizer,
-        n_samples=200,
-        n_iters=5000,
-        verbose=True,
-        logdir=None):
-  """Fit L2HMC sampler with given log-likelihood function."""
-
-  if logdir:
-    summary_writer = tf.contrib.summary.create_file_writer(logdir)
+# Examples of unnormalized log density/probabilities
+def get_scg_energy_fn():
+  """Get energy function for 2d strongly correlated Gaussian."""
 
-  samples = tf.random_normal(
-      shape=[n_samples, dynamics.x_dim], dtype=tf.float32)
+  # Avoid recreating tf constants on each invocation of gradients
+  mu = tf.constant([0., 0.])
+  sigma = tf.constant([[50.05, -49.95], [-49.95, 50.05]])
+  sigma_inv = tf.matrix_inverse(sigma)
 
-  tf.train.get_or_create_global_step()
-  for i in range(n_iters):
-    loss, grads, samples = loss_and_grads(samples, dynamics)
-    # TODO(lxuechen): Proper learning rate decay
-    grads_ = [grad * .96**(i // 1000) for grad in grads]
-    vars_ = dynamics.variables + dynamics.vars_not_in_layers
-    optimizer.apply_gradients(
-        zip(grads_, vars_), global_step=tf.train.get_global_step())
+  def energy(x):
+    """Unnormalized log density/energy of 2d strongly correlated Gaussian."""
 
-    if verbose:
-      print("Iteration %d: loss %.4f" % (i, loss))
+    xmmu = x - mu
+    return .5 * tf.diag_part(
+        tf.matmul(tf.matmul(xmmu, sigma_inv), tf.transpose(xmmu)))
 
-    if logdir:
-      with summary_writer.as_default():
-        with tf.contrib.summary.always_record_summaries():
-          tf.contrib.summary.scalar("loss", loss)
+  return energy
 
 
-def get_scg_energy_fn():
+def get_multivariate_gaussian_energy_fn(x_dim=2):
   """Get energy function for 2d strongly correlated Gaussian."""
 
-  # Avoid recreating tf constants on each invocation of gradients
-  mu = tf.constant([0., 0.])
-  sigma = tf.constant([[50.05, -49.95], [-49.95, 50.05]])
+  mu = tf.random_normal(shape=[x_dim])
+  # Mask to the lower triangle, then sigmoid (masked zeros become 0.5)
+  l = tf.sigmoid(
+      tf.matrix_band_part(tf.random_normal(shape=[x_dim, x_dim]), -1, 0))
+  # Exploit Cholesky decomposition
+  sigma = tf.matmul(l, tf.transpose(l))
+  sigma *= 100.  # Small covariance causes extreme numerical instability
   sigma_inv = tf.matrix_inverse(sigma)
 
   def energy(x):
diff --git a/tensorflow/contrib/eager/python/examples/l2hmc/l2hmc_test.py b/tensorflow/contrib/eager/python/examples/l2hmc/l2hmc_test.py
index 522a7c9380..e33b4cae4c 100644
--- a/tensorflow/contrib/eager/python/examples/l2hmc/l2hmc_test.py
+++ b/tensorflow/contrib/eager/python/examples/l2hmc/l2hmc_test.py
@@ -32,16 +32,83 @@ def get_default_hparams():
       n_samples=200,
       n_steps=10,
       eps=.1,
-      n_iters=5,
-      learning_rate=.001,
-      n_warmup_iters=1)
+      n_iters=10,
+      learning_rate=.0003,
+      n_warmup_iters=3)
+
+
+# Relevant functions for benchmarking
+def compute_loss(dynamics, x, scale=.1, eps=1e-4):
+  """Compute loss defined in equation (8)."""
+
+  z = tf.random_normal(tf.shape(x))
+  x_, _, x_accept_prob, x_out = dynamics.apply_transition(x)
+  z_, _, z_accept_prob, _ = dynamics.apply_transition(z)
+
+  # Add eps for numerical stability; following released impl
+  x_loss = tf.reduce_sum((x - x_)**2, axis=1) * x_accept_prob + eps
+  z_loss = tf.reduce_sum((z - z_)**2, axis=1) * z_accept_prob + eps
+
+  loss = tf.reduce_mean(
+      (1. / x_loss + 1. / z_loss) * scale - (x_loss + z_loss) / scale, axis=0)
+
+  return loss, x_out
+
+
+def loss_and_grads(dynamics, x, loss_fn=compute_loss):
+  """Obtain loss value and gradients."""
+
+  with tf.GradientTape() as tape:
+    loss_val, x_out = loss_fn(dynamics, x)
+  grads = tape.gradient(loss_val, dynamics.variables)
+
+  return loss_val, grads, x_out
+
+
+def warmup(dynamics, optimizer, n_iters=1, n_samples=200, loss_fn=compute_loss):
+  """Warmup optimization to reduce overhead."""
+
+  samples = tf.random_normal(
+      shape=[n_samples, dynamics.x_dim], dtype=tf.float32)
+
+  for _ in range(n_iters):
+    _, grads, samples = loss_and_grads(dynamics, samples, loss_fn=loss_fn)
+    optimizer.apply_gradients(zip(grads, dynamics.variables))
+
+
+def fit(dynamics,
+        samples,
+        optimizer,
+        loss_fn=compute_loss,
+        n_iters=5000,
+        verbose=True,
+        logdir=None,
+        decay_lr=True):
+  """Fit L2HMC sampler with given log-likelihood function."""
+
+  if logdir:
+    summary_writer = tf.contrib.summary.create_file_writer(logdir)
+
+  for i in range(n_iters):
+    loss, grads, samples = loss_and_grads(dynamics, samples, loss_fn=loss_fn)
+    # TODO(lxuechen): Proper learning rate decay
+    if decay_lr:
+      grads = [grad * .96**(i // 1000) for grad in grads]
+    optimizer.apply_gradients(zip(grads, dynamics.variables))
+    if verbose:
+      print("Iteration %d: loss %.4f" % (i, loss))
+
+    if logdir:
+      with summary_writer.as_default():
+        with tf.contrib.summary.always_record_summaries():
+          tf.contrib.summary.scalar("loss", loss)
 
 
 class L2hmcTest(tf.test.TestCase):
   """Unit tests for l2hmc in both eager and graph mode."""
 
-  def testComputeLoss(self):
-    """Testing function l2hmc.compute_loss in both graph and eager mode."""
+  def test_apply_transition(self):
+    """Testing function `Dynamics.apply_transition` in graph and eager mode."""
 
     # Eager mode testing
     hparams = get_default_hparams()
@@ -51,12 +118,12 @@ class L2hmcTest(tf.test.TestCase):
         n_steps=hparams.n_steps,
         eps=hparams.eps)
     samples = tf.random_normal(shape=[hparams.n_samples, hparams.x_dim])
-    loss, x_out = l2hmc.compute_loss(samples, dynamics)
+    x_, v_, x_accept_prob, x_out = dynamics.apply_transition(samples)
 
-    # Check shape and numerical stability
+    self.assertEqual(x_.shape, v_.shape)
     self.assertEqual(x_out.shape, samples.shape)
-    self.assertEqual(loss.shape, [])
-    self.assertAllClose(loss.numpy(), loss.numpy(), rtol=1e-5)
+    self.assertEqual(x_.shape, x_out.shape)
+    self.assertEqual(x_accept_prob.shape, (hparams.n_samples,))
 
     # Graph mode testing
     with tf.Graph().as_default():
@@ -66,65 +133,49 @@ class L2hmcTest(tf.test.TestCase):
           n_steps=hparams.n_steps,
           eps=hparams.eps)
       x = tf.placeholder(tf.float32, shape=[None, hparams.x_dim])
-      loss, x_out = l2hmc.compute_loss(x, dynamics)
+      x_, v_, x_accept_prob, x_out = dynamics.apply_transition(x)
       samples = npr.normal(size=[hparams.n_samples, hparams.x_dim])
 
       with tf.Session() as sess:
         sess.run(tf.global_variables_initializer())
-        loss_np, x_out_np = sess.run([loss, x_out], feed_dict={x: samples})
+        np_x_, np_v_, np_x_accept_prob, np_x_out = sess.run(
+            [x_, v_, x_accept_prob, x_out], feed_dict={x: samples})
 
-        # Check shape and numerical stability
-        self.assertEqual(x_out_np.shape, samples.shape)
-        self.assertEqual(loss_np.shape, ())
-        self.assertAllClose(loss_np, loss_np, rtol=1e-5)
+        self.assertEqual(np_x_.shape, np_v_.shape)
+        self.assertEqual(samples.shape, np_x_out.shape)
+        self.assertEqual(np_x_.shape, np_x_out.shape)
+        self.assertEqual(np_x_accept_prob.shape, (hparams.n_samples,))
 
 
 class L2hmcBenchmark(tf.test.Benchmark):
   """Eager and graph benchmarks for l2hmc."""
 
-  def benchmarkEagerL2hmc(self):
-    """Benchmark Eager performance."""
-
-    hparams = get_default_hparams()
-    dynamics = l2hmc.Dynamics(
-        x_dim=hparams.x_dim,
-        loglikelihood_fn=l2hmc.get_scg_energy_fn(),
-        n_steps=hparams.n_steps,
-        eps=hparams.eps)
-    # TODO(lxuechen): Add learning rate decay
-    optimizer = tf.train.AdamOptimizer(learning_rate=hparams.learning_rate)
-
-    # Warmup to reduce initialization effect when timing
-    l2hmc.warmup(dynamics, optimizer, n_iters=hparams.n_warmup_iters)
+  def _get_energy_fn(self):
+    """Get specific energy function according to FLAGS."""
 
-    # Time
-    start_time = time.time()
-    l2hmc.fit(
-        dynamics,
-        optimizer,
-        n_samples=hparams.n_samples,
-        n_iters=hparams.n_iters)
-    wall_time = time.time() - start_time
-    examples_per_sec = hparams.n_samples / wall_time
+    if FLAGS.energy_fn == "scg":
+      energy_fn = l2hmc.get_scg_energy_fn()
+    elif FLAGS.energy_fn == "multivariate_gaussian":
+      energy_fn = l2hmc.get_multivariate_gaussian_energy_fn(x_dim=FLAGS.x_dim)
+    else:
+      raise ValueError("No such energy function %s" % FLAGS.energy_fn)
 
-    self.report_benchmark(
-        name="eager_train_%s" % ("gpu" if tfe.num_gpus() > 0 else "cpu"),
-        iters=hparams.n_iters,
-        extras={"examples_per_sec": examples_per_sec},
-        wall_time=wall_time)
+    return energy_fn
 
-  def benchmarkGraphL2hmc(self):
+  def benchmark_graph(self):
     """Benchmark Graph performance."""
 
     hparams = get_default_hparams()
+    tf.reset_default_graph()
     with tf.Graph().as_default():
+      energy_fn = self._get_energy_fn()
       dynamics = l2hmc.Dynamics(
           x_dim=hparams.x_dim,
-          loglikelihood_fn=l2hmc.get_scg_energy_fn(),
+          loglikelihood_fn=energy_fn,
           n_steps=hparams.n_steps,
           eps=hparams.eps)
       x = tf.placeholder(tf.float32, shape=[None, hparams.x_dim])
-      loss, x_out = l2hmc.compute_loss(x, dynamics)
+      loss, x_out = compute_loss(dynamics, x)
 
       global_step = tf.Variable(0., name="global_step", trainable=False)
       learning_rate = tf.train.exponential_decay(
@@ -138,14 +189,15 @@ class L2hmcBenchmark(tf.test.Benchmark):
         # Warmup to reduce initialization effect when timing
         samples = npr.normal(size=[hparams.n_samples, hparams.x_dim])
         for _ in range(hparams.n_warmup_iters):
-          samples, _, _, _ = sess.run(
+          _, _, _, _ = sess.run(
               [x_out, loss, train_op, learning_rate], feed_dict={x: samples})
 
-        # Time
+        # Training
         start_time = time.time()
-        for _ in range(hparams.n_iters):
-          samples, _, _, _ = sess.run(
+        for i in range(hparams.n_iters):
+          samples, loss_np, _, _ = sess.run(
               [x_out, loss, train_op, learning_rate], feed_dict={x: samples})
+          print("Iteration %d: loss %.4f" % (i, loss_np))
         wall_time = time.time() - start_time
         examples_per_sec = hparams.n_samples / wall_time
 
@@ -156,7 +208,57 @@ class L2hmcBenchmark(tf.test.Benchmark):
             extras={"examples_per_sec": examples_per_sec},
             wall_time=wall_time)
 
+  def benchmark_eager(self):
+    self._benchmark_eager()
+
+  def benchmark_eager_defun(self):
+    self._benchmark_eager(defun=True)
+
+  def _benchmark_eager(self, defun=False):
+    """Benchmark Eager performance."""
+
+    hparams = get_default_hparams()
+    energy_fn = self._get_energy_fn()
+    dynamics = l2hmc.Dynamics(
+        x_dim=hparams.x_dim,
+        loglikelihood_fn=energy_fn,
+        n_steps=hparams.n_steps,
+        eps=hparams.eps)
+    optimizer = tf.train.AdamOptimizer(learning_rate=hparams.learning_rate)
+    loss_fn = tfe.defun(compute_loss) if defun else compute_loss
+
+    # Warmup to reduce initialization effect when timing
+    warmup(dynamics, optimizer, n_iters=hparams.n_warmup_iters, loss_fn=loss_fn)
+
+    # Training
+    samples = tf.random_normal(
+        shape=[hparams.n_samples, hparams.x_dim], dtype=tf.float32)
+    start_time = time.time()
+    fit(dynamics,
+        samples,
+        optimizer,
+        loss_fn=loss_fn,
+        n_iters=hparams.n_iters,
+        decay_lr=True)
+    wall_time = time.time() - start_time
+    examples_per_sec = hparams.n_samples / wall_time
+
+    self.report_benchmark(
+        name="eager_train_%s%s" % ("gpu" if tf.test.is_gpu_available() else
+                                   "cpu", "_defun" if defun else ""),
+        iters=hparams.n_iters,
+        extras={"examples_per_sec": examples_per_sec},
+        wall_time=wall_time)
+
+    del dynamics
+    del loss_fn
+
 
 if __name__ == "__main__":
+  tf.flags.DEFINE_string("energy_fn", "scg",
+                         ("The energy function/unnormalized log-probability. "
+                          "Either be `scg` or `multivariate_gaussian`"))
+  tf.flags.DEFINE_integer("x_dim", 2, "Dimensionality of observation space.")
+  FLAGS = tf.flags.FLAGS
   tf.enable_eager_execution()
   tf.test.main()
diff --git a/tensorflow/contrib/eager/python/examples/l2hmc/neural_nets.py b/tensorflow/contrib/eager/python/examples/l2hmc/neural_nets.py
index c902e1f1f4..e230ad5e25 100644
--- a/tensorflow/contrib/eager/python/examples/l2hmc/neural_nets.py
+++ b/tensorflow/contrib/eager/python/examples/l2hmc/neural_nets.py
@@ -57,8 +57,6 @@ class GenericNet(tf.keras.Model):
         initial_value=tf.zeros([1, x_dim]),
         name='coeff_transformation',
         trainable=True)
-    # TODO(lxuechen): Remove this after model.add_weight is in place
-    self.vars_not_in_layers = [self.coeff_scale, self.coeff_transformation]
 
   def call(self, inputs):
     v, x, t = inputs
-- 
GitLab


From cccbb9b7d4b1e9df592faca1d590a3484661496b Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Wed, 6 Jun 2018 18:39:13 -0700
Subject: [PATCH 410/610] Cache the rematerializable status.

PiperOrigin-RevId: 199567935
---
 .../xla/service/hlo_rematerialization.cc      | 29 +++++++++++++++----
 1 file changed, 23 insertions(+), 6 deletions(-)

diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization.cc b/tensorflow/compiler/xla/service/hlo_rematerialization.cc
index 39b85de0f1..bd1d9935bd 100644
--- a/tensorflow/compiler/xla/service/hlo_rematerialization.cc
+++ b/tensorflow/compiler/xla/service/hlo_rematerialization.cc
@@ -71,6 +71,20 @@ bool IsRematerializable(const HloInstruction* instruction) {
   }
 }
 
+// Checks whether an instruction can be rematerialized. Consults the cache
+// first; on a miss, calls IsRematerializable() and caches the result.
+bool CanBeRematerialized(
+    const HloInstruction* instruction,
+    tensorflow::gtl::FlatMap<const HloInstruction*, bool>* remat_able) {
+  auto it = remat_able->find(instruction);
+  if (it != remat_able->end()) {
+    return it->second;
+  }
+  bool rematerializable = IsRematerializable(instruction);
+  (*remat_able)[instruction] = rematerializable;
+  return rematerializable;
+}
+
 // Type holding a unique identifier for each Buffer object.
 using BufferId = int64;
 using BufferIdList = tensorflow::gtl::InlinedVector<BufferId, 3>;
@@ -843,9 +857,10 @@ int64 RematerializationCost(const HloInstruction* instruction,
 // candidate which reduce memory use at the program point of the current
 // instruction as indicated by memory_tracker. nullptr is returned if no
 // candidate can be found.
-Item* PickRematerializationCandidate(const MemoryUsageTracker& memory_tracker,
-                                     const InstructionList& instruction_list,
-                                     int64 memory_limit_bytes) {
+Item* PickRematerializationCandidate(
+    const MemoryUsageTracker& memory_tracker,
+    const InstructionList& instruction_list, int64 memory_limit_bytes,
+    tensorflow::gtl::FlatMap<const HloInstruction*, bool>* remat_able) {
   Item* best_item = nullptr;
   int64 best_cost = 0;
 
@@ -869,8 +884,7 @@ Item* PickRematerializationCandidate(const MemoryUsageTracker& memory_tracker,
               << " is excluded from rematerialization";
       continue;
     }
-
-    if (!IsRematerializable(candidate)) {
+    if (!CanBeRematerialized(candidate, remat_able)) {
       VLOG(5) << "candidate " << candidate->name()
               << " not viable: is not rematerializable";
       continue;
@@ -974,6 +988,9 @@ StatusOr<bool> HloRematerialization::RematerializeComputation(
   // blacklist.
   tensorflow::gtl::FlatSet<const HloInstruction*> remat_move_instructions;
 
+  // The map from instructions to their rematerializable status.
+  tensorflow::gtl::FlatMap<const HloInstruction*, bool> remat_able;
+
   // The peak memory of the computation at any point in the instruction
   // sequence.
   int64 peak_memory = memory_tracker.memory_usage();
@@ -1011,7 +1028,7 @@ StatusOr<bool> HloRematerialization::RematerializeComputation(
               << ", limit is " << HumanReadableNumBytes(memory_limit_bytes);
 
       Item* best_item = PickRematerializationCandidate(
-          memory_tracker, instruction_list, memory_limit_bytes);
+          memory_tracker, instruction_list, memory_limit_bytes, &remat_able);
 
       if (best_item == nullptr) {
         VLOG(3) << "Unable to find rematerialization candidate at program "
-- 
GitLab


From 39cb0e4e5d7a1952178af66c74c4c40d44913f55 Mon Sep 17 00:00:00 2001
From: Jianwei Xie <xiejw@google.com>
Date: Wed, 6 Jun 2018 19:22:24 -0700
Subject: [PATCH 411/610] Fix the docstring as it is stale. The initializer has
 no default in _EmbeddingColumnLayer.

PiperOrigin-RevId: 199571833
---
 tensorflow/python/feature_column/feature_column.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/tensorflow/python/feature_column/feature_column.py b/tensorflow/python/feature_column/feature_column.py
index 59801efc26..af2ead9b84 100644
--- a/tensorflow/python/feature_column/feature_column.py
+++ b/tensorflow/python/feature_column/feature_column.py
@@ -1782,9 +1782,7 @@ class _EmbeddingColumnLayer(base.Layer):
     Args:
       embedding_shape: Shape of the embedding variable used for lookup.
       initializer: A variable initializer function to be used in embedding
-        variable initialization. If not specified, defaults to
-        `tf.truncated_normal_initializer` with mean `0.0` and standard deviation
-        `1/sqrt(dimension)`.
+        variable initialization.
       weight_collections: A list of collection names to which the Variable will
         be added. Note that, variables will also be added to collections
         `tf.GraphKeys.GLOBAL_VARIABLES` and `ops.GraphKeys.MODEL_VARIABLES`.
-- 
GitLab


From cd5fa0122bc7b89a461d91e54adfc4fa006a8580 Mon Sep 17 00:00:00 2001
From: Gunhan Gulsoy <gunan@google.com>
Date: Wed, 6 Jun 2018 21:54:03 -0700
Subject: [PATCH 412/610] Disable broken keras_test on guitar.

PiperOrigin-RevId: 199581934
---
 tensorflow/contrib/distribute/python/BUILD | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/contrib/distribute/python/BUILD b/tensorflow/contrib/distribute/python/BUILD
index 19ec2965fb..9624abd199 100644
--- a/tensorflow/contrib/distribute/python/BUILD
+++ b/tensorflow/contrib/distribute/python/BUILD
@@ -587,6 +587,7 @@ cuda_py_test(
     ],
     tags = [
         "multi_and_single_gpu",
+        "noguitar",
         "notsan",
     ],
 )
-- 
GitLab


From a82c2b8a129555df4b958e55f49682f5aeaddf12 Mon Sep 17 00:00:00 2001
From: Gunhan Gulsoy <gunan@google.com>
Date: Wed, 6 Jun 2018 22:01:34 -0700
Subject: [PATCH 413/610] Disable scoped_allocator_test in msan

PiperOrigin-RevId: 199582393
---
 tensorflow/core/grappler/optimizers/BUILD | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tensorflow/core/grappler/optimizers/BUILD b/tensorflow/core/grappler/optimizers/BUILD
index 0e22d4add8..20887bc218 100644
--- a/tensorflow/core/grappler/optimizers/BUILD
+++ b/tensorflow/core/grappler/optimizers/BUILD
@@ -793,6 +793,9 @@ tf_cc_test(
     name = "scoped_allocator_optimizer_test",
     size = "small",
     srcs = ["scoped_allocator_optimizer_test.cc"],
+    tags = [
+        "nomsan",
+    ],
     deps = [
         ":scoped_allocator_optimizer",
         "//tensorflow/cc:cc_ops",
-- 
GitLab


From e4e2708d4a9b15e29ff6e52afe96354b2486e239 Mon Sep 17 00:00:00 2001
From: Gunhan Gulsoy <gunan@google.com>
Date: Wed, 6 Jun 2018 22:03:19 -0700
Subject: [PATCH 414/610] Disabling broken zip_test_conv

PiperOrigin-RevId: 199582571
---
 tensorflow/contrib/lite/build_def.bzl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/lite/build_def.bzl b/tensorflow/contrib/lite/build_def.bzl
index aa6a60dc9e..66d9a0dd44 100644
--- a/tensorflow/contrib/lite/build_def.bzl
+++ b/tensorflow/contrib/lite/build_def.bzl
@@ -201,7 +201,7 @@ def generated_test_models():
         "concat",
         "constant",
         "control_dep",
-        "conv",
+        # "conv",
         "depthwiseconv",
         "div",
         "exp",
-- 
GitLab


From 9aa11542837e8f52d110f6e00d8e0da96e148937 Mon Sep 17 00:00:00 2001
From: Suharsh Sivakumar <suharshs@google.com>
Date: Wed, 6 Jun 2018 23:33:23 -0700
Subject: [PATCH 415/610] ArgMax supports quantization, so make the
 transformation know that.

PiperOrigin-RevId: 199588428
---
 tensorflow/contrib/lite/toco/graph_transformations/quantize.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc b/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc
index 142841fcc4..ab24c4f996 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc
@@ -60,7 +60,7 @@ bool SupportsQuantization(const Operator& op) {
          type == OperatorType::kTensorFlowGreaterEqual ||
          type == OperatorType::kTensorFlowLess ||
          type == OperatorType::kTensorFlowLessEqual ||
-         type == OperatorType::kSelect;
+         type == OperatorType::kSelect || type == OperatorType::kArgMax;
 }
 
 const MinMax& GetOrComputeMinMax(Model* model, const string& array_name) {
-- 
GitLab


From c2368f875b53e9144a1803a3e67c5a61aa9c5862 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 7 Jun 2018 01:20:14 -0700
Subject: [PATCH 416/610] Apply if_override_eigen_strong_inline to three more
 ops

The same slow-compilation problem described in #10521 also affects three
more ops.

Add if_override_eigen_strong_inline to them as well to avoid long compile
times.

PiperOrigin-RevId: 199597421
---
 tensorflow/core/kernels/BUILD | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index c7c7879714..5e4c8a78b0 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -2812,6 +2812,9 @@ tf_kernel_library(
     srcs = [] + if_mkl([
         "mkl_batch_matmul_op.cc",
     ]),
+    # Override EIGEN_STRONG_INLINE to inline when --define=override_eigen_strong_inline=true,
+    # to avoid long compiling time. See https://github.com/tensorflow/tensorflow/issues/10521
+    copts = if_override_eigen_strong_inline(["/DEIGEN_STRONG_INLINE=inline"]),
     prefix = "batch_matmul_op",
     deps = MATH_DEPS + if_mkl([
         "//third_party/mkl:intel_binary_blob",
@@ -2879,6 +2882,9 @@ tf_kernel_library(
         "mkl_matmul_op.cc",
     ]),
     hdrs = ["matmul_op.h"],
+    # Override EIGEN_STRONG_INLINE to inline when --define=override_eigen_strong_inline=true,
+    # to avoid long compiling time. See https://github.com/tensorflow/tensorflow/issues/10521
+    copts = if_override_eigen_strong_inline(["/DEIGEN_STRONG_INLINE=inline"]),
     defines = select({
         ":xsmm": [
             "TENSORFLOW_USE_LIBXSMM",
@@ -3248,8 +3254,7 @@ tf_kernel_library(
         "//conditions:default": [],
     }),
     # Override EIGEN_STRONG_INLINE to inline when --define=override_eigen_strong_inline=true,
-    # So that it doesn't take 20 minutes to compile conv_grad_ops_3d.cc and conv_ops_3d.cc
-    # on Windows. See https://github.com/tensorflow/tensorflow/issues/10521
+    # to avoid long compiling time. See https://github.com/tensorflow/tensorflow/issues/10521
     copts = if_override_eigen_strong_inline(["/DEIGEN_STRONG_INLINE=inline"]),
     defines = select({
         ":xsmm_convolutions": [
@@ -3395,6 +3400,9 @@ tf_kernel_library(
 
 tf_kernel_library(
     name = "lrn_op",
+    # Override EIGEN_STRONG_INLINE to inline when --define=override_eigen_strong_inline=true,
+    # to avoid long compiling time. See https://github.com/tensorflow/tensorflow/issues/10521
+    copts = if_override_eigen_strong_inline(["/DEIGEN_STRONG_INLINE=inline"]),
     prefix = "lrn_op",
     deps = NN_DEPS,
 )
-- 
GitLab


From b079c0388b4393262b652cdbf1a30ed4177238cb Mon Sep 17 00:00:00 2001
From: Karan Kaw <karankaw@hotmail.com>
Date: Thu, 7 Jun 2018 14:20:07 +0530
Subject: [PATCH 417/610] Rephrased content, included dependency download link

---
 tensorflow/docs_src/install/install_java.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/docs_src/install/install_java.md b/tensorflow/docs_src/install/install_java.md
index bbbabb6086..fcc1a85b6b 100644
--- a/tensorflow/docs_src/install/install_java.md
+++ b/tensorflow/docs_src/install/install_java.md
@@ -181,7 +181,7 @@ Take the following steps to install TensorFlow for Java on Windows:
      [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.8.0.zip).
   3. Extract this .zip file.
 
-__Note__: Please ensure that _MS Visual C++ 2015 Redistributable_ package is installed on Windows system as tensorflow JNI library (*tensorflow_jni.dll*) uses them at runtime.
+__Note__: The native library (`tensorflow_jni.dll`) requires `msvcp...dll` at runtime, which is included in the [Visual C++ 2015 Redistributable](https://www.microsoft.com/en-us/download/details.aspx?id=48145) package. 
 
 ### Validate the installation
 
-- 
GitLab


From c70b7128bfb9f0283c60bbec8fd7b0c12f741d95 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 7 Jun 2018 02:05:06 -0700
Subject: [PATCH 418/610] Implementation of TensorFlowEqual and
 TensorFlowNotEqual.

PiperOrigin-RevId: 199602232
---
 tensorflow/contrib/lite/build_def.bzl         |   2 +
 tensorflow/contrib/lite/builtin_ops.h         |   2 +
 .../lite/g3doc/tf_ops_compatibility.md        |  30 +-
 .../contrib/lite/kernels/comparisons.cc       |  66 ++++
 .../contrib/lite/kernels/comparisons_test.cc  | 333 +++++++++++-------
 .../internal/reference/reference_ops.h        |  12 +
 tensorflow/contrib/lite/kernels/register.cc   |   4 +
 tensorflow/contrib/lite/model.cc              |   2 +
 tensorflow/contrib/lite/nnapi_delegate.cc     |   2 +
 tensorflow/contrib/lite/schema/schema.fbs     |  10 +
 .../contrib/lite/schema/schema_generated.h    | 236 ++++++++++++-
 .../contrib/lite/testing/generate_examples.py |  68 ++++
 .../contrib/lite/toco/export_tensorflow.cc    |   4 +
 .../propagate_array_data_types.cc             |   2 +
 .../propagate_fixed_sizes.cc                  |   2 +
 .../contrib/lite/toco/import_tensorflow.cc    |   6 +
 tensorflow/contrib/lite/toco/model.h          |  18 +
 .../contrib/lite/toco/tflite/operator.cc      |   4 +
 .../contrib/lite/toco/tflite/operator_test.cc |   4 +
 tensorflow/contrib/lite/toco/tooling_util.cc  |   2 +
 20 files changed, 666 insertions(+), 143 deletions(-)

diff --git a/tensorflow/contrib/lite/build_def.bzl b/tensorflow/contrib/lite/build_def.bzl
index 66d9a0dd44..13d9a463fb 100644
--- a/tensorflow/contrib/lite/build_def.bzl
+++ b/tensorflow/contrib/lite/build_def.bzl
@@ -204,6 +204,7 @@ def generated_test_models():
         # "conv",
         "depthwiseconv",
         "div",
+        "equal",
         "exp",
         "expand_dims",
         "floor",
@@ -226,6 +227,7 @@ def generated_test_models():
         "minimum",
         "mul",
         "neg",
+        "not_equal",
         "pad",
         "padv2",
         # "prelu",
diff --git a/tensorflow/contrib/lite/builtin_ops.h b/tensorflow/contrib/lite/builtin_ops.h
index fc6fdd6eef..7b10b69f43 100644
--- a/tensorflow/contrib/lite/builtin_ops.h
+++ b/tensorflow/contrib/lite/builtin_ops.h
@@ -96,6 +96,8 @@ typedef enum {
   kTfLiteBuiltinSparseToDense = 68,
   kTfLiteBuiltinTile = 69,
   kTfLiteBuiltinExpandDims = 70,
+  kTfLiteBuiltinEqual = 71,
+  kTfLiteBuiltinNotEqual = 72,
 } TfLiteBuiltinOperator;
 
 #ifdef __cplusplus
diff --git a/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md b/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md
index 27e7d25bf1..19145281fa 100644
--- a/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md
+++ b/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md
@@ -95,11 +95,7 @@ Here is a list of TensorFlow operations that are usually removed from the graph:
 *   [tf.divide](https://www.tensorflow.org/api_docs/python/tf/divide)
 *   [tf.fake_quant_with_min_max_args](https://www.tensorflow.org/api_docs/python/tf/fake_quant_with_min_max_args)
 *   [tf.fake_quant_with_min_max_vars](https://www.tensorflow.org/api_docs/python/tf/fake_quant_with_min_max_vars)
-*   [tf.greater](https://www.tensorflow.org/api_docs/python/tf/greater)
-*   [tf.greater_equal](https://www.tensorflow.org/api_docs/python/tf/greater_equal)
 *   [tf.identity](https://www.tensorflow.org/api_docs/python/tf/identity)
-*   [tf.less](https://www.tensorflow.org/api_docs/python/tf/less)
-*   [tf.less_equal](https://www.tensorflow.org/api_docs/python/tf/less_equal)
 *   [tf.maximum](https://www.tensorflow.org/api_docs/python/tf/maximum)
 *   [tf.minimum](https://www.tensorflow.org/api_docs/python/tf/minimum)
 *   [tf.multiply](https://www.tensorflow.org/api_docs/python/tf/multiply)
@@ -258,6 +254,19 @@ Options {
 }
 ```
 
+**EQUAL**
+
+```
+Inputs {
+  0: a tensor
+  1: a tensor
+}
+Outputs {
+  0: a tensor of type bool, true whenever an element of the first tensor is
+  equal to the corresponding element of the second tensor.
+}
+```
+
 **EXP**
 
 ```
@@ -491,6 +500,19 @@ Options {
 }
 ```
 
+**NOT_EQUAL**
+
+```
+Inputs {
+  0: a tensor
+  1: a tensor
+}
+Outputs {
+  0: a tensor of type bool, true whenever an element of the first tensor is not
+  equal to the corresponding element of the second tensor.
+}
+```
+
 **RELU**
 
 ```
diff --git a/tensorflow/contrib/lite/kernels/comparisons.cc b/tensorflow/contrib/lite/kernels/comparisons.cc
index 3b81062cd4..f678f48fa5 100644
--- a/tensorflow/contrib/lite/kernels/comparisons.cc
+++ b/tensorflow/contrib/lite/kernels/comparisons.cc
@@ -23,6 +23,7 @@ namespace tflite {
 namespace ops {
 namespace builtin {
 namespace comparisons {
+namespace {
 
 constexpr int kInputTensor1 = 0;
 constexpr int kInputTensor2 = 1;
@@ -67,6 +68,57 @@ TfLiteStatus ComparisonPrepare(TfLiteContext* context, TfLiteNode* node) {
             GetTensorData<type>(input2), GetTensorDims(input2), \
             GetTensorData<bool>(output), GetTensorDims(output));
 
+TfLiteStatus EqualEval(TfLiteContext* context, TfLiteNode* node) {
+  const TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
+  const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+  bool requires_broadcast = !HaveSameShapes(input1, input2);
+  // TODO(renjieliu): Support quantized data.
+  switch (input1->type) {
+    case kTfLiteFloat32:
+      TF_LITE_COMPARISON(float, Equal, requires_broadcast);
+      break;
+    case kTfLiteInt32:
+      TF_LITE_COMPARISON(int32_t, Equal, requires_broadcast);
+      break;
+    case kTfLiteInt64:
+      TF_LITE_COMPARISON(int64_t, Equal, requires_broadcast);
+      break;
+    default:
+      context->ReportError(context,
+                           "Does not support type %d, requires float|int",
+                           input1->type);
+      return kTfLiteError;
+  }
+  return kTfLiteOk;
+}
+
+// TODO(renjieliu): Refactor the logic to avoid duplications.
+TfLiteStatus NotEqualEval(TfLiteContext* context, TfLiteNode* node) {
+  const TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
+  const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+  bool requires_broadcast = !HaveSameShapes(input1, input2);
+  // TODO(renjieliu): Support quantized data.
+  switch (input1->type) {
+    case kTfLiteFloat32:
+      TF_LITE_COMPARISON(float, NotEqual, requires_broadcast);
+      break;
+    case kTfLiteInt32:
+      TF_LITE_COMPARISON(int32_t, NotEqual, requires_broadcast);
+      break;
+    case kTfLiteInt64:
+      TF_LITE_COMPARISON(int64_t, NotEqual, requires_broadcast);
+      break;
+    default:
+      context->ReportError(context,
+                           "Does not support type %d, requires float|int",
+                           input1->type);
+      return kTfLiteError;
+  }
+  return kTfLiteOk;
+}
+
 TfLiteStatus GreaterEval(TfLiteContext* context, TfLiteNode* node) {
   const TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
   const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
@@ -167,8 +219,22 @@ TfLiteStatus LessEqualEval(TfLiteContext* context, TfLiteNode* node) {
   return kTfLiteOk;
 }
 
+}  // namespace
 }  // namespace comparisons
 
+TfLiteRegistration* Register_EQUAL() {
+  static TfLiteRegistration r = {
+      nullptr, nullptr, comparisons::ComparisonPrepare, comparisons::EqualEval};
+  return &r;
+}
+
+TfLiteRegistration* Register_NOT_EQUAL() {
+  static TfLiteRegistration r = {nullptr, nullptr,
+                                 comparisons::ComparisonPrepare,
+                                 comparisons::NotEqualEval};
+  return &r;
+}
+
 TfLiteRegistration* Register_GREATER() {
   static TfLiteRegistration r = {nullptr, nullptr,
                                  comparisons::ComparisonPrepare,
diff --git a/tensorflow/contrib/lite/kernels/comparisons_test.cc b/tensorflow/contrib/lite/kernels/comparisons_test.cc
index 835d238d36..bb02e1c812 100644
--- a/tensorflow/contrib/lite/kernels/comparisons_test.cc
+++ b/tensorflow/contrib/lite/kernels/comparisons_test.cc
@@ -21,18 +21,17 @@ limitations under the License.
 namespace tflite {
 namespace {
 
-using ::testing::ElementsAreArray;
+using ::testing::ElementsAre;
 
-class GreaterOpModel : public SingleOpModel {
+class ComparisonOpModel : public SingleOpModel {
  public:
-  GreaterOpModel(std::initializer_list<int> input1_shape,
-                 std::initializer_list<int> input2_shape,
-                 TensorType input_type) {
+  ComparisonOpModel(std::initializer_list<int> input1_shape,
+                    std::initializer_list<int> input2_shape,
+                    TensorType input_type, BuiltinOperator op) {
     input1_ = AddInput(input_type);
     input2_ = AddInput(input_type);
     output_ = AddOutput(TensorType_BOOL);
-    SetBuiltinOp(BuiltinOperator_GREATER, BuiltinOptions_GreaterOptions,
-                 CreateGreaterOptions(builder_).Union());
+    ConfigureBuiltinOp(op);
     BuildInterpreter({input1_shape, input2_shape});
   }
 
@@ -46,245 +45,313 @@ class GreaterOpModel : public SingleOpModel {
   int input1_;
   int input2_;
   int output_;
+
+  void ConfigureBuiltinOp(BuiltinOperator op) {
+    switch (op) {
+      case BuiltinOperator_EQUAL: {
+        SetBuiltinOp(op, BuiltinOptions_EqualOptions,
+                     CreateEqualOptions(builder_).Union());
+        break;
+      }
+      case BuiltinOperator_NOT_EQUAL: {
+        SetBuiltinOp(op, BuiltinOptions_NotEqualOptions,
+                     CreateNotEqualOptions(builder_).Union());
+        break;
+      }
+      case BuiltinOperator_GREATER: {
+        SetBuiltinOp(op, BuiltinOptions_GreaterOptions,
+                     CreateGreaterOptions(builder_).Union());
+        break;
+      }
+      case BuiltinOperator_GREATER_EQUAL: {
+        SetBuiltinOp(op, BuiltinOptions_GreaterEqualOptions,
+                     CreateGreaterEqualOptions(builder_).Union());
+        break;
+      }
+      case BuiltinOperator_LESS: {
+        SetBuiltinOp(op, BuiltinOptions_LessOptions,
+                     CreateLessOptions(builder_).Union());
+        break;
+      }
+      case BuiltinOperator_LESS_EQUAL: {
+        SetBuiltinOp(op, BuiltinOptions_LessEqualOptions,
+                     CreateLessEqualOptions(builder_).Union());
+        break;
+      }
+      default: { FAIL() << "We shouldn't get here."; }
+    }
+  }
 };
 
-TEST(ComparisonsTest, GreaterFloat) {
-  GreaterOpModel model({1, 1, 1, 4}, {1, 1, 1, 4}, TensorType_FLOAT32);
+TEST(ComparisonsTest, EqualFloat) {
+  ComparisonOpModel model({1, 1, 1, 4}, {1, 1, 1, 4}, TensorType_FLOAT32,
+                          BuiltinOperator_EQUAL);
   model.PopulateTensor<float>(model.input1(), {0.1, 0.9, 0.7, 0.3});
   model.PopulateTensor<float>(model.input2(), {0.1, 0.2, 0.6, 0.5});
   model.Invoke();
 
-  EXPECT_THAT(model.GetOutput(), ElementsAreArray({false, true, true, false}));
-  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 4}));
+  EXPECT_THAT(model.GetOutput(), ElementsAre(true, false, false, false));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 1, 1, 4));
 }
 
-TEST(ComparisonsTest, GreaterInt) {
-  GreaterOpModel model({1, 1, 1, 4}, {1, 1, 1, 4}, TensorType_INT32);
+TEST(ComparisonsTest, EqualInt) {
+  ComparisonOpModel model({1, 1, 1, 4}, {1, 1, 1, 4}, TensorType_INT32,
+                          BuiltinOperator_EQUAL);
   model.PopulateTensor<int>(model.input1(), {-1, 9, 7, 3});
   model.PopulateTensor<int>(model.input2(), {1, 2, 7, 5});
   model.Invoke();
 
-  EXPECT_THAT(model.GetOutput(), ElementsAreArray({false, true, false, false}));
-  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 4}));
+  EXPECT_THAT(model.GetOutput(), ElementsAre(false, false, true, false));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 1, 1, 4));
 }
 
-TEST(ComparisonsTest, GreaterBroadcast) {
-  GreaterOpModel model({1, 1, 1, 4}, {1, 1, 1, 1}, TensorType_INT32);
+TEST(ComparisonsTest, EqualBroadcast) {
+  ComparisonOpModel model({1, 1, 1, 4}, {1, 1, 1, 1}, TensorType_INT32,
+                          BuiltinOperator_EQUAL);
   model.PopulateTensor<int>(model.input1(), {-1, 9, 7, 3});
   model.PopulateTensor<int>(model.input2(), {7});
   model.Invoke();
 
-  EXPECT_THAT(model.GetOutput(), ElementsAreArray({false, true, false, false}));
-  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 4}));
+  EXPECT_THAT(model.GetOutput(), ElementsAre(false, false, true, false));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 1, 1, 4));
 }
 
-TEST(ComparisonsTest, GreaterBroadcastTwoD) {
-  GreaterOpModel model({1, 1, 2, 4}, {1, 1, 1, 4}, TensorType_INT32);
+TEST(ComparisonsTest, EqualBroadcastTwoD) {
+  ComparisonOpModel model({1, 1, 2, 4}, {1, 1, 1, 4}, TensorType_INT32,
+                          BuiltinOperator_EQUAL);
   model.PopulateTensor<int>(model.input1(), {-1, 9, 7, 3, 2, 4, 2, 8});
   model.PopulateTensor<int>(model.input2(), {7, 1, 2, 4});
   model.Invoke();
 
-  EXPECT_THAT(model.GetOutput(), ElementsAreArray({false, true, true, false,
-                                                   false, true, false, true}));
-  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 2, 4}));
+  EXPECT_THAT(model.GetOutput(), ElementsAre(false, false, false, false, false,
+                                             false, true, false));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 1, 2, 4));
 }
 
-class GreaterEqualOpModel : public SingleOpModel {
- public:
-  GreaterEqualOpModel(std::initializer_list<int> input1_shape,
-                      std::initializer_list<int> input2_shape,
-                      TensorType input_type) {
-    input1_ = AddInput(input_type);
-    input2_ = AddInput(input_type);
-    output_ = AddOutput(TensorType_BOOL);
-    SetBuiltinOp(BuiltinOperator_GREATER_EQUAL,
-                 BuiltinOptions_GreaterEqualOptions,
-                 CreateGreaterEqualOptions(builder_).Union());
-    BuildInterpreter({input1_shape, input2_shape});
-  }
+TEST(ComparisonsTest, NotEqualFloat) {
+  ComparisonOpModel model({1, 1, 1, 4}, {1, 1, 1, 4}, TensorType_FLOAT32,
+                          BuiltinOperator_NOT_EQUAL);
+  model.PopulateTensor<float>(model.input1(), {0.1, 0.9, 0.7, 0.3});
+  model.PopulateTensor<float>(model.input2(), {0.1, 0.2, 0.6, 0.5});
+  model.Invoke();
 
-  int input1() { return input1_; }
-  int input2() { return input2_; }
+  EXPECT_THAT(model.GetOutput(), ElementsAre(false, true, true, true));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 1, 1, 4));
+}
 
-  std::vector<bool> GetOutput() { return ExtractVector<bool>(output_); }
-  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
+TEST(ComparisonsTest, NotEqualInt) {
+  ComparisonOpModel model({1, 1, 1, 4}, {1, 1, 1, 4}, TensorType_INT32,
+                          BuiltinOperator_NOT_EQUAL);
+  model.PopulateTensor<int>(model.input1(), {-1, 9, 7, 3});
+  model.PopulateTensor<int>(model.input2(), {1, 2, 7, 5});
+  model.Invoke();
 
- private:
-  int input1_;
-  int input2_;
-  int output_;
-};
+  EXPECT_THAT(model.GetOutput(), ElementsAre(true, true, false, true));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 1, 1, 4));
+}
+
+TEST(ComparisonsTest, NotEqualBroadcast) {
+  ComparisonOpModel model({1, 1, 1, 4}, {1, 1, 1, 1}, TensorType_INT32,
+                          BuiltinOperator_NOT_EQUAL);
+  model.PopulateTensor<int>(model.input1(), {-1, 9, 7, 3});
+  model.PopulateTensor<int>(model.input2(), {7});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput(), ElementsAre(true, true, false, true));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 1, 1, 4));
+}
+
+TEST(ComparisonsTest, NotEqualBroadcastTwoD) {
+  ComparisonOpModel model({1, 1, 2, 4}, {1, 1, 1, 4}, TensorType_INT32,
+                          BuiltinOperator_NOT_EQUAL);
+  model.PopulateTensor<int>(model.input1(), {-1, 9, 7, 3, 2, 4, 2, 8});
+  model.PopulateTensor<int>(model.input2(), {7, 1, 2, 4});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput(),
+              ElementsAre(true, true, true, true, true, true, false, true));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 1, 2, 4));
+}
+
+TEST(ComparisonsTest, GreaterFloat) {
+  ComparisonOpModel model({1, 1, 1, 4}, {1, 1, 1, 4}, TensorType_FLOAT32,
+                          BuiltinOperator_GREATER);
+  model.PopulateTensor<float>(model.input1(), {0.1, 0.9, 0.7, 0.3});
+  model.PopulateTensor<float>(model.input2(), {0.1, 0.2, 0.6, 0.5});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput(), ElementsAre(false, true, true, false));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 1, 1, 4));
+}
+
+TEST(ComparisonsTest, GreaterInt) {
+  ComparisonOpModel model({1, 1, 1, 4}, {1, 1, 1, 4}, TensorType_INT32,
+                          BuiltinOperator_GREATER);
+  model.PopulateTensor<int>(model.input1(), {-1, 9, 7, 3});
+  model.PopulateTensor<int>(model.input2(), {1, 2, 7, 5});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput(), ElementsAre(false, true, false, false));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 1, 1, 4));
+}
+
+TEST(ComparisonsTest, GreaterBroadcast) {
+  ComparisonOpModel model({1, 1, 1, 4}, {1, 1, 1, 1}, TensorType_INT32,
+                          BuiltinOperator_GREATER);
+  model.PopulateTensor<int>(model.input1(), {-1, 9, 7, 3});
+  model.PopulateTensor<int>(model.input2(), {7});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput(), ElementsAre(false, true, false, false));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 1, 1, 4));
+}
+
+TEST(ComparisonsTest, GreaterBroadcastTwoD) {
+  ComparisonOpModel model({1, 1, 2, 4}, {1, 1, 1, 4}, TensorType_INT32,
+                          BuiltinOperator_GREATER);
+  model.PopulateTensor<int>(model.input1(), {-1, 9, 7, 3, 2, 4, 2, 8});
+  model.PopulateTensor<int>(model.input2(), {7, 1, 2, 4});
+  model.Invoke();
+
+  EXPECT_THAT(model.GetOutput(),
+              ElementsAre(false, true, true, false, false, true, false, true));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 1, 2, 4));
+}
 
 TEST(ComparisonsTest, GreaterEqualFloat) {
-  GreaterEqualOpModel model({1, 1, 1, 4}, {1, 1, 1, 4}, TensorType_FLOAT32);
+  ComparisonOpModel model({1, 1, 1, 4}, {1, 1, 1, 4}, TensorType_FLOAT32,
+                          BuiltinOperator_GREATER_EQUAL);
   model.PopulateTensor<float>(model.input1(), {0.1, 0.9, 0.7, 0.3});
   model.PopulateTensor<float>(model.input2(), {0.1, 0.2, 0.6, 0.5});
   model.Invoke();
 
-  EXPECT_THAT(model.GetOutput(), ElementsAreArray({true, true, true, false}));
-  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 4}));
+  EXPECT_THAT(model.GetOutput(), ElementsAre(true, true, true, false));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 1, 1, 4));
 }
 
 TEST(ComparisonsTest, GreaterEqualInt) {
-  GreaterEqualOpModel model({1, 1, 1, 4}, {1, 1, 1, 4}, TensorType_INT32);
+  ComparisonOpModel model({1, 1, 1, 4}, {1, 1, 1, 4}, TensorType_INT32,
+                          BuiltinOperator_GREATER_EQUAL);
   model.PopulateTensor<int>(model.input1(), {-1, 9, 7, 3});
   model.PopulateTensor<int>(model.input2(), {1, 2, 7, 5});
   model.Invoke();
 
-  EXPECT_THAT(model.GetOutput(), ElementsAreArray({false, true, true, false}));
-  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 4}));
+  EXPECT_THAT(model.GetOutput(), ElementsAre(false, true, true, false));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 1, 1, 4));
 }
 
 TEST(ComparisonsTest, GreaterEqualBroadcast) {
-  GreaterEqualOpModel model({1, 1, 1, 4}, {1, 1, 1, 1}, TensorType_INT32);
+  ComparisonOpModel model({1, 1, 1, 4}, {1, 1, 1, 1}, TensorType_INT32,
+                          BuiltinOperator_GREATER_EQUAL);
   model.PopulateTensor<int>(model.input1(), {-1, 9, 7, 3});
   model.PopulateTensor<int>(model.input2(), {7});
   model.Invoke();
 
-  EXPECT_THAT(model.GetOutput(), ElementsAreArray({false, true, true, false}));
-  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 4}));
+  EXPECT_THAT(model.GetOutput(), ElementsAre(false, true, true, false));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 1, 1, 4));
 }
 
 TEST(ComparisonsTest, GreaterEqualBroadcastTwoD) {
-  GreaterEqualOpModel model({1, 1, 2, 4}, {1, 1, 1, 4}, TensorType_INT32);
+  ComparisonOpModel model({1, 1, 2, 4}, {1, 1, 1, 4}, TensorType_INT32,
+                          BuiltinOperator_GREATER_EQUAL);
   model.PopulateTensor<int>(model.input1(), {-1, 9, 7, 3, 2, 4, 2, 8});
   model.PopulateTensor<int>(model.input2(), {7, 1, 2, 4});
   model.Invoke();
 
-  EXPECT_THAT(model.GetOutput(), ElementsAreArray({false, true, true, false,
-                                                   false, true, true, true}));
-  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 2, 4}));
+  EXPECT_THAT(model.GetOutput(),
+              ElementsAre(false, true, true, false, false, true, true, true));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 1, 2, 4));
 }
 
-class LessOpModel : public SingleOpModel {
- public:
-  LessOpModel(std::initializer_list<int> input1_shape,
-              std::initializer_list<int> input2_shape, TensorType input_type) {
-    input1_ = AddInput(input_type);
-    input2_ = AddInput(input_type);
-    output_ = AddOutput(TensorType_BOOL);
-    SetBuiltinOp(BuiltinOperator_LESS, BuiltinOptions_LessOptions,
-                 CreateLessOptions(builder_).Union());
-    BuildInterpreter({input1_shape, input2_shape});
-  }
-
-  int input1() { return input1_; }
-  int input2() { return input2_; }
-
-  std::vector<bool> GetOutput() { return ExtractVector<bool>(output_); }
-  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
-
- private:
-  int input1_;
-  int input2_;
-  int output_;
-};
 
 TEST(ComparisonsTest, LessFloat) {
-  LessOpModel model({1, 1, 1, 4}, {1, 1, 1, 4}, TensorType_FLOAT32);
+  ComparisonOpModel model({1, 1, 1, 4}, {1, 1, 1, 4}, TensorType_FLOAT32,
+                          BuiltinOperator_LESS);
   model.PopulateTensor<float>(model.input1(), {0.1, 0.9, 0.7, 0.3});
   model.PopulateTensor<float>(model.input2(), {0.1, 0.2, 0.6, 0.5});
   model.Invoke();
 
-  EXPECT_THAT(model.GetOutput(), ElementsAreArray({false, false, false, true}));
-  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 4}));
+  EXPECT_THAT(model.GetOutput(), ElementsAre(false, false, false, true));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 1, 1, 4));
 }
 
 TEST(ComparisonsTest, LessInt) {
-  LessOpModel model({1, 1, 1, 4}, {1, 1, 1, 4}, TensorType_INT32);
+  ComparisonOpModel model({1, 1, 1, 4}, {1, 1, 1, 4}, TensorType_INT32,
+                          BuiltinOperator_LESS);
   model.PopulateTensor<int>(model.input1(), {-1, 9, 7, 3});
   model.PopulateTensor<int>(model.input2(), {1, 2, 6, 5});
   model.Invoke();
 
-  EXPECT_THAT(model.GetOutput(), ElementsAreArray({true, false, false, true}));
-  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 4}));
+  EXPECT_THAT(model.GetOutput(), ElementsAre(true, false, false, true));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 1, 1, 4));
 }
 
 TEST(ComparisonsTest, LessBroadcast) {
-  LessOpModel model({1, 1, 1, 4}, {1, 1, 1, 1}, TensorType_INT32);
+  ComparisonOpModel model({1, 1, 1, 4}, {1, 1, 1, 1}, TensorType_INT32,
+                          BuiltinOperator_LESS);
   model.PopulateTensor<int>(model.input1(), {-1, 9, 7, 3});
   model.PopulateTensor<int>(model.input2(), {7});
   model.Invoke();
 
-  EXPECT_THAT(model.GetOutput(), ElementsAreArray({true, false, false, true}));
-  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 4}));
+  EXPECT_THAT(model.GetOutput(), ElementsAre(true, false, false, true));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 1, 1, 4));
 }
 
 TEST(ComparisonsTest, LessBroadcastTwoD) {
-  LessOpModel model({1, 1, 2, 4}, {1, 1, 1, 4}, TensorType_INT32);
+  ComparisonOpModel model({1, 1, 2, 4}, {1, 1, 1, 4}, TensorType_INT32,
+                          BuiltinOperator_LESS);
   model.PopulateTensor<int>(model.input1(), {-1, 9, 7, 3, 2, 4, 6, 8});
   model.PopulateTensor<int>(model.input2(), {7, 1, 2, 4});
   model.Invoke();
 
-  EXPECT_THAT(model.GetOutput(), ElementsAreArray({true, false, false, true,
-                                                   true, false, false, false}));
-  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 2, 4}));
+  EXPECT_THAT(model.GetOutput(),
+              ElementsAre(true, false, false, true, true, false, false, false));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 1, 2, 4));
 }
 
-class LessEqualOpModel : public SingleOpModel {
- public:
-  LessEqualOpModel(std::initializer_list<int> input1_shape,
-                   std::initializer_list<int> input2_shape,
-                   TensorType input_type) {
-    input1_ = AddInput(input_type);
-    input2_ = AddInput(input_type);
-    output_ = AddOutput(TensorType_BOOL);
-    SetBuiltinOp(BuiltinOperator_LESS_EQUAL, BuiltinOptions_LessEqualOptions,
-                 CreateLessEqualOptions(builder_).Union());
-    BuildInterpreter({input1_shape, input2_shape});
-  }
-
-  int input1() { return input1_; }
-  int input2() { return input2_; }
-
-  std::vector<bool> GetOutput() { return ExtractVector<bool>(output_); }
-  std::vector<int> GetOutputShape() { return GetTensorShape(output_); }
-
- private:
-  int input1_;
-  int input2_;
-  int output_;
-};
-
 TEST(ComparisonsTest, LessEqualFloat) {
-  LessEqualOpModel model({1, 1, 1, 4}, {1, 1, 1, 4}, TensorType_FLOAT32);
+  ComparisonOpModel model({1, 1, 1, 4}, {1, 1, 1, 4}, TensorType_FLOAT32,
+                          BuiltinOperator_LESS_EQUAL);
   model.PopulateTensor<float>(model.input1(), {0.1, 0.9, 0.7, 0.3});
   model.PopulateTensor<float>(model.input2(), {0.1, 0.2, 0.6, 0.5});
   model.Invoke();
 
-  EXPECT_THAT(model.GetOutput(), ElementsAreArray({true, false, false, true}));
-  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 4}));
+  EXPECT_THAT(model.GetOutput(), ElementsAre(true, false, false, true));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 1, 1, 4));
 }
 
 TEST(ComparisonsTest, LessEqualInt) {
-  LessEqualOpModel model({1, 1, 1, 4}, {1, 1, 1, 4}, TensorType_INT32);
+  ComparisonOpModel model({1, 1, 1, 4}, {1, 1, 1, 4}, TensorType_INT32,
+                          BuiltinOperator_LESS_EQUAL);
   model.PopulateTensor<int>(model.input1(), {-1, 9, 7, 3});
   model.PopulateTensor<int>(model.input2(), {1, 2, 7, 5});
   model.Invoke();
 
-  EXPECT_THAT(model.GetOutput(), ElementsAreArray({true, false, true, true}));
-  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 4}));
+  EXPECT_THAT(model.GetOutput(), ElementsAre(true, false, true, true));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 1, 1, 4));
 }
 
 TEST(ComparisonsTest, LessEqualBroadcast) {
-  LessEqualOpModel model({1, 1, 1, 4}, {1, 1, 1, 1}, TensorType_INT32);
+  ComparisonOpModel model({1, 1, 1, 4}, {1, 1, 1, 1}, TensorType_INT32,
+                          BuiltinOperator_LESS_EQUAL);
   model.PopulateTensor<int>(model.input1(), {-1, 9, 7, 3});
   model.PopulateTensor<int>(model.input2(), {7});
   model.Invoke();
 
-  EXPECT_THAT(model.GetOutput(), ElementsAreArray({true, false, true, true}));
-  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 1, 4}));
+  EXPECT_THAT(model.GetOutput(), ElementsAre(true, false, true, true));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 1, 1, 4));
 }
 
 TEST(ComparisonsTest, LessEqualBroadcastTwoD) {
-  LessEqualOpModel model({1, 1, 2, 4}, {1, 1, 1, 4}, TensorType_INT32);
+  ComparisonOpModel model({1, 1, 2, 4}, {1, 1, 1, 4}, TensorType_INT32,
+                          BuiltinOperator_LESS_EQUAL);
   model.PopulateTensor<int>(model.input1(), {-1, 9, 7, 3, 2, 4, 2, 8});
   model.PopulateTensor<int>(model.input2(), {7, 1, 2, 4});
   model.Invoke();
 
-  EXPECT_THAT(model.GetOutput(), ElementsAreArray({true, false, false, true,
-                                                   true, false, true, false}));
-  EXPECT_THAT(model.GetOutputShape(), ElementsAreArray({1, 1, 2, 4}));
+  EXPECT_THAT(model.GetOutput(),
+              ElementsAre(true, false, false, true, true, false, true, false));
+  EXPECT_THAT(model.GetOutputShape(), ElementsAre(1, 1, 2, 4));
 }
 
 }  // namespace
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
index ca5a20ad4f..0b644a1fa6 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
@@ -3865,6 +3865,16 @@ inline void TransposeConv(const float* input_data, const Dims<4>& input_dims,
   }
 }
 
+template <typename T>
+inline bool EqualFn(T lhs, T rhs) {
+  return lhs == rhs;
+}
+
+template <typename T>
+inline bool NotEqualFn(T lhs, T rhs) {
+  return lhs != rhs;
+}
+
 template <typename T>
 inline bool GreaterFn(T lhs, T rhs) {
   return lhs > rhs;
@@ -4028,6 +4038,8 @@ inline void BroadcastComparison(int left_shift, const T* input1_data,
                                      input2_offset, input2_multiplier,        \
                                      input2_shift, output_data, output_dims); \
   }
+TFLITE_COMPARISON_OP(Equal);
+TFLITE_COMPARISON_OP(NotEqual);
 TFLITE_COMPARISON_OP(Greater);
 TFLITE_COMPARISON_OP(GreaterEqual);
 TFLITE_COMPARISON_OP(Less);
diff --git a/tensorflow/contrib/lite/kernels/register.cc b/tensorflow/contrib/lite/kernels/register.cc
index 184b02dcec..6c68bb2f31 100644
--- a/tensorflow/contrib/lite/kernels/register.cc
+++ b/tensorflow/contrib/lite/kernels/register.cc
@@ -93,6 +93,8 @@ TfLiteRegistration* Register_SIN();
 TfLiteRegistration* Register_TRANSPOSE_CONV();
 TfLiteRegistration* Register_EXPAND_DIMS();
 TfLiteRegistration* Register_SPARSE_TO_DENSE();
+TfLiteRegistration* Register_EQUAL();
+TfLiteRegistration* Register_NOT_EQUAL();
 
 BuiltinOpResolver::BuiltinOpResolver() {
   AddBuiltin(BuiltinOperator_RELU, Register_RELU());
@@ -168,6 +170,8 @@ BuiltinOpResolver::BuiltinOpResolver() {
   AddBuiltin(BuiltinOperator_TILE, Register_TILE());
   AddBuiltin(BuiltinOperator_EXPAND_DIMS, Register_EXPAND_DIMS());
   AddBuiltin(BuiltinOperator_SPARSE_TO_DENSE, Register_SPARSE_TO_DENSE());
+  AddBuiltin(BuiltinOperator_EQUAL, Register_EQUAL());
+  AddBuiltin(BuiltinOperator_NOT_EQUAL, Register_NOT_EQUAL());
 
   // TODO(andrewharp, ahentz): Move these somewhere more appropriate so that
   // custom ops aren't always included by default.
diff --git a/tensorflow/contrib/lite/model.cc b/tensorflow/contrib/lite/model.cc
index 8d8d74adfb..d78b6eae90 100644
--- a/tensorflow/contrib/lite/model.cc
+++ b/tensorflow/contrib/lite/model.cc
@@ -689,6 +689,8 @@ TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type,
     case BuiltinOperator_GREATER_EQUAL:
     case BuiltinOperator_LESS:
     case BuiltinOperator_LESS_EQUAL:
+    case BuiltinOperator_EQUAL:
+    case BuiltinOperator_NOT_EQUAL:
     case BuiltinOperator_SELECT: {
       break;
     }
diff --git a/tensorflow/contrib/lite/nnapi_delegate.cc b/tensorflow/contrib/lite/nnapi_delegate.cc
index d27ab0c033..605ce7d6fc 100644
--- a/tensorflow/contrib/lite/nnapi_delegate.cc
+++ b/tensorflow/contrib/lite/nnapi_delegate.cc
@@ -494,6 +494,8 @@ void AddOpsAndParams(tflite::Interpreter* interpreter,
       case tflite::BuiltinOperator_TILE:
       case tflite::BuiltinOperator_EXPAND_DIMS:
       case tflite::BuiltinOperator_SPARSE_TO_DENSE:
+      case tflite::BuiltinOperator_EQUAL:
+      case tflite::BuiltinOperator_NOT_EQUAL:
         FATAL("Op code %d is currently not delegated to NNAPI", builtin);
         nn_op_type = -1;  // set to invalid
         break;
diff --git a/tensorflow/contrib/lite/schema/schema.fbs b/tensorflow/contrib/lite/schema/schema.fbs
index 7dbb36c864..d12a96df1c 100644
--- a/tensorflow/contrib/lite/schema/schema.fbs
+++ b/tensorflow/contrib/lite/schema/schema.fbs
@@ -148,6 +148,8 @@ enum BuiltinOperator : byte {
   SPARSE_TO_DENSE = 68,
   TILE = 69,
   EXPAND_DIMS = 70,
+  EQUAL = 71,
+  NOT_EQUAL = 72,
 }
 
 // Options for the builtin operators.
@@ -204,6 +206,8 @@ union BuiltinOptions {
   SparseToDenseOptions,
   TileOptions,
   ExpandDimsOptions,
+  EqualOptions,
+  NotEqualOptions,
 }
 
 enum Padding : byte { SAME, VALID }
@@ -478,6 +482,12 @@ table SparseToDenseOptions {
   validate_indices:bool;
 }
 
+table EqualOptions {
+}
+
+table NotEqualOptions {
+}
+
 // An OperatorCode can be an enum value (BuiltinOperator) if the operator is a
 // builtin, or a string if the operator is custom.
 table OperatorCode {
diff --git a/tensorflow/contrib/lite/schema/schema_generated.h b/tensorflow/contrib/lite/schema/schema_generated.h
index b1beb39b28..8ddd2f1438 100755
--- a/tensorflow/contrib/lite/schema/schema_generated.h
+++ b/tensorflow/contrib/lite/schema/schema_generated.h
@@ -187,6 +187,12 @@ struct ExpandDimsOptionsT;
 struct SparseToDenseOptions;
 struct SparseToDenseOptionsT;
 
+struct EqualOptions;
+struct EqualOptionsT;
+
+struct NotEqualOptions;
+struct NotEqualOptionsT;
+
 struct OperatorCode;
 struct OperatorCodeT;
 
@@ -317,11 +323,13 @@ enum BuiltinOperator {
   BuiltinOperator_SPARSE_TO_DENSE = 68,
   BuiltinOperator_TILE = 69,
   BuiltinOperator_EXPAND_DIMS = 70,
+  BuiltinOperator_EQUAL = 71,
+  BuiltinOperator_NOT_EQUAL = 72,
   BuiltinOperator_MIN = BuiltinOperator_ADD,
-  BuiltinOperator_MAX = BuiltinOperator_EXPAND_DIMS
+  BuiltinOperator_MAX = BuiltinOperator_NOT_EQUAL
 };
 
-inline BuiltinOperator (&EnumValuesBuiltinOperator())[70] {
+inline BuiltinOperator (&EnumValuesBuiltinOperator())[72] {
   static BuiltinOperator values[] = {
     BuiltinOperator_ADD,
     BuiltinOperator_AVERAGE_POOL_2D,
@@ -392,7 +400,9 @@ inline BuiltinOperator (&EnumValuesBuiltinOperator())[70] {
     BuiltinOperator_TRANSPOSE_CONV,
     BuiltinOperator_SPARSE_TO_DENSE,
     BuiltinOperator_TILE,
-    BuiltinOperator_EXPAND_DIMS
+    BuiltinOperator_EXPAND_DIMS,
+    BuiltinOperator_EQUAL,
+    BuiltinOperator_NOT_EQUAL
   };
   return values;
 }
@@ -470,6 +480,8 @@ inline const char **EnumNamesBuiltinOperator() {
     "SPARSE_TO_DENSE",
     "TILE",
     "EXPAND_DIMS",
+    "EQUAL",
+    "NOT_EQUAL",
     nullptr
   };
   return names;
@@ -534,11 +546,13 @@ enum BuiltinOptions {
   BuiltinOptions_SparseToDenseOptions = 50,
   BuiltinOptions_TileOptions = 51,
   BuiltinOptions_ExpandDimsOptions = 52,
+  BuiltinOptions_EqualOptions = 53,
+  BuiltinOptions_NotEqualOptions = 54,
   BuiltinOptions_MIN = BuiltinOptions_NONE,
-  BuiltinOptions_MAX = BuiltinOptions_ExpandDimsOptions
+  BuiltinOptions_MAX = BuiltinOptions_NotEqualOptions
 };
 
-inline BuiltinOptions (&EnumValuesBuiltinOptions())[53] {
+inline BuiltinOptions (&EnumValuesBuiltinOptions())[55] {
   static BuiltinOptions values[] = {
     BuiltinOptions_NONE,
     BuiltinOptions_Conv2DOptions,
@@ -592,7 +606,9 @@ inline BuiltinOptions (&EnumValuesBuiltinOptions())[53] {
     BuiltinOptions_TransposeConvOptions,
     BuiltinOptions_SparseToDenseOptions,
     BuiltinOptions_TileOptions,
-    BuiltinOptions_ExpandDimsOptions
+    BuiltinOptions_ExpandDimsOptions,
+    BuiltinOptions_EqualOptions,
+    BuiltinOptions_NotEqualOptions
   };
   return values;
 }
@@ -652,6 +668,8 @@ inline const char **EnumNamesBuiltinOptions() {
     "SparseToDenseOptions",
     "TileOptions",
     "ExpandDimsOptions",
+    "EqualOptions",
+    "NotEqualOptions",
     nullptr
   };
   return names;
@@ -874,6 +892,14 @@ template<> struct BuiltinOptionsTraits<ExpandDimsOptions> {
   static const BuiltinOptions enum_value = BuiltinOptions_ExpandDimsOptions;
 };
 
+template<> struct BuiltinOptionsTraits<EqualOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_EqualOptions;
+};
+
+template<> struct BuiltinOptionsTraits<NotEqualOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_NotEqualOptions;
+};
+
 struct BuiltinOptionsUnion {
   BuiltinOptions type;
   void *value;
@@ -1321,6 +1347,22 @@ struct BuiltinOptionsUnion {
     return type == BuiltinOptions_ExpandDimsOptions ?
       reinterpret_cast<const ExpandDimsOptionsT *>(value) : nullptr;
   }
+  EqualOptionsT *AsEqualOptions() {
+    return type == BuiltinOptions_EqualOptions ?
+      reinterpret_cast<EqualOptionsT *>(value) : nullptr;
+  }
+  const EqualOptionsT *AsEqualOptions() const {
+    return type == BuiltinOptions_EqualOptions ?
+      reinterpret_cast<const EqualOptionsT *>(value) : nullptr;
+  }
+  NotEqualOptionsT *AsNotEqualOptions() {
+    return type == BuiltinOptions_NotEqualOptions ?
+      reinterpret_cast<NotEqualOptionsT *>(value) : nullptr;
+  }
+  const NotEqualOptionsT *AsNotEqualOptions() const {
+    return type == BuiltinOptions_NotEqualOptions ?
+      reinterpret_cast<const NotEqualOptionsT *>(value) : nullptr;
+  }
 };
 
 bool VerifyBuiltinOptions(flatbuffers::Verifier &verifier, const void *obj, BuiltinOptions type);
@@ -4781,6 +4823,86 @@ inline flatbuffers::Offset<SparseToDenseOptions> CreateSparseToDenseOptions(
 
 flatbuffers::Offset<SparseToDenseOptions> CreateSparseToDenseOptions(flatbuffers::FlatBufferBuilder &_fbb, const SparseToDenseOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
+struct EqualOptionsT : public flatbuffers::NativeTable {
+  typedef EqualOptions TableType;
+  EqualOptionsT() {
+  }
+};
+
+struct EqualOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef EqualOptionsT NativeTableType;
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           verifier.EndTable();
+  }
+  EqualOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(EqualOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<EqualOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const EqualOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct EqualOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  explicit EqualOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+        : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  EqualOptionsBuilder &operator=(const EqualOptionsBuilder &);
+  flatbuffers::Offset<EqualOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<EqualOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<EqualOptions> CreateEqualOptions(
+    flatbuffers::FlatBufferBuilder &_fbb) {
+  EqualOptionsBuilder builder_(_fbb);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<EqualOptions> CreateEqualOptions(flatbuffers::FlatBufferBuilder &_fbb, const EqualOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+struct NotEqualOptionsT : public flatbuffers::NativeTable {
+  typedef NotEqualOptions TableType;
+  NotEqualOptionsT() {
+  }
+};
+
+struct NotEqualOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef NotEqualOptionsT NativeTableType;
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           verifier.EndTable();
+  }
+  NotEqualOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(NotEqualOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<NotEqualOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const NotEqualOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct NotEqualOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  explicit NotEqualOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+        : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  NotEqualOptionsBuilder &operator=(const NotEqualOptionsBuilder &);
+  flatbuffers::Offset<NotEqualOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<NotEqualOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<NotEqualOptions> CreateNotEqualOptions(
+    flatbuffers::FlatBufferBuilder &_fbb) {
+  NotEqualOptionsBuilder builder_(_fbb);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<NotEqualOptions> CreateNotEqualOptions(flatbuffers::FlatBufferBuilder &_fbb, const NotEqualOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
 struct OperatorCodeT : public flatbuffers::NativeTable {
   typedef OperatorCode TableType;
   BuiltinOperator builtin_code;
@@ -5068,6 +5190,12 @@ struct Operator FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
   const ExpandDimsOptions *builtin_options_as_ExpandDimsOptions() const {
     return builtin_options_type() == BuiltinOptions_ExpandDimsOptions ? static_cast<const ExpandDimsOptions *>(builtin_options()) : nullptr;
   }
+  const EqualOptions *builtin_options_as_EqualOptions() const {
+    return builtin_options_type() == BuiltinOptions_EqualOptions ? static_cast<const EqualOptions *>(builtin_options()) : nullptr;
+  }
+  const NotEqualOptions *builtin_options_as_NotEqualOptions() const {
+    return builtin_options_type() == BuiltinOptions_NotEqualOptions ? static_cast<const NotEqualOptions *>(builtin_options()) : nullptr;
+  }
   const flatbuffers::Vector<uint8_t> *custom_options() const {
     return GetPointer<const flatbuffers::Vector<uint8_t> *>(VT_CUSTOM_OPTIONS);
   }
@@ -5302,6 +5430,14 @@ template<> inline const ExpandDimsOptions *Operator::builtin_options_as<ExpandDi
   return builtin_options_as_ExpandDimsOptions();
 }
 
+template<> inline const EqualOptions *Operator::builtin_options_as<EqualOptions>() const {
+  return builtin_options_as_EqualOptions();
+}
+
+template<> inline const NotEqualOptions *Operator::builtin_options_as<NotEqualOptions>() const {
+  return builtin_options_as_NotEqualOptions();
+}
+
 struct OperatorBuilder {
   flatbuffers::FlatBufferBuilder &fbb_;
   flatbuffers::uoffset_t start_;
@@ -7196,6 +7332,52 @@ inline flatbuffers::Offset<SparseToDenseOptions> CreateSparseToDenseOptions(flat
       _validate_indices);
 }
 
+inline EqualOptionsT *EqualOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new EqualOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void EqualOptions::UnPackTo(EqualOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+}
+
+inline flatbuffers::Offset<EqualOptions> EqualOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const EqualOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateEqualOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<EqualOptions> CreateEqualOptions(flatbuffers::FlatBufferBuilder &_fbb, const EqualOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const EqualOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  return tflite::CreateEqualOptions(
+      _fbb);
+}
+
+inline NotEqualOptionsT *NotEqualOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new NotEqualOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void NotEqualOptions::UnPackTo(NotEqualOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+}
+
+inline flatbuffers::Offset<NotEqualOptions> NotEqualOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const NotEqualOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateNotEqualOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<NotEqualOptions> CreateNotEqualOptions(flatbuffers::FlatBufferBuilder &_fbb, const NotEqualOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const NotEqualOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  return tflite::CreateNotEqualOptions(
+      _fbb);
+}
+
 inline OperatorCodeT *OperatorCode::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
   auto _o = new OperatorCodeT();
   UnPackTo(_o, _resolver);
@@ -7590,6 +7772,14 @@ inline bool VerifyBuiltinOptions(flatbuffers::Verifier &verifier, const void *ob
       auto ptr = reinterpret_cast<const ExpandDimsOptions *>(obj);
       return verifier.VerifyTable(ptr);
     }
+    case BuiltinOptions_EqualOptions: {
+      auto ptr = reinterpret_cast<const EqualOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case BuiltinOptions_NotEqualOptions: {
+      auto ptr = reinterpret_cast<const NotEqualOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
     default: return false;
   }
 }
@@ -7816,6 +8006,14 @@ inline void *BuiltinOptionsUnion::UnPack(const void *obj, BuiltinOptions type, c
       auto ptr = reinterpret_cast<const ExpandDimsOptions *>(obj);
       return ptr->UnPack(resolver);
     }
+    case BuiltinOptions_EqualOptions: {
+      auto ptr = reinterpret_cast<const EqualOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    case BuiltinOptions_NotEqualOptions: {
+      auto ptr = reinterpret_cast<const NotEqualOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
     default: return nullptr;
   }
 }
@@ -8030,6 +8228,14 @@ inline flatbuffers::Offset<void> BuiltinOptionsUnion::Pack(flatbuffers::FlatBuff
       auto ptr = reinterpret_cast<const ExpandDimsOptionsT *>(value);
       return CreateExpandDimsOptions(_fbb, ptr, _rehasher).Union();
     }
+    case BuiltinOptions_EqualOptions: {
+      auto ptr = reinterpret_cast<const EqualOptionsT *>(value);
+      return CreateEqualOptions(_fbb, ptr, _rehasher).Union();
+    }
+    case BuiltinOptions_NotEqualOptions: {
+      auto ptr = reinterpret_cast<const NotEqualOptionsT *>(value);
+      return CreateNotEqualOptions(_fbb, ptr, _rehasher).Union();
+    }
     default: return 0;
   }
 }
@@ -8244,6 +8450,14 @@ inline BuiltinOptionsUnion::BuiltinOptionsUnion(const BuiltinOptionsUnion &u) FL
       value = new ExpandDimsOptionsT(*reinterpret_cast<ExpandDimsOptionsT *>(u.value));
       break;
     }
+    case BuiltinOptions_EqualOptions: {
+      value = new EqualOptionsT(*reinterpret_cast<EqualOptionsT *>(u.value));
+      break;
+    }
+    case BuiltinOptions_NotEqualOptions: {
+      value = new NotEqualOptionsT(*reinterpret_cast<NotEqualOptionsT *>(u.value));
+      break;
+    }
     default:
       break;
   }
@@ -8511,6 +8725,16 @@ inline void BuiltinOptionsUnion::Reset() {
       delete ptr;
       break;
     }
+    case BuiltinOptions_EqualOptions: {
+      auto ptr = reinterpret_cast<EqualOptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    case BuiltinOptions_NotEqualOptions: {
+      auto ptr = reinterpret_cast<NotEqualOptionsT *>(value);
+      delete ptr;
+      break;
+    }
     default: break;
   }
   value = nullptr;
diff --git a/tensorflow/contrib/lite/testing/generate_examples.py b/tensorflow/contrib/lite/testing/generate_examples.py
index 351187f520..723b6ae057 100644
--- a/tensorflow/contrib/lite/testing/generate_examples.py
+++ b/tensorflow/contrib/lite/testing/generate_examples.py
@@ -2165,6 +2165,74 @@ def make_arg_max_tests(zip_path):
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
 
+def make_equal_tests(zip_path):
+  """Make a set of tests to do equal."""
+
+  test_parameters = [{
+      "input_dtype": [tf.float32, tf.int32, tf.int64],
+      "input_shape_pair": [([1, 1, 1, 3], [1, 1, 1, 3]),
+                           ([2, 3, 4, 5], [2, 3, 4, 5]), ([2, 3, 3], [2, 3]),
+                           ([5, 5], [1]), ([10], [2, 4, 10])],
+  }]
+
+  def build_graph(parameters):
+    """Build the equal op testing graph."""
+    input_value1 = tf.placeholder(
+        dtype=parameters["input_dtype"],
+        name="input1",
+        shape=parameters["input_shape_pair"][0])
+    input_value2 = tf.placeholder(
+        dtype=parameters["input_dtype"],
+        name="input2",
+        shape=parameters["input_shape_pair"][1])
+    out = tf.equal(input_value1, input_value2)
+    return [input_value1, input_value2], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    input_value1 = create_tensor_data(parameters["input_dtype"],
+                                      parameters["input_shape_pair"][0])
+    input_value2 = create_tensor_data(parameters["input_dtype"],
+                                      parameters["input_shape_pair"][1])
+    return [input_value1, input_value2], sess.run(
+        outputs, feed_dict=dict(zip(inputs, [input_value1, input_value2])))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
+def make_not_equal_tests(zip_path):
+  """Make a set of tests to do not equal."""
+
+  test_parameters = [{
+      "input_dtype": [tf.float32, tf.int32, tf.int64],
+      "input_shape_pair": [([1, 1, 1, 3], [1, 1, 1, 3]),
+                           ([2, 3, 4, 5], [2, 3, 4, 5]), ([2, 3, 3], [2, 3]),
+                           ([5, 5], [1]), ([10], [2, 4, 10])],
+  }]
+
+  def build_graph(parameters):
+    """Build the not equal op testing graph."""
+    input_value1 = tf.placeholder(
+        dtype=parameters["input_dtype"],
+        name="input1",
+        shape=parameters["input_shape_pair"][0])
+    input_value2 = tf.placeholder(
+        dtype=parameters["input_dtype"],
+        name="input2",
+        shape=parameters["input_shape_pair"][1])
+    out = tf.not_equal(input_value1, input_value2)
+    return [input_value1, input_value2], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    input_value1 = create_tensor_data(parameters["input_dtype"],
+                                      parameters["input_shape_pair"][0])
+    input_value2 = create_tensor_data(parameters["input_dtype"],
+                                      parameters["input_shape_pair"][1])
+    return [input_value1, input_value2], sess.run(
+        outputs, feed_dict=dict(zip(inputs, [input_value1, input_value2])))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
 def make_greater_tests(zip_path):
   """Make a set of tests to do greater."""
 
diff --git a/tensorflow/contrib/lite/toco/export_tensorflow.cc b/tensorflow/contrib/lite/toco/export_tensorflow.cc
index 99f0c81a1b..76ce1c5802 100644
--- a/tensorflow/contrib/lite/toco/export_tensorflow.cc
+++ b/tensorflow/contrib/lite/toco/export_tensorflow.cc
@@ -1938,6 +1938,10 @@ void ConvertOperator(const Model& model, const Operator& src_op,
     ConvertRandomUniformOperator(
         model, static_cast<const RandomUniformOperator&>(src_op),
         tensorflow_graph);
+  } else if (src_op.type == OperatorType::kTensorFlowEqual) {
+    ConvertComparisonOperator(model, src_op, "Equal", tensorflow_graph);
+  } else if (src_op.type == OperatorType::kTensorFlowNotEqual) {
+    ConvertComparisonOperator(model, src_op, "NotEqual", tensorflow_graph);
   } else if (src_op.type == OperatorType::kTensorFlowGreater) {
     ConvertComparisonOperator(model, src_op, "Greater", tensorflow_graph);
   } else if (src_op.type == OperatorType::kTensorFlowGreaterEqual) {
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/propagate_array_data_types.cc b/tensorflow/contrib/lite/toco/graph_transformations/propagate_array_data_types.cc
index 64096fb069..92d283ca2c 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/propagate_array_data_types.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/propagate_array_data_types.cc
@@ -60,6 +60,8 @@ bool PropagateArrayDataTypes::Run(Model* model, std::size_t op_index) {
     case OperatorType::kTensorFlowLessEqual:
     case OperatorType::kTensorFlowGreater:
     case OperatorType::kTensorFlowGreaterEqual:
+    case OperatorType::kTensorFlowEqual:
+    case OperatorType::kTensorFlowNotEqual:
       // These operators unconditionally produce bool outputs
       SetDataTypeForAllOutputs(model, op, ArrayDataType::kBool);
       break;
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
index adb241da32..9e4262223e 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
@@ -1563,6 +1563,8 @@ bool PropagateFixedSizes::Run(Model* model, std::size_t op_index) {
     case OperatorType::kTensorFlowMaximum:
     case OperatorType::kTensorFlowMinimum:
     case OperatorType::kTensorFlowGreaterEqual:
+    case OperatorType::kTensorFlowEqual:
+    case OperatorType::kTensorFlowNotEqual:
       ProcessSimpleBinaryOperator(model, op);
       break;
     case OperatorType::kAddN:
diff --git a/tensorflow/contrib/lite/toco/import_tensorflow.cc b/tensorflow/contrib/lite/toco/import_tensorflow.cc
index b9ebf66ff2..b13a88a9eb 100644
--- a/tensorflow/contrib/lite/toco/import_tensorflow.cc
+++ b/tensorflow/contrib/lite/toco/import_tensorflow.cc
@@ -1908,6 +1908,12 @@ Status ImportTensorFlowNode(const tensorflow::NodeDef& node,
     ConvertSimpleOperator<SelectOperator, 3>(node, tf_import_flags, model);
   } else if (node.op() == "SparseToDense") {
     ConvertSparseToDenseOperator(node, tf_import_flags, model);
+  } else if (node.op() == "Equal") {
+    ConvertSimpleOperator<TensorFlowEqualOperator, 2>(node, tf_import_flags,
+                                                      model);
+  } else if (node.op() == "NotEqual") {
+    ConvertSimpleOperator<TensorFlowNotEqualOperator, 2>(node, tf_import_flags,
+                                                         model);
   } else {
     ConvertUnsupportedOperator(node, tf_import_flags, model);
   }
diff --git a/tensorflow/contrib/lite/toco/model.h b/tensorflow/contrib/lite/toco/model.h
index 1a4f87e363..81beb29372 100644
--- a/tensorflow/contrib/lite/toco/model.h
+++ b/tensorflow/contrib/lite/toco/model.h
@@ -136,6 +136,8 @@ enum class OperatorType {
   kReorderAxes,
   kSelect,
   kSparseToDense,
+  kTensorFlowEqual,
+  kTensorFlowNotEqual,
 };
 
 // Helper to deal with TensorFlow arrays using a different ordering of
@@ -1358,6 +1360,22 @@ struct TensorFlowGreaterEqualOperator : Operator {
       : Operator(OperatorType::kTensorFlowGreaterEqual) {}
 };
 
+// TensorFlow Equal equivalent. Refer to TensorFlow documentation for
+// details.
+// Not fully supported, just a placeholder to handle TensorFlow graphs and
+// support graph transformations to other operator types by matching sub-graphs.
+// Typically, this is only used as an input to an Assert node, so can be
+// removed as an unused node as we drop Assert nodes.
+struct TensorFlowEqualOperator : Operator {
+  TensorFlowEqualOperator() : Operator(OperatorType::kTensorFlowEqual) {}
+};
+
+// TensorFlow Not Equal equivalent. Refer to TensorFlow documentation for
+// details.
+struct TensorFlowNotEqualOperator : Operator {
+  TensorFlowNotEqualOperator() : Operator(OperatorType::kTensorFlowNotEqual) {}
+};
+
 // Global max reduction: computes the max of all of entries in the input array.
 // Thus the output is "0-dimensional": it consists of a single scalar value.
 //
diff --git a/tensorflow/contrib/lite/toco/tflite/operator.cc b/tensorflow/contrib/lite/toco/tflite/operator.cc
index a8518adefc..8bfd76db6e 100644
--- a/tensorflow/contrib/lite/toco/tflite/operator.cc
+++ b/tensorflow/contrib/lite/toco/tflite/operator.cc
@@ -1118,6 +1118,10 @@ std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList() {
   ops.emplace_back(
       new SimpleOperator<SliceOperator>("SLICE", OperatorType::kSlice));
   ops.emplace_back(new SimpleOperator<SinOperator>("SIN", OperatorType::kSin));
+  ops.emplace_back(new SimpleOperator<TensorFlowEqualOperator>(
+      "EQUAL", OperatorType::kTensorFlowEqual));
+  ops.emplace_back(new SimpleOperator<TensorFlowNotEqualOperator>(
+      "NOT_EQUAL", OperatorType::kTensorFlowNotEqual));
 
   return ops;
 }
diff --git a/tensorflow/contrib/lite/toco/tflite/operator_test.cc b/tensorflow/contrib/lite/toco/tflite/operator_test.cc
index d63c99a5f9..06bbe53516 100644
--- a/tensorflow/contrib/lite/toco/tflite/operator_test.cc
+++ b/tensorflow/contrib/lite/toco/tflite/operator_test.cc
@@ -119,6 +119,10 @@ TEST_F(OperatorTest, SimpleOperators) {
   CheckSimpleOperator<SelectOperator>("SELECT", OperatorType::kSelect);
   CheckSimpleOperator<SliceOperator>("SLICE", OperatorType::kSlice);
   CheckSimpleOperator<SinOperator>("SIN", OperatorType::kSin);
+  CheckSimpleOperator<TensorFlowEqualOperator>("EQUAL",
+                                               OperatorType::kTensorFlowEqual);
+  CheckSimpleOperator<TensorFlowNotEqualOperator>(
+      "NOT_EQUAL", OperatorType::kTensorFlowNotEqual);
 }
 
 TEST_F(OperatorTest, BuiltinAdd) {
diff --git a/tensorflow/contrib/lite/toco/tooling_util.cc b/tensorflow/contrib/lite/toco/tooling_util.cc
index fe7bed885d..5a82be3939 100644
--- a/tensorflow/contrib/lite/toco/tooling_util.cc
+++ b/tensorflow/contrib/lite/toco/tooling_util.cc
@@ -394,6 +394,8 @@ const char* OperatorTypeName(OperatorType type) {
     HANDLE_OPERATORTYPENAME_CASE(DynamicStitch)
     HANDLE_OPERATORTYPENAME_CASE(Select)
     HANDLE_OPERATORTYPENAME_CASE(SparseToDense)
+    HANDLE_OPERATORTYPENAME_CASE(TensorFlowEqual)
+    HANDLE_OPERATORTYPENAME_CASE(TensorFlowNotEqual)
     default:
       LOG(FATAL) << "Unhandled op type";
 #undef HANDLE_OPERATORTYPENAME_CASE
-- 
GitLab


From 3ddc925c8559f2989f3904f271f2d4175c2f3302 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 7 Jun 2018 02:59:00 -0700
Subject: [PATCH 419/610] Improve performance of
 HloComputation::MakeInstructionPostOrder

Previously it used the same infrastructure as HloInstruction::Accept,
which caused high overhead for large models due to the excess amount of
work it has to do to support modifying the graph under iteration and due
to the lack of caching on graphs with multiple sinks.

The new code is a very simple implementation of an iterative DFS based
topological sort.

PiperOrigin-RevId: 199606688
---
 .../compiler/xla/service/hlo_computation.cc   | 43 +++++++++++++++++--
 1 file changed, 40 insertions(+), 3 deletions(-)

diff --git a/tensorflow/compiler/xla/service/hlo_computation.cc b/tensorflow/compiler/xla/service/hlo_computation.cc
index b61eabbbf5..ed0ea39ff5 100644
--- a/tensorflow/compiler/xla/service/hlo_computation.cc
+++ b/tensorflow/compiler/xla/service/hlo_computation.cc
@@ -315,12 +315,49 @@ void ComputeComputationPostOrder(
   }
 }
 
+std::list<HloInstruction*> ComputeInstructionPostOrder(
+    HloInstruction* root, tensorflow::gtl::FlatSet<HloInstruction*>* visited) {
+  std::list<HloInstruction*> post_order;
+  std::vector<std::pair<HloInstruction*, bool>> dfs_stack;
+  dfs_stack.emplace_back(root, false);
+  while (!dfs_stack.empty()) {
+    const auto current = dfs_stack.back();
+    if (current.second) {
+      dfs_stack.pop_back();
+      if (!visited->insert(current.first).second) {
+        continue;
+      }
+      post_order.push_back(current.first);
+    } else {
+      if (visited->count(current.first)) {
+        dfs_stack.pop_back();
+        continue;
+      }
+      dfs_stack.back().second = true;
+
+      // Add the operands to the stack in reverse order so the first operand is
+      // processed first. This will produce a more natural ordering and a nicer
+      // result for things like HLO stringification.
+      const auto& operands = current.first->operands();
+      for (int64 i = operands.size() - 1; i >= 0; --i) {
+        dfs_stack.emplace_back(operands[i], false);
+      }
+
+      for (HloInstruction* op : current.first->control_predecessors()) {
+        dfs_stack.emplace_back(op, false);
+      }
+    }
+  }
+  return post_order;
+}
+
 }  // namespace
 
 std::list<HloInstruction*> HloComputation::MakeInstructionPostOrder() const {
   std::list<HloInstruction*> post_order;
   std::list<HloInstruction*> trace_instructions;
   tensorflow::gtl::FlatSet<HloInstruction*> added_instructions;
+  std::vector<HloInstruction> dfs_stack;
   for (auto& instruction : instructions_) {
     if (instruction->opcode() == HloOpcode::kTrace) {
       // Trace instructions aren't handled by the DFS visitor. Add trace
@@ -328,9 +365,9 @@ std::list<HloInstruction*> HloComputation::MakeInstructionPostOrder() const {
       // users).
       trace_instructions.push_back(instruction.get());
     } else if (instruction->users().empty()) {
-      post_order.splice(post_order.end(),
-                        InstructionPostOrderer::GetOrder(instruction.get(),
-                                                         &added_instructions));
+      post_order.splice(
+          post_order.end(),
+          ComputeInstructionPostOrder(instruction.get(), &added_instructions));
     }
   }
   post_order.splice(post_order.end(), trace_instructions);
-- 
GitLab


From fcc3282497d42ae842e25abe4fd904fb7a1cfd2a Mon Sep 17 00:00:00 2001
From: Ilya Biryukov <ibiryukov@google.com>
Date: Thu, 7 Jun 2018 04:58:34 -0700
Subject: [PATCH 420/610] Update revision of clang in download scripts

PiperOrigin-RevId: 199617749
---
 third_party/clang_toolchain/download_clang.bzl | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/third_party/clang_toolchain/download_clang.bzl b/third_party/clang_toolchain/download_clang.bzl
index 02d2b78067..a203245005 100644
--- a/third_party/clang_toolchain/download_clang.bzl
+++ b/third_party/clang_toolchain/download_clang.bzl
@@ -35,18 +35,18 @@ def download_clang(repo_ctx, out_folder):
 
   # Latest CLANG_REVISION and CLANG_SUB_REVISION of the Chromiums's release
   # can be found in https://chromium.googlesource.com/chromium/src/tools/clang/+/master/scripts/update.py
-  CLANG_REVISION = '332335'
+  CLANG_REVISION = '332838'
   CLANG_SUB_REVISION = 1
 
   package_version = '%s-%s' % (CLANG_REVISION, CLANG_SUB_REVISION)
 
   checksums = {
       'Linux_x64':
-          '5c234e0bc43b2386984ac34ac9c200c35686f2f7fa5ded0db031055bbc7f3e52',
+          'b9ef55de7500778f366039dbe62d1632074a3ef3673022eabf4e59d405730968',
       'Mac':
-          '69b94f16d261c0922c3853cdad768776f454dece2948363f1c4e20bc2ddbf95d',
+          '30d808512763c98cecf15f7bb654d845de3e8d065a95f5c5b6b3459254cc98d6',
       'Win':
-          '76c8897abf032f3e23598275517da60090f53cf35b673481f41fa98752d1ad37',
+          '277e799a190b22727c26b09986c0cedbd667a189f425318f421addf6a21ca4bd',
   }
 
   platform_folder = _get_platform_folder(repo_ctx.os.name)
-- 
GitLab


From 54773fd243ccae28bc8f935440cf87a4d4f4519f Mon Sep 17 00:00:00 2001
From: James Keeling <jtkeeling@google.com>
Date: Thu, 7 Jun 2018 05:22:38 -0700
Subject: [PATCH 421/610] Add GetAllRegisteredKernels helper

There was already a function to LOG(INFO) all of these kernels, but not to get
the protos themselves.

PiperOrigin-RevId: 199619906
---
 tensorflow/core/framework/op_kernel.cc      | 10 ++++++++++
 tensorflow/core/framework/op_kernel.h       |  4 ++++
 tensorflow/core/framework/op_kernel_test.cc | 22 +++++++++++++++++++++
 3 files changed, 36 insertions(+)

diff --git a/tensorflow/core/framework/op_kernel.cc b/tensorflow/core/framework/op_kernel.cc
index b05a9df7c1..ce213a63be 100644
--- a/tensorflow/core/framework/op_kernel.cc
+++ b/tensorflow/core/framework/op_kernel.cc
@@ -1120,6 +1120,16 @@ void LogAllRegisteredKernels() {
   }
 }
 
+std::vector<KernelDef> GetAllRegisteredKernels() {
+  const KernelRegistry* const typed_registry = GlobalKernelRegistryTyped();
+  std::vector<KernelDef> kernels;
+  kernels.reserve(typed_registry->size());
+  for (const auto& p : *typed_registry) {
+    kernels.emplace_back(p.second.def);
+  }
+  return kernels;
+}
+
 string KernelsRegisteredForOp(StringPiece op_name) {
   string ret;
   for (const auto& key_registration : *GlobalKernelRegistryTyped()) {
diff --git a/tensorflow/core/framework/op_kernel.h b/tensorflow/core/framework/op_kernel.h
index f577664709..5ebe6976fd 100644
--- a/tensorflow/core/framework/op_kernel.h
+++ b/tensorflow/core/framework/op_kernel.h
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/framework/cancellation.h"
 #include "tensorflow/core/framework/control_flow.h"
 #include "tensorflow/core/framework/device_base.h"
+#include "tensorflow/core/framework/kernel_def.pb.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/op.h"  // TODO(b/62899350): Remove
@@ -1303,6 +1304,9 @@ Status FindKernelDef(const DeviceType& device_type, const NodeDef& node_def,
 // missing kernel errors.
 void LogAllRegisteredKernels();
 
+// Gets a vector of all registered kernels.
+std::vector<KernelDef> GetAllRegisteredKernels();
+
 namespace kernel_factory {
 
 class OpKernelRegistrar {
diff --git a/tensorflow/core/framework/op_kernel_test.cc b/tensorflow/core/framework/op_kernel_test.cc
index bcd409e5c5..50319ca576 100644
--- a/tensorflow/core/framework/op_kernel_test.cc
+++ b/tensorflow/core/framework/op_kernel_test.cc
@@ -964,5 +964,27 @@ void BM_SelectInputRange(int iters) {
 BENCHMARK(BM_ConcatInputRange);
 BENCHMARK(BM_SelectInputRange);
 
+TEST(RegisteredKernels, CanCallGetAllRegisteredKernels) {
+  auto all_registered_kernels = GetAllRegisteredKernels();
+  auto has_name_test1 = [](const KernelDef& k) { return k.op() == "Test1"; };
+
+  // Verify we can find the "Test1" op registered above
+  auto test1_it = std::find_if(all_registered_kernels.begin(),
+                               all_registered_kernels.end(), has_name_test1);
+  ASSERT_NE(test1_it, all_registered_kernels.end());
+  EXPECT_EQ(test1_it->device_type(), "CPU");
+
+  // Verify there was just one kernel
+  ++test1_it;
+  EXPECT_EQ(
+      std::find_if(test1_it, all_registered_kernels.end(), has_name_test1),
+      all_registered_kernels.end());
+}
+
+// Simple test just to check we can call LogAllRegisteredKernels
+TEST(RegisteredKernels, CanLogAllRegisteredKernels) {
+  tensorflow::LogAllRegisteredKernels();
+}
+
 }  // namespace
 }  // namespace tensorflow
-- 
GitLab


From 3ccae103ce22e0758da92bef914e1bd289de2d86 Mon Sep 17 00:00:00 2001
From: Karan Kaw <karankaw@hotmail.com>
Date: Thu, 7 Jun 2018 18:37:24 +0530
Subject: [PATCH 422/610] Mentioned proper DLL name

---
 tensorflow/docs_src/install/install_java.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/docs_src/install/install_java.md b/tensorflow/docs_src/install/install_java.md
index fcc1a85b6b..3ec0cd5ee2 100644
--- a/tensorflow/docs_src/install/install_java.md
+++ b/tensorflow/docs_src/install/install_java.md
@@ -181,7 +181,7 @@ Take the following steps to install TensorFlow for Java on Windows:
      [TensorFlow for Java on Windows](https://storage.googleapis.com/tensorflow/libtensorflow/libtensorflow_jni-cpu-windows-x86_64-1.8.0.zip).
   3. Extract this .zip file.
 
-__Note__: The native library (`tensorflow_jni.dll`) requires `msvcp...dll` at runtime, which is included in the [Visual C++ 2015 Redistributable](https://www.microsoft.com/en-us/download/details.aspx?id=48145) package. 
+__Note__: The native library (`tensorflow_jni.dll`) requires `msvcp140.dll` at runtime, which is included in the [Visual C++ 2015 Redistributable](https://www.microsoft.com/en-us/download/details.aspx?id=48145) package. 
 
 ### Validate the installation
 
-- 
GitLab


From 4b3c9fea4355bf9094bdaeb2476f5959b33c2ffa Mon Sep 17 00:00:00 2001
From: Adria Puigdomenech <adriap@google.com>
Date: Thu, 7 Jun 2018 06:07:09 -0700
Subject: [PATCH 423/610] Implement scatter_nd_add for resource variables.

PiperOrigin-RevId: 199623738
---
 .../api_def_ResourceScatterNdAdd.pbtxt        | 69 ++++++++++++++++++
 .../api_def_ResourceScatterNdAdd.pbtxt        |  4 ++
 .../python_api/api_def_ScatterNdAdd.pbtxt     |  4 ++
 tensorflow/core/kernels/scatter_nd_op.cc      |  4 +-
 tensorflow/core/ops/state_ops.cc              |  9 +++
 .../resource_variable_ops_test.py             | 10 +++
 tensorflow/python/ops/state_ops.py            | 72 +++++++++++++++++--
 7 files changed, 166 insertions(+), 6 deletions(-)
 create mode 100644 tensorflow/core/api_def/base_api/api_def_ResourceScatterNdAdd.pbtxt
 create mode 100644 tensorflow/core/api_def/python_api/api_def_ResourceScatterNdAdd.pbtxt
 create mode 100644 tensorflow/core/api_def/python_api/api_def_ScatterNdAdd.pbtxt

diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceScatterNdAdd.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceScatterNdAdd.pbtxt
new file mode 100644
index 0000000000..3b3a274df5
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceScatterNdAdd.pbtxt
@@ -0,0 +1,69 @@
+op {
+  graph_op_name: "ResourceScatterNdAdd"
+  in_arg {
+    name: "ref"
+    description: <<END
+A resource handle. Must be from a VarHandleOp.
+END
+  }
+  in_arg {
+    name: "indices"
+    description: <<END
+A Tensor. Must be one of the following types: int32, int64.
+A tensor of indices into ref.
+END
+  }
+  in_arg {
+    name: "updates"
+    description: <<END
+A Tensor. Must have the same type as ref. A tensor of
+values to add to ref.
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+An optional bool. Defaults to True. If True, the assignment will
+be protected by a lock; otherwise the behavior is undefined,
+but may exhibit less contention.
+END
+  }
+  summary: "Adds sparse `updates` to individual values or slices within a given"
+  description: <<END
+variable according to `indices`.
+
+`ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
+
+`indices` must be an integer tensor, containing indices into `ref`.
+It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
+
+The innermost dimension of `indices` (with length `K`) corresponds to
+indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
+dimension of `ref`.
+
+`updates` is a `Tensor` of rank `Q-1+P-K` with shape:
+
+```
+[d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].
+```
+
+For example, say we want to add 4 scattered elements to a rank-1 tensor with
+8 elements. In Python, that update would look like this:
+
+```python
+    ref = tfe.Variable([1, 2, 3, 4, 5, 6, 7, 8])
+    indices = tf.constant([[4], [3], [1] ,[7]])
+    updates = tf.constant([9, 10, 11, 12])
+    update = tf.scatter_nd_add(ref, indices, updates)
+    with tf.Session() as sess:
+      print sess.run(update)
+```
+
+The resulting update to ref would look like this:
+
+    [1, 13, 3, 14, 14, 6, 7, 20]
+
+See @{tf.scatter_nd} for more details about how to make updates to
+slices.
+END
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ResourceScatterNdAdd.pbtxt b/tensorflow/core/api_def/python_api/api_def_ResourceScatterNdAdd.pbtxt
new file mode 100644
index 0000000000..ffef3ab522
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ResourceScatterNdAdd.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ResourceScatterNdAdd"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ScatterNdAdd.pbtxt b/tensorflow/core/api_def/python_api/api_def_ScatterNdAdd.pbtxt
new file mode 100644
index 0000000000..f6c8af5c33
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ScatterNdAdd.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ScatterNdAdd"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/kernels/scatter_nd_op.cc b/tensorflow/core/kernels/scatter_nd_op.cc
index 8ef6e77398..bdc268cf49 100644
--- a/tensorflow/core/kernels/scatter_nd_op.cc
+++ b/tensorflow/core/kernels/scatter_nd_op.cc
@@ -260,7 +260,9 @@ class ScatterNdUpdateOp : public OpKernel {
   REGISTER_SCATTER_ND_UPDATE_KERNEL(type, dev, "ScatterNdNonAliasingAdd", \
                                     scatter_nd_op::UpdateOp::ADD);        \
   REGISTER_SCATTER_ND_UPDATE_KERNEL(type, dev, "ScatterNdSub",            \
-                                    scatter_nd_op::UpdateOp::SUB);
+                                    scatter_nd_op::UpdateOp::SUB);        \
+  REGISTER_RESOURCE_SCATTER_ND_UPDATE_KERNEL(                             \
+      type, dev, "ResourceScatterNdAdd", scatter_nd_op::UpdateOp::ADD);
 
 #define REGISTER_SCATTER_ND(type, dev) \
   REGISTER_SCATTER_ND_KERNEL(type, dev, "ScatterNd");
diff --git a/tensorflow/core/ops/state_ops.cc b/tensorflow/core/ops/state_ops.cc
index 664f52452e..aa975cb77b 100644
--- a/tensorflow/core/ops/state_ops.cc
+++ b/tensorflow/core/ops/state_ops.cc
@@ -222,6 +222,15 @@ REGISTER_OP("ResourceScatterNdUpdate")
     .Attr("use_locking: bool = true")
     .SetShapeFn(shape_inference::ScatterNdUpdateShape);
 
+REGISTER_OP("ResourceScatterNdAdd")
+    .Input("ref: resource")
+    .Input("indices: Tindices")
+    .Input("updates: T")
+    .Attr("T: type")
+    .Attr("Tindices: {int32, int64}")
+    .Attr("use_locking: bool = true")
+    .SetShapeFn(shape_inference::ScatterNdUpdateShape);
+
 REGISTER_OP("ScatterNdAdd")
     .Input("ref: Ref(T)")
     .Input("indices: Tindices")
diff --git a/tensorflow/python/kernel_tests/resource_variable_ops_test.py b/tensorflow/python/kernel_tests/resource_variable_ops_test.py
index 00d517e64e..82e0d153c2 100644
--- a/tensorflow/python/kernel_tests/resource_variable_ops_test.py
+++ b/tensorflow/python/kernel_tests/resource_variable_ops_test.py
@@ -822,6 +822,16 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
       state_ops.scatter_add(v, [1], [3])
       self.assertAllEqual([1.0, 5.0], v.numpy())
 
+  def testScatterNdAddStateOps(self):
+    with context.eager_mode():
+      v = resource_variable_ops.ResourceVariable(
+          [1, 1, 1, 1, 1, 1, 1, 1], dtype=dtypes.float32, name="add")
+      indices = constant_op.constant([[4], [3], [1], [7]], dtype=dtypes.int32)
+      updates = constant_op.constant([9, 10, 11, 12], dtype=dtypes.float32)
+      expected = np.array([1, 12, 1, 11, 10, 1, 1, 13])
+      state_ops.scatter_nd_add(v, indices, updates)
+      self.assertAllClose(expected, v.numpy())
+
   def testScatterUpdateCast(self):
     with context.eager_mode():
       v = resource_variable_ops.ResourceVariable([1.0, 2.0], name="update")
diff --git a/tensorflow/python/ops/state_ops.py b/tensorflow/python/ops/state_ops.py
index 94d7458ec8..08b7cda73b 100644
--- a/tensorflow/python/ops/state_ops.py
+++ b/tensorflow/python/ops/state_ops.py
@@ -338,7 +338,6 @@ def scatter_nd_update(ref, indices, updates, use_locking=True, name=None):
   Args:
     ref: A Variable.
     indices: A `Tensor`. Must be one of the following types: `int32`, `int64`.
-      A Tensor. Must be one of the following types: int32, int64.
       A tensor of indices into ref.
     updates: A `Tensor`. Must have the same type as `ref`.
       A Tensor. Must have the same type as ref. A tensor of updated
@@ -355,10 +354,9 @@ def scatter_nd_update(ref, indices, updates, use_locking=True, name=None):
   if ref.dtype._is_ref_dtype:
     return gen_state_ops.scatter_nd_update(
         ref, indices, updates, use_locking, name)
-  with ops.control_dependencies([gen_state_ops.resource_scatter_nd_update(
-      ref.handle, indices, ops.convert_to_tensor(updates, dtype=ref.dtype),
-      use_locking, name)]):
-    return ref.read_value()
+  return ref._lazy_read(gen_state_ops.resource_scatter_nd_update(  # pylint: disable=protected-access
+      ref.handle, indices, ops.convert_to_tensor(updates, ref.dtype),
+      name=name))
 
 
 @tf_export("scatter_add")
@@ -411,3 +409,67 @@ def scatter_add(ref, indices, updates, use_locking=False, name=None):
   return ref._lazy_read(gen_resource_variable_ops.resource_scatter_add(  # pylint: disable=protected-access
       ref.handle, indices, ops.convert_to_tensor(updates, ref.dtype),
       name=name))
+
+
+@tf_export("scatter_nd_add")
+def scatter_nd_add(ref, indices, updates, use_locking=False, name=None):
+  r"""Applies sparse addition to individual values or slices in a Variable.
+
+  `ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
+
+  `indices` must be an integer tensor, containing indices into `ref`.
+  It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
+
+  The innermost dimension of `indices` (with length `K`) corresponds to
+  indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
+  dimension of `ref`.
+
+  `updates` is a `Tensor` of rank `Q-1+P-K` with shape:
+
+  ```
+  [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].
+  ```
+
+  For example, say we want to add 4 scattered elements to a rank-1 tensor with
+  8 elements. In Python, that update would look like this:
+
+  ```python
+      ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])
+      indices = tf.constant([[4], [3], [1] ,[7]])
+      updates = tf.constant([9, 10, 11, 12])
+      add = tf.scatter_nd_add(ref, indices, updates)
+      with tf.Session() as sess:
+        print sess.run(add)
+  ```
+
+  The resulting update to ref would look like this:
+
+      [1, 13, 3, 14, 14, 6, 7, 20]
+
+  See @{tf.scatter_nd} for more details about how to make updates to
+  slices.
+
+  Args:
+    ref: A mutable `Tensor`. Must be one of the following types: `float32`,
+      `float64`, `int32`, `uint8`, `int16`, `int8`, `complex64`, `int64`,
+      `qint8`, `quint8`, `qint32`, `bfloat16`, `uint16`, `complex128`, `half`,
+      `uint32`, `uint64`. A mutable Tensor. Should be from a Variable node.
+    indices: A `Tensor`. Must be one of the following types: `int32`, `int64`.
+      A tensor of indices into ref.
+    updates: A `Tensor`. Must have the same type as `ref`.
+      A tensor of updated values to add to ref.
+    use_locking: An optional `bool`. Defaults to `False`.
+      If True, the assignment will be protected by a lock;
+      otherwise the behavior is undefined,
+      but may exhibit less contention.
+    name: A name for the operation (optional).
+
+  Returns:
+    A mutable `Tensor`. Has the same type as `ref`.
+  """
+  if ref.dtype._is_ref_dtype:
+    return gen_state_ops.scatter_nd_add(
+        ref, indices, updates, use_locking, name)
+  return ref._lazy_read(gen_state_ops.resource_scatter_nd_add(  # pylint: disable=protected-access
+      ref.handle, indices, ops.convert_to_tensor(updates, ref.dtype),
+      name=name))
-- 
GitLab


From 866bc315e4c05159227ae2dabcead31d8e58e725 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 7 Jun 2018 07:21:15 -0700
Subject: [PATCH 424/610] Update ops-related pbtxt files.

PiperOrigin-RevId: 199631126
---
 .../core/ops/compat/ops_history.v1.pbtxt      | 37 +++++++++++++++++++
 tensorflow/core/ops/ops.pbtxt                 | 37 +++++++++++++++++++
 2 files changed, 74 insertions(+)

diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index 16e9b2e02e..1b4bec7bc8 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -48133,6 +48133,43 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ResourceScatterNdAdd"
+  input_arg {
+    name: "ref"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "ResourceScatterNdUpdate"
   input_arg {
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index 7df43663c9..1dfaeeabad 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -23631,6 +23631,43 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ResourceScatterNdAdd"
+  input_arg {
+    name: "ref"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "ResourceScatterNdUpdate"
   input_arg {
-- 
GitLab


From 3f31670ddc140a62ffac9d8b9310f71bdfbae629 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 7 Jun 2018 07:45:14 -0700
Subject: [PATCH 425/610] Go: Update generated wrapper functions for TensorFlow
 ops. PiperOrigin-RevId: 199633475

---
 tensorflow/go/op/wrappers.go | 758 +++++++++++++++++------------------
 1 file changed, 379 insertions(+), 379 deletions(-)

diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go
index 550ef8944d..6fc7087cb1 100644
--- a/tensorflow/go/op/wrappers.go
+++ b/tensorflow/go/op/wrappers.go
@@ -2892,6 +2892,28 @@ func DiagPart(scope *Scope, input tf.Output) (diagonal tf.Output) {
 	return op.Output(0)
 }
 
+// Gives a guarantee to the TF runtime that the input tensor is a constant.
+//
+// The runtime is then free to make optimizations based on this.
+//
+// Only accepts value typed tensors as inputs and rejects resource variable handles
+// as input.
+//
+// Returns the input tensor without modification.
+func GuaranteeConst(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "GuaranteeConst",
+		Input: []tf.Input{
+			input,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Creates a sequence of numbers.
 //
 // This operation creates a sequence of numbers that begins at `start` and
@@ -7457,6 +7479,36 @@ func RandomPoisson(scope *Scope, shape tf.Output, rate tf.Output, optional ...Ra
 	return op.Output(0)
 }
 
+// Returns the element-wise sum of a list of tensors.
+//
+// `tf.accumulate_n_v2` performs the same operation as `tf.add_n`, but does not
+// wait for all of its inputs to be ready before beginning to sum. This can
+// save memory if inputs are ready at different times, since minimum temporary
+// storage is proportional to the output size rather than the inputs size.
+//
+// Unlike the original `accumulate_n`, `accumulate_n_v2` is differentiable.
+//
+// Returns a `Tensor` of same shape and type as the elements of `inputs`.
+//
+// Arguments:
+//	inputs: A list of `Tensor` objects, each with same shape and type.
+//	shape: Shape of elements of `inputs`.
+func AccumulateNV2(scope *Scope, inputs []tf.Output, shape tf.Shape) (sum tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"shape": shape}
+	opspec := tf.OpSpec{
+		Type: "AccumulateNV2",
+		Input: []tf.Input{
+			tf.OutputList(inputs),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // DepthwiseConv2dNativeBackpropFilterAttr is an optional argument to DepthwiseConv2dNativeBackpropFilter.
 type DepthwiseConv2dNativeBackpropFilterAttr func(optionalAttr)
 
@@ -8326,95 +8378,6 @@ func RestoreSlice(scope *Scope, file_pattern tf.Output, tensor_name tf.Output, s
 	return op.Output(0)
 }
 
-// ImagAttr is an optional argument to Imag.
-type ImagAttr func(optionalAttr)
-
-// ImagTout sets the optional Tout attribute to value.
-// If not specified, defaults to DT_FLOAT
-func ImagTout(value tf.DataType) ImagAttr {
-	return func(m optionalAttr) {
-		m["Tout"] = value
-	}
-}
-
-// Returns the imaginary part of a complex number.
-//
-// Given a tensor `input` of complex numbers, this operation returns a tensor of
-// type `float` that is the imaginary part of each element in `input`. All
-// elements in `input` must be complex numbers of the form \\(a + bj\\), where *a*
-// is the real part and *b* is the imaginary part returned by this operation.
-//
-// For example:
-//
-// ```
-// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
-// tf.imag(input) ==> [4.75, 5.75]
-// ```
-func Imag(scope *Scope, input tf.Output, optional ...ImagAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "Imag",
-		Input: []tf.Input{
-			input,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// ComplexAttr is an optional argument to Complex.
-type ComplexAttr func(optionalAttr)
-
-// ComplexTout sets the optional Tout attribute to value.
-// If not specified, defaults to DT_COMPLEX64
-func ComplexTout(value tf.DataType) ComplexAttr {
-	return func(m optionalAttr) {
-		m["Tout"] = value
-	}
-}
-
-// Converts two real numbers to a complex number.
-//
-// Given a tensor `real` representing the real part of a complex number, and a
-// tensor `imag` representing the imaginary part of a complex number, this
-// operation returns complex numbers elementwise of the form \\(a + bj\\), where
-// *a* represents the `real` part and *b* represents the `imag` part.
-//
-// The input tensors `real` and `imag` must have the same shape.
-//
-// For example:
-//
-// ```
-// # tensor 'real' is [2.25, 3.25]
-// # tensor `imag` is [4.75, 5.75]
-// tf.complex(real, imag) ==> [[2.25 + 4.75j], [3.25 + 5.75j]]
-// ```
-func Complex(scope *Scope, real tf.Output, imag tf.Output, optional ...ComplexAttr) (out tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "Complex",
-		Input: []tf.Input{
-			real, imag,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Divides sparse updates into the variable referenced by `resource`.
 //
 // This operation computes
@@ -8456,6 +8419,23 @@ func ResourceScatterDiv(scope *Scope, resource tf.Output, indices tf.Output, upd
 	return scope.AddOperation(opspec)
 }
 
+// Mutually reduces multiple tensors of identical type and shape.
+func CollectiveReduce(scope *Scope, input tf.Output, group_size int64, group_key int64, instance_key int64, merge_op string, final_op string, subdiv_offsets []int64) (data tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"group_size": group_size, "group_key": group_key, "instance_key": instance_key, "merge_op": merge_op, "final_op": final_op, "subdiv_offsets": subdiv_offsets}
+	opspec := tf.OpSpec{
+		Type: "CollectiveReduce",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // StatelessRandomNormalAttr is an optional argument to StatelessRandomNormal.
 type StatelessRandomNormalAttr func(optionalAttr)
 
@@ -11174,63 +11154,6 @@ func DeserializeSparse(scope *Scope, serialized_sparse tf.Output, dtype tf.DataT
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// ResourceApplyRMSPropAttr is an optional argument to ResourceApplyRMSProp.
-type ResourceApplyRMSPropAttr func(optionalAttr)
-
-// ResourceApplyRMSPropUseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, updating of the var, ms, and mom tensors is protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyRMSPropUseLocking(value bool) ResourceApplyRMSPropAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Update '*var' according to the RMSProp algorithm.
-//
-// Note that in dense implementation of this algorithm, ms and mom will
-// update even if the grad is zero, but in this sparse implementation, ms
-// and mom will not update in iterations during which the grad is zero.
-//
-// mean_square = decay * mean_square + (1-decay) * gradient ** 2
-// Delta = learning_rate * gradient / sqrt(mean_square + epsilon)
-//
-// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
-// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
-// var <- var - mom
-//
-// Arguments:
-//	var_: Should be from a Variable().
-//	ms: Should be from a Variable().
-//	mom: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	rho: Decay rate. Must be a scalar.
-//
-//	epsilon: Ridge term. Must be a scalar.
-//	grad: The gradient.
-//
-// Returns the created operation.
-func ResourceApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyRMSPropAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceApplyRMSProp",
-		Input: []tf.Input{
-			var_, ms, mom, lr, rho, momentum, epsilon, grad,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
 // ResourceScatterNdUpdateAttr is an optional argument to ResourceScatterNdUpdate.
 type ResourceScatterNdUpdateAttr func(optionalAttr)
 
@@ -11759,23 +11682,6 @@ func TensorArrayGatherV3(scope *Scope, handle tf.Output, indices tf.Output, flow
 	return op.Output(0)
 }
 
-// Mutually reduces multiple tensors of identical type and shape.
-func CollectiveReduce(scope *Scope, input tf.Output, group_size int64, group_key int64, instance_key int64, merge_op string, final_op string, subdiv_offsets []int64) (data tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"group_size": group_size, "group_key": group_key, "instance_key": instance_key, "merge_op": merge_op, "final_op": final_op, "subdiv_offsets": subdiv_offsets}
-	opspec := tf.OpSpec{
-		Type: "CollectiveReduce",
-		Input: []tf.Input{
-			input,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // This op consumes a lock created by `MutexLock`.
 //
 // This op exists to consume a tensor created by `MutexLock` (other than
@@ -11877,81 +11783,6 @@ func RestoreV2(scope *Scope, prefix tf.Output, tensor_names tf.Output, shape_and
 	return tensors
 }
 
-// Creates a dataset that skips `count` elements from the `input_dataset`.
-//
-// Arguments:
-//
-//	count: A scalar representing the number of elements from the `input_dataset`
-// that should be skipped.  If count is -1, skips everything.
-//
-//
-func SkipDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "SkipDataset",
-		Input: []tf.Input{
-			input_dataset, count,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes the maximum along segments of a tensor.
-//
-// Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
-// segments.
-//
-// Computes a tensor such that
-// \\(output_i = \max_j(data_j)\\) where `max` is over `j` such
-// that `segment_ids[j] == i`.
-//
-// If the max is empty for a given segment ID `i`, `output[i] = 0`.
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMax.png" alt>
-// </div>
-//
-// Arguments:
-//
-//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
-// first dimension.  Values should be sorted and can be repeated.
-//
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SegmentMax(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SegmentMax",
-		Input: []tf.Input{
-			data, segment_ids,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes hyperbolic tangent of `x` element-wise.
-func Tanh(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Tanh",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Receives a tensor value broadcast from another device.
 func CollectiveBcastRecv(scope *Scope, T tf.DataType, group_size int64, group_key int64, instance_key int64, shape tf.Shape) (data tf.Output) {
 	if scope.Err() != nil {
@@ -13665,6 +13496,170 @@ func TopK(scope *Scope, input tf.Output, k int64, optional ...TopKAttr) (values
 	return op.Output(0), op.Output(1)
 }
 
+// ComplexAttr is an optional argument to Complex.
+type ComplexAttr func(optionalAttr)
+
+// ComplexTout sets the optional Tout attribute to value.
+// If not specified, defaults to DT_COMPLEX64
+func ComplexTout(value tf.DataType) ComplexAttr {
+	return func(m optionalAttr) {
+		m["Tout"] = value
+	}
+}
+
+// Converts two real numbers to a complex number.
+//
+// Given a tensor `real` representing the real part of a complex number, and a
+// tensor `imag` representing the imaginary part of a complex number, this
+// operation returns complex numbers elementwise of the form \\(a + bj\\), where
+// *a* represents the `real` part and *b* represents the `imag` part.
+//
+// The input tensors `real` and `imag` must have the same shape.
+//
+// For example:
+//
+// ```
+// # tensor 'real' is [2.25, 3.25]
+// # tensor `imag` is [4.75, 5.75]
+// tf.complex(real, imag) ==> [[2.25 + 4.75j], [3.25 + 5.75j]]
+// ```
+func Complex(scope *Scope, real tf.Output, imag tf.Output, optional ...ComplexAttr) (out tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Complex",
+		Input: []tf.Input{
+			real, imag,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ImagAttr is an optional argument to Imag.
+type ImagAttr func(optionalAttr)
+
+// ImagTout sets the optional Tout attribute to value.
+// If not specified, defaults to DT_FLOAT
+func ImagTout(value tf.DataType) ImagAttr {
+	return func(m optionalAttr) {
+		m["Tout"] = value
+	}
+}
+
+// Returns the imaginary part of a complex number.
+//
+// Given a tensor `input` of complex numbers, this operation returns a tensor of
+// type `float` that is the imaginary part of each element in `input`. All
+// elements in `input` must be complex numbers of the form \\(a + bj\\), where *a*
+// is the real part and *b* is the imaginary part returned by this operation.
+//
+// For example:
+//
+// ```
+// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
+// tf.imag(input) ==> [4.75, 5.75]
+// ```
+func Imag(scope *Scope, input tf.Output, optional ...ImagAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Imag",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes the maximum along segments of a tensor.
+//
+// Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
+// segments.
+//
+// Computes a tensor such that
+// \\(output_i = \max_j(data_j)\\) where `max` is over `j` such
+// that `segment_ids[j] == i`.
+//
+// If the max is empty for a given segment ID `i`, `output[i] = 0`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMax.png" alt>
+// </div>
+//
+// Arguments:
+//
+//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
+// first dimension.  Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SegmentMax(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SegmentMax",
+		Input: []tf.Input{
+			data, segment_ids,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes hyperbolic tangent of `x` element-wise.
+func Tanh(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Tanh",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Creates a dataset that skips `count` elements from the `input_dataset`.
+//
+// Arguments:
+//
+//	count: A scalar representing the number of elements from the `input_dataset`
+// that should be skipped.  If count is -1, skips everything.
+//
+//
+func SkipDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "SkipDataset",
+		Input: []tf.Input{
+			input_dataset, count,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Compute the Hurwitz zeta function \\(\zeta(x, q)\\).
 //
 // The Hurwitz zeta function is defined as:
@@ -13875,23 +13870,59 @@ func ResourceApplyCenteredRMSPropUseLocking(value bool) ResourceApplyCenteredRMS
 //	epsilon: Ridge term. Must be a scalar.
 //	grad: The gradient.
 //
-// Returns the created operation.
-func ResourceApplyCenteredRMSProp(scope *Scope, var_ tf.Output, mg tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyCenteredRMSPropAttr) (o *tf.Operation) {
+// Returns the created operation.
+func ResourceApplyCenteredRMSProp(scope *Scope, var_ tf.Output, mg tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyCenteredRMSPropAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceApplyCenteredRMSProp",
+		Input: []tf.Input{
+			var_, mg, ms, mom, lr, rho, momentum, epsilon, grad,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Computes the gradient for the inverse of `x` wrt its input.
+//
+// Specifically, `grad = -dy * y*y`, where `y = 1/x`, and `dy`
+// is the corresponding input gradient.
+func ReciprocalGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ReciprocalGrad",
+		Input: []tf.Input{
+			y, dy,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns the min of x and y (i.e. x < y ? x : y) element-wise.
+//
+// *NOTE*: `Minimum` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Minimum(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyCenteredRMSProp",
+		Type: "Minimum",
 		Input: []tf.Input{
-			var_, mg, ms, mom, lr, rho, momentum, epsilon, grad,
+			x, y,
 		},
-		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
 // RealAttr is an optional argument to Real.
@@ -16287,6 +16318,63 @@ func MutableDenseHashTableV2(scope *Scope, empty_key tf.Output, value_dtype tf.D
 	return op.Output(0)
 }
 
+// ResourceApplyRMSPropAttr is an optional argument to ResourceApplyRMSProp.
+type ResourceApplyRMSPropAttr func(optionalAttr)
+
+// ResourceApplyRMSPropUseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, updating of the var, ms, and mom tensors is protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyRMSPropUseLocking(value bool) ResourceApplyRMSPropAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update '*var' according to the RMSProp algorithm.
+//
+// Note that in dense implementation of this algorithm, ms and mom will
+// update even if the grad is zero, but in this sparse implementation, ms
+// and mom will not update in iterations during which the grad is zero.
+//
+// mean_square = decay * mean_square + (1-decay) * gradient ** 2
+// Delta = learning_rate * gradient / sqrt(mean_square + epsilon)
+//
+// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
+// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
+// var <- var - mom
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	ms: Should be from a Variable().
+//	mom: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	rho: Decay rate. Must be a scalar.
+//
+//	epsilon: Ridge term. Must be a scalar.
+//	grad: The gradient.
+//
+// Returns the created operation.
+func ResourceApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyRMSPropAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceApplyRMSProp",
+		Input: []tf.Input{
+			var_, ms, mom, lr, rho, momentum, epsilon, grad,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
 // Returns element-wise remainder of division. This emulates C semantics in that
 //
 // the result here is consistent with a truncating divide. E.g. `truncate(x / y) *
@@ -19194,88 +19282,58 @@ func SquaredDifference(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	return op.Output(0)
 }
 
-// Forwards the input to the output.
-//
-// This operator represents the loop termination condition used by the
-// "pivot" switches of a loop.
-//
-// Arguments:
-//	input: A boolean scalar, representing the branch predicate of the Switch op.
-//
-// Returns The same tensor as `input`.
-func LoopCond(scope *Scope, input tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "LoopCond",
-		Input: []tf.Input{
-			input,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
+// RandomGammaAttr is an optional argument to RandomGamma.
+type RandomGammaAttr func(optionalAttr)
 
-// Computes the gradient for the inverse of `x` wrt its input.
+// RandomGammaSeed sets the optional seed attribute to value.
 //
-// Specifically, `grad = -dy * y*y`, where `y = 1/x`, and `dy`
-// is the corresponding input gradient.
-func ReciprocalGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ReciprocalGrad",
-		Input: []tf.Input{
-			y, dy,
-		},
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func RandomGammaSeed(value int64) RandomGammaAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Returns the min of x and y (i.e. x < y ? x : y) element-wise.
+// RandomGammaSeed2 sets the optional seed2 attribute to value.
 //
-// *NOTE*: `Minimum` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Minimum(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Minimum",
-		Input: []tf.Input{
-			x, y,
-		},
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomGammaSeed2(value int64) RandomGammaAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Returns the element-wise sum of a list of tensors.
-//
-// `tf.accumulate_n_v2` performs the same operation as `tf.add_n`, but does not
-// wait for all of its inputs to be ready before beginning to sum. This can
-// save memory if inputs are ready at different times, since minimum temporary
-// storage is proportional to the output size rather than the inputs size.
-//
-// Unlike the original `accumulate_n`, `accumulate_n_v2` is differentiable.
+// Outputs random values from the Gamma distribution(s) described by alpha.
 //
-// Returns a `Tensor` of same shape and type as the elements of `inputs`.
+// This op uses the algorithm by Marsaglia et al. to acquire samples via
+// transformation-rejection from pairs of uniform and normal random variables.
+// See http://dl.acm.org/citation.cfm?id=358414
 //
 // Arguments:
-//	inputs: A list of `Tensor` objects, each with same shape and type.
-//	shape: Shape of elements of `inputs`.
-func AccumulateNV2(scope *Scope, inputs []tf.Output, shape tf.Shape) (sum tf.Output) {
+//	shape: 1-D integer tensor. Shape of independent samples to draw from each
+// distribution described by the shape parameters given in alpha.
+//	alpha: A tensor in which each scalar is a "shape" parameter describing the
+// associated gamma distribution.
+//
+// Returns A tensor with shape `shape + shape(alpha)`. Each slice
+// `[:, ..., :, i0, i1, ...iN]` contains the samples drawn for
+// `alpha[i0, i1, ...iN]`. The dtype of the output matches the dtype of alpha.
+func RandomGamma(scope *Scope, shape tf.Output, alpha tf.Output, optional ...RandomGammaAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"shape": shape}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "AccumulateNV2",
+		Type: "RandomGamma",
 		Input: []tf.Input{
-			tf.OutputList(inputs),
+			shape, alpha,
 		},
 		Attrs: attrs,
 	}
@@ -19331,60 +19389,24 @@ func QuantizeDownAndShrinkRange(scope *Scope, input tf.Output, input_min tf.Outp
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// RandomGammaAttr is an optional argument to RandomGamma.
-type RandomGammaAttr func(optionalAttr)
-
-// RandomGammaSeed sets the optional seed attribute to value.
-//
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func RandomGammaSeed(value int64) RandomGammaAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// RandomGammaSeed2 sets the optional seed2 attribute to value.
-//
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomGammaSeed2(value int64) RandomGammaAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// Outputs random values from the Gamma distribution(s) described by alpha.
+// Forwards the input to the output.
 //
-// This op uses the algorithm by Marsaglia et al. to acquire samples via
-// transformation-rejection from pairs of uniform and normal random variables.
-// See http://dl.acm.org/citation.cfm?id=358414
+// This operator represents the loop termination condition used by the
+// "pivot" switches of a loop.
 //
 // Arguments:
-//	shape: 1-D integer tensor. Shape of independent samples to draw from each
-// distribution described by the shape parameters given in alpha.
-//	alpha: A tensor in which each scalar is a "shape" parameter describing the
-// associated gamma distribution.
+//	input: A boolean scalar, representing the branch predicate of the Switch op.
 //
-// Returns A tensor with shape `shape + shape(alpha)`. Each slice
-// `[:, ..., :, i0, i1, ...iN]` contains the samples drawn for
-// `alpha[i0, i1, ...iN]`. The dtype of the output matches the dtype of alpha.
-func RandomGamma(scope *Scope, shape tf.Output, alpha tf.Output, optional ...RandomGammaAttr) (output tf.Output) {
+// Returns The same tensor as `input`.
+func LoopCond(scope *Scope, input tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "RandomGamma",
+		Type: "LoopCond",
 		Input: []tf.Input{
-			shape, alpha,
+			input,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
@@ -30688,25 +30710,3 @@ func SplitV(scope *Scope, value tf.Output, size_splits tf.Output, axis tf.Output
 	}
 	return output
 }
-
-// Gives a guarantee to the TF runtime that the input tensor is a constant.
-//
-// The runtime is then free to make optimizations based on this.
-//
-// Only accepts value typed tensors as inputs and rejects resource variable handles
-// as input.
-//
-// Returns the input tensor without modification.
-func GuaranteeConst(scope *Scope, input tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "GuaranteeConst",
-		Input: []tf.Input{
-			input,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-- 
GitLab


From 537e8c7a28b6b793eb570c957c4e90bf81ce9c3b Mon Sep 17 00:00:00 2001
From: Skye Wanderman-Milne <skyewm@google.com>
Date: Thu, 7 Jun 2018 08:47:36 -0700
Subject: [PATCH 426/610] Remove _USE_C_API staging from session.py.

PiperOrigin-RevId: 199641205
---
 tensorflow/python/client/session.py | 159 +++++++---------------------
 1 file changed, 39 insertions(+), 120 deletions(-)

diff --git a/tensorflow/python/client/session.py b/tensorflow/python/client/session.py
index 5507d011bb..648e35cdf2 100644
--- a/tensorflow/python/client/session.py
+++ b/tensorflow/python/client/session.py
@@ -619,21 +619,12 @@ class BaseSession(SessionInterface):
       self._config = None
       self._add_shapes = False
 
-    # pylint: disable=protected-access
-    # We cache _USE_C_API's value because some test cases will create a session
-    # with _USE_C_API = False but set it back to True before calling close().
-    self._created_with_new_api = ops._USE_C_API
-    # pylint: enable=protected-access
-
     self._session = None
     opts = tf_session.TF_NewSessionOptions(target=self._target, config=config)
     try:
-      if self._created_with_new_api:
-        # pylint: disable=protected-access
-        self._session = tf_session.TF_NewSession(self._graph._c_graph, opts)
-        # pylint: enable=protected-access
-      else:
-        self._session = tf_session.TF_NewDeprecatedSession(opts)
+      # pylint: disable=protected-access
+      self._session = tf_session.TF_NewSession(self._graph._c_graph, opts)
+      # pylint: enable=protected-access
     finally:
       tf_session.TF_DeleteSessionOptions(opts)
 
@@ -660,11 +651,7 @@ class BaseSession(SessionInterface):
     Returns:
       A list of devices in the session.
     """
-    if self._created_with_new_api:
-      raw_device_list = tf_session.TF_SessionListDevices(self._session)
-    else:
-      raw_device_list = tf_session.TF_DeprecatedSessionListDevices(
-          self._session)
+    raw_device_list = tf_session.TF_SessionListDevices(self._session)
     device_list = []
     size = tf_session.TF_DeviceListCount(raw_device_list)
     for i in range(size):
@@ -684,16 +671,9 @@ class BaseSession(SessionInterface):
       tf.errors.OpError: Or one of its subclasses if an error occurs while
         closing the TensorFlow session.
     """
-    if self._created_with_new_api:
-      if self._session and not self._closed:
-        self._closed = True
-        tf_session.TF_CloseSession(self._session)
-
-    else:
-      with self._extend_lock:
-        if self._opened and not self._closed:
-          self._closed = True
-          tf_session.TF_CloseDeprecatedSession(self._session)
+    if self._session and not self._closed:
+      self._closed = True
+      tf_session.TF_CloseSession(self._session)
 
   def __del__(self):
     # cleanly ignore all exceptions
@@ -703,10 +683,7 @@ class BaseSession(SessionInterface):
       pass
     if self._session is not None:
       try:
-        if self._created_with_new_api:
-          tf_session.TF_DeleteSession(self._session)
-        else:
-          tf_session.TF_DeleteDeprecatedSession(self._session)
+        tf_session.TF_DeleteSession(self._session)
       except AttributeError:
         # At shutdown, `c_api_util` or `tf_session` may have been garbage
         # collected, causing the above method calls to fail. In this case,
@@ -1005,12 +982,9 @@ class BaseSession(SessionInterface):
         try:
           subfeed_t = self.graph.as_graph_element(
               subfeed, allow_tensor=True, allow_operation=False)
-          if self._created_with_new_api:
-            # pylint: disable=protected-access
-            feed_list.append(subfeed_t._as_tf_output())
-            # pylint: enable=protected-access
-          else:
-            feed_list.append(compat.as_bytes(subfeed_t.name))
+          # pylint: disable=protected-access
+          feed_list.append(subfeed_t._as_tf_output())
+          # pylint: enable=protected-access
         except Exception as e:
           e.message = ('Cannot interpret feed_list key as Tensor: ' + e.message)
           e.args = (e.message,)
@@ -1023,22 +997,13 @@ class BaseSession(SessionInterface):
     # Set up a graph with feeds and fetches for partial run.
     def _setup_fn(session, feed_list, fetch_list, target_list):
       self._extend_graph()
-      if self._created_with_new_api:
-        return tf_session.TF_SessionPRunSetup_wrapper(
-            session, feed_list, fetch_list, target_list)
-      else:
-        with errors.raise_exception_on_not_ok_status() as status:
-          return tf_session.TF_PRunSetup(session, feed_list, fetch_list,
-                                         target_list, status)
+      return tf_session.TF_SessionPRunSetup_wrapper(
+          session, feed_list, fetch_list, target_list)
 
-    if self._created_with_new_api:
-      # pylint: disable=protected-access
-      final_fetches = [t._as_tf_output() for t in fetch_handler.fetches()]
-      final_targets = [op._c_op for op in fetch_handler.targets()]
-      # pylint: enable=protected-access
-    else:
-      final_fetches = _name_list(fetch_handler.fetches())
-      final_targets = _name_list(fetch_handler.targets())
+    # pylint: disable=protected-access
+    final_fetches = [t._as_tf_output() for t in fetch_handler.fetches()]
+    final_targets = [op._c_op for op in fetch_handler.targets()]
+    # pylint: enable=protected-access
 
     return self._do_call(_setup_fn, self._session, feed_list, final_fetches,
                          final_targets)
@@ -1196,14 +1161,10 @@ class BaseSession(SessionInterface):
 
     # Create a fetch handler to take care of the structure of fetches.
     fetch_handler = _FetchHandler(self._graph, fetches, {})
-    if self._created_with_new_api:
-      # pylint: disable=protected-access
-      fetch_list = [t._as_tf_output() for t in fetch_handler.fetches()]
-      target_list = [op._c_op for op in fetch_handler.targets()]
-      # pylint: enable=protected-access
-    else:
-      fetch_list = _name_list(fetch_handler.fetches())
-      target_list = _name_list(fetch_handler.targets())
+    # pylint: disable=protected-access
+    fetch_list = [t._as_tf_output() for t in fetch_handler.fetches()]
+    target_list = [op._c_op for op in fetch_handler.targets()]
+    # pylint: enable=protected-access
 
     def _callable_template_with_options_and_metadata(fetch_list,
                                                      target_list,
@@ -1289,16 +1250,11 @@ class BaseSession(SessionInterface):
     Raises:
       tf.errors.OpError: Or one of its subclasses on error.
     """
-    if self._created_with_new_api:
-      # pylint: disable=protected-access
-      feeds = dict((t._as_tf_output(), v) for t, v in feed_dict.items())
-      fetches = [t._as_tf_output() for t in fetch_list]
-      targets = [op._c_op for op in target_list]
-      # pylint: enable=protected-access
-    else:
-      feeds = dict((compat.as_bytes(t.name), v) for t, v in feed_dict.items())
-      fetches = _name_list(fetch_list)
-      targets = _name_list(target_list)
+    # pylint: disable=protected-access
+    feeds = dict((t._as_tf_output(), v) for t, v in feed_dict.items())
+    fetches = [t._as_tf_output() for t in fetch_list]
+    targets = [op._c_op for op in target_list]
+    # pylint: enable=protected-access
 
     def _run_fn(feed_dict, fetch_list, target_list, options, run_metadata):
       # Ensure any changes to the graph are reflected in the runtime.
@@ -1335,22 +1291,8 @@ class BaseSession(SessionInterface):
       raise type(e)(node_def, op, message)
 
   def _extend_graph(self):
-    if self._created_with_new_api:
-      with self._graph._lock:  # pylint: disable=protected-access
-        tf_session.ExtendSession(self._session)
-    else:
-      # Ensure any changes to the graph are reflected in the runtime.
-      with self._extend_lock:
-        if self._graph.version > self._current_version:
-          # pylint: disable=protected-access
-          graph_def, self._current_version = self._graph._as_graph_def(
-              from_version=self._current_version, add_shapes=self._add_shapes)
-          # pylint: enable=protected-access
-
-          with errors.raise_exception_on_not_ok_status() as status:
-            tf_session.TF_ExtendGraph(self._session,
-                                      graph_def.SerializeToString(), status)
-          self._opened = True
+    with self._graph._lock:  # pylint: disable=protected-access
+      tf_session.ExtendSession(self._session)
 
   # The threshold to run garbage collection to delete dead tensors.
   _DEAD_HANDLES_THRESHOLD = 10
@@ -1403,24 +1345,13 @@ class BaseSession(SessionInterface):
 
   def _call_tf_sessionrun(self, options, feed_dict, fetch_list, target_list,
                           run_metadata):
-    if self._created_with_new_api:
-      return tf_session.TF_SessionRun_wrapper(
-          self._session, options, feed_dict, fetch_list, target_list,
-          run_metadata)
-    else:
-      with errors.raise_exception_on_not_ok_status() as status:
-        return tf_session.TF_Run(
-            self._session, options, feed_dict, fetch_list, target_list,
-            status, run_metadata)
+    return tf_session.TF_SessionRun_wrapper(
+        self._session, options, feed_dict, fetch_list, target_list,
+        run_metadata)
 
   def _call_tf_sessionprun(self, handle, feed_dict, fetch_list):
-    if self._created_with_new_api:
-      return tf_session.TF_SessionPRun_wrapper(
-          self._session, handle, feed_dict, fetch_list)
-    else:
-      with errors.raise_exception_on_not_ok_status() as status:
-        return tf_session.TF_PRun(
-            self._session, handle, feed_dict, fetch_list, status)
+    return tf_session.TF_SessionPRun_wrapper(
+        self._session, handle, feed_dict, fetch_list)
 
   # pylint: disable=protected-access
   class _Callable(object):
@@ -1433,12 +1364,8 @@ class BaseSession(SessionInterface):
           compat.as_bytes(callable_options.SerializeToString()))
       try:
         with errors.raise_exception_on_not_ok_status() as status:
-          if session._created_with_new_api:
-            self._handle = tf_session.TF_SessionMakeCallable(
-                session._session, options_ptr, status)
-          else:
-            self._handle = tf_session.TF_DeprecatedSessionMakeCallable(
-                session._session, options_ptr, status)
+          self._handle = tf_session.TF_SessionMakeCallable(
+              session._session, options_ptr, status)
       finally:
         tf_session.TF_DeleteBuffer(options_ptr)
 
@@ -1446,12 +1373,8 @@ class BaseSession(SessionInterface):
       # TODO(b/74355905): Support argument and return value nested structures,
       # and tensor-like objects such as SparseTensors.
       with errors.raise_exception_on_not_ok_status() as status:
-        if self._session._created_with_new_api:
-          return tf_session.TF_SessionRunCallable(
-              self._session._session, self._handle, args, status, None)
-        else:
-          return tf_session.TF_DeprecatedSessionRunCallable(
-              self._session._session, self._handle, args, status, None)
+        return tf_session.TF_SessionRunCallable(
+            self._session._session, self._handle, args, status, None)
 
     def __del__(self):
       # NOTE(mrry): It is possible that `self._session.__del__()` could be
@@ -1459,12 +1382,8 @@ class BaseSession(SessionInterface):
       # will be `None`.
       if self._handle is not None and self._session._session is not None:
         with errors.raise_exception_on_not_ok_status() as status:
-          if self._session._created_with_new_api:
-            tf_session.TF_SessionReleaseCallable(
-                self._session._session, self._handle, status)
-          else:
-            tf_session.TF_DeprecatedSessionReleaseCallable(
-                self._session._session, self._handle, status)
+          tf_session.TF_SessionReleaseCallable(
+              self._session._session, self._handle, status)
   # pylint: enable=protected-access
 
   # TODO(b/74355905): Reimplement `Session.make_callable()` using this method
-- 
GitLab


From f66782cacfefedf638dc845d83629057f6d57059 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 7 Jun 2018 08:54:49 -0700
Subject: [PATCH 427/610] Add convolution and convolution1d to the public API

PiperOrigin-RevId: 199642103
---
 tensorflow/contrib/layers/__init__.py         |  2 ++
 .../contrib/layers/python/layers/layers.py    |  8 +++----
 .../layers/python/layers/layers_test.py       | 23 +++++++++++++++++++
 3 files changed, 29 insertions(+), 4 deletions(-)

diff --git a/tensorflow/contrib/layers/__init__.py b/tensorflow/contrib/layers/__init__.py
index 00f03a111a..bc33596935 100644
--- a/tensorflow/contrib/layers/__init__.py
+++ b/tensorflow/contrib/layers/__init__.py
@@ -19,6 +19,8 @@ See the @{$python/contrib.layers} guide.
 @@avg_pool2d
 @@avg_pool3d
 @@batch_norm
+@@convolution
+@@convolution1d
 @@convolution2d
 @@convolution3d
 @@conv2d_in_plane
diff --git a/tensorflow/contrib/layers/python/layers/layers.py b/tensorflow/contrib/layers/python/layers/layers.py
index b7194ae333..b6d63c9640 100644
--- a/tensorflow/contrib/layers/python/layers/layers.py
+++ b/tensorflow/contrib/layers/python/layers/layers.py
@@ -57,10 +57,10 @@ from tensorflow.python.training import moving_averages
 __all__ = [
     'avg_pool2d', 'avg_pool3d', 'batch_norm', 'bias_add', 'conv2d', 'conv3d',
     'conv2d_in_plane', 'conv2d_transpose', 'conv3d_transpose', 'convolution',
-    'convolution2d', 'convolution2d_in_plane', 'convolution2d_transpose',
-    'convolution3d', 'convolution3d_transpose', 'dense_to_sparse',
-    'dropout', 'elu', 'flatten', 'fully_connected', 'GDN', 'gdn',
-    'images_to_sequence', 'layer_norm', 'linear', 'pool', 'max_pool2d',
+    'convolution1d', 'convolution2d', 'convolution2d_in_plane',
+    'convolution2d_transpose', 'convolution3d', 'convolution3d_transpose',
+    'dense_to_sparse', 'dropout', 'elu', 'flatten', 'fully_connected', 'GDN',
+    'gdn', 'images_to_sequence', 'layer_norm', 'linear', 'pool', 'max_pool2d',
     'max_pool3d', 'one_hot_encoding', 'relu', 'relu6', 'repeat',
     'scale_gradient', 'separable_conv2d', 'separable_convolution2d',
     'sequence_to_images', 'softmax', 'spatial_softmax', 'stack', 'unit_norm',
diff --git a/tensorflow/contrib/layers/python/layers/layers_test.py b/tensorflow/contrib/layers/python/layers/layers_test.py
index b01fd5d5c9..0e8c89fe3a 100644
--- a/tensorflow/contrib/layers/python/layers/layers_test.py
+++ b/tensorflow/contrib/layers/python/layers/layers_test.py
@@ -1312,6 +1312,29 @@ class ConvolutionInPlaneTest(test.TestCase):
 
       self.assertAllClose(result, expected, rtol=1e-5, atol=1e-5)
 
+  def testConv1dShape(self):
+    width = 7
+    with self.test_session():
+      images = random_ops.random_uniform((5, width, 3), seed=1)
+      output = layers_lib.convolution1d(images, 32, 3)
+      self.assertEqual(output.op.name, 'Conv/Relu')
+      self.assertListEqual(output.get_shape().as_list(), [5, width, 32])
+
+  def testConvInferSpatialDims(self):
+    depth, height, width = 7, 9, 11
+    with self.test_session():
+      images = np.random.uniform(size=(5, width, 4)).astype(np.float32)
+      output = layers_lib.convolution(images, 32, [3])
+      self.assertListEqual(output.get_shape().as_list(), [5, width, 32])
+      images = np.random.uniform(size=(5, height, width, 4)).astype(np.float32)
+      output = layers_lib.convolution(images, 32, [3, 3])
+      self.assertListEqual(output.get_shape().as_list(), [5, height, width, 32])
+      images = np.random.uniform(size=(5, depth, height, width,
+                                       4)).astype(np.float32)
+      output = layers_lib.convolution(images, 32, [3, 3, 3])
+      self.assertListEqual(output.get_shape().as_list(),
+                           [5, depth, height, width, 32])
+
 
 class DenseToSparseTest(test.TestCase):
 
-- 
GitLab


From 93fc61ea54bbf17c7dbae189b331ce6acb44904d Mon Sep 17 00:00:00 2001
From: tucan <37643248+tucan9389@users.noreply.github.com>
Date: Fri, 8 Jun 2018 00:59:03 +0900
Subject: [PATCH 428/610] Update CONTRIBUTING.md (#19794)

Just update clang-tidy to `clang-tidy`.
---
 CONTRIBUTING.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 8669c25c45..db4b1581ae 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -90,7 +90,7 @@ Bazel BUILD files also need to include a license section, e.g.,
 Changes to TensorFlow C++ code should conform to
 [Google C++ Style Guide](https://google.github.io/styleguide/cppguide.html).
 
-Use `clang-tidy` to check your C/C++ changes. To install clang-tidy on ubuntu:16.04, do:
+Use `clang-tidy` to check your C/C++ changes. To install `clang-tidy` on ubuntu:16.04, do:
 
 ```bash
 apt-get install -y clang-tidy
-- 
GitLab


From 086d96aea3d6b3272b2746359e13f4156072ff8b Mon Sep 17 00:00:00 2001
From: Pavithra Vijay <psv@google.com>
Date: Thu, 7 Jun 2018 09:20:57 -0700
Subject: [PATCH 429/610] Fix bug due to incorrect nesting of return statement
 in eager iterator evaluation.

PiperOrigin-RevId: 199645638
---
 .../python/keras/engine/training_eager.py     | 10 ++--
 .../keras/engine/training_eager_test.py       | 54 +++++++++++++++++++
 2 files changed, 59 insertions(+), 5 deletions(-)

diff --git a/tensorflow/python/keras/engine/training_eager.py b/tensorflow/python/keras/engine/training_eager.py
index 46e0e2b476..15a7b0c0f2 100644
--- a/tensorflow/python/keras/engine/training_eager.py
+++ b/tensorflow/python/keras/engine/training_eager.py
@@ -501,11 +501,11 @@ def iterator_test_loop(model, inputs, steps, verbose=0):
     if verbose == 1:
       progbar.update(step_index + 1)
 
-    for i in range(len(outs)):
-      outs[i] /= num_samples
-    if len(outs) == 1:
-      return outs[0]
-    return outs
+  for i in range(len(outs)):
+    outs[i] /= num_samples
+  if len(outs) == 1:
+    return outs[0]
+  return outs
 
 
 def batch_test_loop(model,
diff --git a/tensorflow/python/keras/engine/training_eager_test.py b/tensorflow/python/keras/engine/training_eager_test.py
index d9446fd437..7906d208eb 100644
--- a/tensorflow/python/keras/engine/training_eager_test.py
+++ b/tensorflow/python/keras/engine/training_eager_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python import keras
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util as tf_test_util
@@ -670,6 +671,59 @@ class CorrectnessTest(test.TestCase):
     outs = model.evaluate(x, y)
     self.assertEqual(outs[1], 0.)
 
+  @tf_test_util.run_in_graph_and_eager_modes()
+  def test_loss_correctness_with_iterator(self):
+    # Test that training loss is the same in eager and graph
+    # (by comparing it to a reference value in a deterministic case)
+    model = keras.Sequential()
+    model.add(
+        keras.layers.Dense(
+            3, activation='relu', input_dim=4, kernel_initializer='ones'))
+    model.add(
+        keras.layers.Dense(2, activation='softmax', kernel_initializer='ones'))
+    model.compile(
+        loss='sparse_categorical_crossentropy',
+        optimizer=RMSPropOptimizer(learning_rate=0.001))
+    x = np.ones((100, 4), dtype=np.float32)
+    np.random.seed(123)
+    y = np.random.randint(0, 1, size=(100, 1))
+    dataset = dataset_ops.Dataset.from_tensor_slices((x, y))
+    dataset = dataset.repeat(100)
+    dataset = dataset.batch(10)
+    iterator = dataset.make_one_shot_iterator()
+    history = model.fit(iterator, epochs=1, steps_per_epoch=10)
+    self.assertEqual(np.around(history.history['loss'][-1], decimals=4), 0.6173)
+
+  @tf_test_util.run_in_graph_and_eager_modes()
+  def test_metrics_correctness_with_iterator(self):
+    model = keras.Sequential()
+    model.add(
+        keras.layers.Dense(
+            8, activation='relu', input_dim=4, kernel_initializer='ones'))
+    model.add(
+        keras.layers.Dense(1, activation='sigmoid', kernel_initializer='ones'))
+    model.compile(
+        loss='binary_crossentropy',
+        metrics=['accuracy'],
+        optimizer=RMSPropOptimizer(learning_rate=0.001))
+    np.random.seed(123)
+    x = np.random.randint(10, size=(100, 4)).astype(np.float32)
+    y = np.random.randint(2, size=(100, 1)).astype(np.float32)
+    dataset = dataset_ops.Dataset.from_tensor_slices((x, y))
+    dataset = dataset.batch(10)
+    iterator = dataset.make_one_shot_iterator()
+    outs = model.evaluate(iterator, steps=10)
+    self.assertEqual(np.around(outs[1], decimals=1), 0.5)
+
+    y = np.zeros((100, 1), dtype=np.float32)
+    dataset = dataset_ops.Dataset.from_tensor_slices((x, y))
+    dataset = dataset.repeat(100)
+    dataset = dataset.batch(10)
+    iterator = dataset.make_one_shot_iterator()
+    outs = model.evaluate(iterator, steps=10)
+    self.assertEqual(outs[1], 0.)
+
+
 if __name__ == '__main__':
   ops.enable_eager_execution()
   test.main()
-- 
GitLab


From bf1ab06311f9506f69479af47a19dd1a901bdde1 Mon Sep 17 00:00:00 2001
From: Dan Moldovan <mdan@google.com>
Date: Thu, 7 Jun 2018 09:33:06 -0700
Subject: [PATCH 430/610] Allow replace_expression to generate simple names,
 not just Expr nodes. Ensure it also resolves names, like replace.

PiperOrigin-RevId: 199647339
---
 tensorflow/contrib/autograph/pyct/templates.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/tensorflow/contrib/autograph/pyct/templates.py b/tensorflow/contrib/autograph/pyct/templates.py
index baf7923fff..9c479ebc2f 100644
--- a/tensorflow/contrib/autograph/pyct/templates.py
+++ b/tensorflow/contrib/autograph/pyct/templates.py
@@ -239,8 +239,13 @@ def replace_as_expression(template, **replacements):
     raise ValueError(
         'single expression expected; for more general templates use replace')
   node = replacement[0]
-  if not isinstance(node, gast.Expr):
-    raise ValueError(
-        'the template is expected to generate an expression node; instead '
-        'found %s' % node)
-  return node.value
+  node = qual_names.resolve(node)
+
+  if isinstance(node, gast.Expr):
+    return node.value
+  elif isinstance(node, gast.Name):
+    return node
+
+  raise ValueError(
+      'the template is expected to generate an expression or a name node;'
+      ' instead found %s' % node)
-- 
GitLab


From bff89b698d4e53f6f2f242ac9562bd1f0f12a5c2 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 7 Jun 2018 09:36:35 -0700
Subject: [PATCH 431/610] Typos in documentation and style improvements in
 tests.

PiperOrigin-RevId: 199647791
---
 tensorflow/python/ops/math_ops.py             |  6 +++---
 .../python/ops/special_math_ops_test.py       | 21 +++++++------------
 2 files changed, 11 insertions(+), 16 deletions(-)

diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py
index 3a31ef7f88..b4cedb1d46 100644
--- a/tensorflow/python/ops/math_ops.py
+++ b/tensorflow/python/ops/math_ops.py
@@ -370,7 +370,7 @@ def erf(x, name=None):
   """Computes the Gauss error function of `x` element-wise.
 
   Args:
-    x: A `Tensor` of `SparseTensor`. Must be one of the following types: `half`,
+    x: A `Tensor` or `SparseTensor`. Must be one of the following types: `half`,
       `float32`, `float64`.
     name: A name for the operation (optional).
 
@@ -2225,8 +2225,8 @@ def sigmoid(x, name=None):
   Returns:
     A Tensor with the same type as `x`.
 
-  @compatibility(numpy)
-  Equivalent to np.scipy.special.expit
+  @compatibility(scipy)
+  Equivalent to scipy.special.expit
   @end_compatibility
   """
   with ops.name_scope(name, "Sigmoid", [x]) as name:
diff --git a/tensorflow/python/ops/special_math_ops_test.py b/tensorflow/python/ops/special_math_ops_test.py
index d7c3a7e8dc..6118b54293 100644
--- a/tensorflow/python/ops/special_math_ops_test.py
+++ b/tensorflow/python/ops/special_math_ops_test.py
@@ -285,8 +285,8 @@ class EinsumTest(test.TestCase):
     correct_value = np.einsum(axes, *input_vals)
 
     err = np.abs(correct_value - output_value).max()
-    print(axes, err)
-    assert err < 1e-8
+    # print(axes, err)
+    self.assertLess(err, 1e-8)
 
   def test_input_is_placeholder(self):
     with ops.Graph().as_default():
@@ -298,8 +298,7 @@ class EinsumTest(test.TestCase):
             m0: [[1, 2, 3]],
             m1: [[2], [1], [1]],
         }
-        np.testing.assert_almost_equal([[7]], sess.run(
-            out, feed_dict=feed_dict))
+        self.assertAllClose([[7]], sess.run(out, feed_dict=feed_dict))
 
     with ops.Graph().as_default():
       m0 = array_ops.placeholder(dtypes.int32, shape=(None, 3))
@@ -310,7 +309,7 @@ class EinsumTest(test.TestCase):
             m0: [[1, 2, 3]],
             m1: [2, 1, 1],
         }
-        np.testing.assert_almost_equal([7], sess.run(out, feed_dict=feed_dict))
+        self.assertAllClose([7], sess.run(out, feed_dict=feed_dict))
 
     # Tests for placeholders which have two or more None values
     with ops.Graph().as_default():
@@ -322,8 +321,7 @@ class EinsumTest(test.TestCase):
             m0: [[[1, 2]]],
             m1: [[3], [2]],
         }
-        np.testing.assert_almost_equal([[[7]]],
-                                       sess.run(out, feed_dict=feed_dict))
+        self.assertAllClose([[[7]]], sess.run(out, feed_dict=feed_dict))
 
     with ops.Graph().as_default():
       m0 = array_ops.placeholder(dtypes.int32, shape=(2, 1))
@@ -334,8 +332,7 @@ class EinsumTest(test.TestCase):
             m0: [[3], [2]],
             m1: [[[1, 2]]],
         }
-        np.testing.assert_almost_equal([[[7]]],
-                                       sess.run(out, feed_dict=feed_dict))
+        self.assertAllClose([[[7]]], sess.run(out, feed_dict=feed_dict))
 
     with ops.Graph().as_default():
       m0 = array_ops.placeholder(dtypes.int32, shape=(None, None, 2))
@@ -346,8 +343,7 @@ class EinsumTest(test.TestCase):
             m0: [[[1, 2]]],
             m1: [3, 2],
         }
-        np.testing.assert_almost_equal([[7]], sess.run(
-            out, feed_dict=feed_dict))
+        self.assertAllClose([[7]], sess.run(out, feed_dict=feed_dict))
 
     with ops.Graph().as_default():
       m0 = array_ops.placeholder(dtypes.int32, shape=(None, 2, None, 2))
@@ -358,8 +354,7 @@ class EinsumTest(test.TestCase):
             m0: [[[[1, 2]], [[2, 1]]]],
             m1: [[3, 2]],
         }
-        np.testing.assert_almost_equal([[[7, 8]]],
-                                       sess.run(out, feed_dict=feed_dict))
+        self.assertAllClose([[[7, 8]]], sess.run(out, feed_dict=feed_dict))
 
 
 if __name__ == '__main__':
-- 
GitLab


From a3c46fc0fc519eaad0ac5331867cd097ad1a9d32 Mon Sep 17 00:00:00 2001
From: Nupur Garg <nupurgarg@google.com>
Date: Thu, 7 Jun 2018 09:49:18 -0700
Subject: [PATCH 432/610] Change unimplemented ops error message.

PiperOrigin-RevId: 199649736
---
 tensorflow/contrib/lite/toco/tflite/export.cc | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/lite/toco/tflite/export.cc b/tensorflow/contrib/lite/toco/tflite/export.cc
index 5daa703c80..a2d753657b 100644
--- a/tensorflow/contrib/lite/toco/tflite/export.cc
+++ b/tensorflow/contrib/lite/toco/tflite/export.cc
@@ -316,6 +316,7 @@ void Export(
   auto op_codes = ExportOperatorCodes(model, ops_by_type, operators_map,
                                       &builder, &error_summary);
   const string fake_quant_operation_name = "FAKE_QUANT";
+
   if (error_summary.count(fake_quant_operation_name) != 0) {
     LOG(ERROR)
         << fake_quant_operation_name
@@ -327,6 +328,21 @@ void Export(
     error_summary.erase(fake_quant_operation_name);
   }
   if (!allow_custom_ops && !error_summary.empty()) {
+    // Remove ExpandDims and ReorderAxes from unimplemented list unless they
+    // compose the list. Both ops are removed during graph transformations.
+    // However, if an op is unimplemented earlier in the model, the graph
+    // transformation is unable to run because the output shape is not defined.
+    // This causes unnecessary confusion during model conversion time.
+    std::set<string> error_summary_final;
+    for (const auto& op_type : error_summary) {
+      if (op_type != "ReorderAxes" && op_type != "ExpandDims") {
+        error_summary_final.insert(op_type);
+      }
+    }
+    if (error_summary_final.empty()) {
+      error_summary_final = error_summary;
+    }
+
     LOG(QFATAL)
         << "Some of the operators in the model are not supported by "
            "the standard TensorFlow Lite runtime. If you have a custom "
@@ -334,7 +350,7 @@ void Export(
            "--allow_custom_ops, or by setting allow_custom_ops=True "
            "when calling tf.contrib.lite.toco_convert(). Here is a list "
            "of operators for which  you will need custom implementations: "
-        << absl::StrJoin(error_summary, ", ") << ".";
+        << absl::StrJoin(error_summary_final, ", ") << ".";
   }
 
   auto ops =
-- 
GitLab


From 796fff865013f964e85c134dddf6f1f49574bd72 Mon Sep 17 00:00:00 2001
From: Benjamin Kramer <kramerb@google.com>
Date: Thu, 7 Jun 2018 09:59:20 -0700
Subject: [PATCH 433/610] [XLA:GPU] Fix non-const reduce init value generation
 to handle multi-output fusion

This was incorrectly trying to initialize the entire tuple output, which CHECK fails.

PiperOrigin-RevId: 199651315
---
 .../xla/service/gpu/ir_emitter_unnested.cc    | 20 +++++++----
 .../xla/tests/multioutput_fusion_test.cc      | 34 +++++++++++++++++++
 2 files changed, 47 insertions(+), 7 deletions(-)

diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
index 06fc3f8eea..ed005f6afc 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
@@ -2557,13 +2557,19 @@ StatusOr<std::unique_ptr<Thunk>> IrEmitterUnnested::BuildInitializerThunk(
 
   // Otherwise fall back to our slow initializer code.
   std::unique_ptr<KernelThunk> kernel_thunk = BuildKernelThunk(hlo);
-  TF_RETURN_IF_ERROR(EmitTargetElementLoopInThunk(
-      *hlo,
-      [=](const llvm_ir::IrArray::Index& index) {
-        return GetIrArray(*init_value, *hlo)
-            .EmitReadArrayElement(index, &ir_builder_);
-      },
-      kernel_thunk.get()));
+  LaunchDimensions launch_dimensions =
+      CalculateLaunchDimensions(ShapeUtil::GetSubshape(hlo->shape(), index),
+                                ir_emitter_context_->device_description());
+  UpdateLaunchDimensions(launch_dimensions, kernel_thunk.get(),
+                         ir_emitter_context_->llvm_module());
+  TF_RETURN_IF_ERROR(ParallelLoopEmitter(
+                         [=](const llvm_ir::IrArray::Index& index) {
+                           return GetIrArray(*init_value, *hlo)
+                               .EmitReadArrayElement(index, &ir_builder_);
+                         },
+                         GetIrArray(*hlo, *hlo, index), launch_dimensions,
+                         &ir_builder_)
+                         .EmitLoop(IrName(hlo)));
 
   // Clean up state left behind by emitting the loop above.  (This is normally
   // done in IrEmitterUnnested::Postprocess().)
diff --git a/tensorflow/compiler/xla/tests/multioutput_fusion_test.cc b/tensorflow/compiler/xla/tests/multioutput_fusion_test.cc
index f1d33a280d..41f723edf1 100644
--- a/tensorflow/compiler/xla/tests/multioutput_fusion_test.cc
+++ b/tensorflow/compiler/xla/tests/multioutput_fusion_test.cc
@@ -480,5 +480,39 @@ XLA_TEST_F(MultiOutputFusionTest,
               {{{5, 10}, {15, 20}}, {{25, 30}, {35, 40}}}))));
 }
 
+XLA_TEST_F(MultiOutputFusionTest,
+           DISABLED_ON_CPU(MultiOutputReduceFusionNonConstInit)) {
+  const string testcase = tensorflow::strings::StrCat(kScalarOps, R"(
+    fused_reduce {
+      p0 = f32[2,2,2]{2,1,0} parameter(0)
+      init1 = f32[] parameter(1)
+      init2 = f32[] parameter(2)
+      r1 = f32[2,2]{1,0} reduce(p0, init1), dimensions={2}, to_apply=Add
+      r2 = f32[2,2]{1,0} reduce(p0, init2), dimensions={2}, to_apply=Max
+      ROOT tuple = (f32[2,2]{1,0}, f32[2,2]{1,0}) tuple(r1, r2)
+    }
+
+    ENTRY reduce {
+      p = f32[2,2,2]{2,1,0} parameter(0)
+      i = f32[] parameter(1)
+      j = f32[] parameter(2)
+      ROOT fusion = (f32[2,2]{1,0}, f32[2,2]{1,0}) fusion(p, i, j), kind=kInput,
+                                                              calls=fused_reduce
+    })");
+  auto module =
+      HloRunner::CreateModuleFromString(testcase, GetDebugOptionsForTest())
+          .ValueOrDie();
+  auto param = Literal::CreateR3<float>({{{0, 2}, {3, 4}}, {{5, 6}, {7, 8}}});
+  auto init1 = Literal::CreateR0<float>(5);
+  auto init2 = Literal::CreateR0<float>(6);
+  TF_ASSERT_OK_AND_ASSIGN(
+      auto result,
+      Execute(std::move(module), {param.get(), init1.get(), init2.get()}));
+  EXPECT_TRUE(LiteralTestUtil::Equal(
+      *result, *Literal::MakeTupleOwned(
+                   Literal::CreateR2<float>({{167, 172}, {176, 180}}),
+                   Literal::CreateR2<float>({{6, 6}, {6, 8}}))));
+}
+
 }  // namespace
 }  // namespace xla
-- 
GitLab


From cd25a9544915654022e2cfff4923c31822166112 Mon Sep 17 00:00:00 2001
From: Nupur Garg <nupurgarg@google.com>
Date: Thu, 7 Jun 2018 10:38:50 -0700
Subject: [PATCH 434/610] Updated SavedModels in Python TOCO API.

PiperOrigin-RevId: 199658431
---
 tensorflow/contrib/lite/python/BUILD          |  3 +-
 .../lite/python/convert_saved_model.py        | 31 ++++++-------------
 tensorflow/contrib/lite/python/lite.py        |  2 +-
 tensorflow/contrib/lite/python/lite_test.py   |  2 +-
 .../contrib/lite/python/tflite_convert.py     |  2 +-
 5 files changed, 15 insertions(+), 25 deletions(-)

diff --git a/tensorflow/contrib/lite/python/BUILD b/tensorflow/contrib/lite/python/BUILD
index 7e6ff6c0a8..27909a9458 100644
--- a/tensorflow/contrib/lite/python/BUILD
+++ b/tensorflow/contrib/lite/python/BUILD
@@ -57,8 +57,9 @@ py_library(
         ":interpreter",
         ":lite_constants",
         ":op_hint",
-        "//tensorflow/contrib/saved_model:saved_model_py",
         "//tensorflow/python:graph_util",
+        "//tensorflow/python/saved_model:constants",
+        "//tensorflow/python/saved_model:loader",
         "//tensorflow/python/tools:freeze_graph_lib",
     ],
 )
diff --git a/tensorflow/contrib/lite/python/convert_saved_model.py b/tensorflow/contrib/lite/python/convert_saved_model.py
index 5dad49f1ed..1553464b9f 100644
--- a/tensorflow/contrib/lite/python/convert_saved_model.py
+++ b/tensorflow/contrib/lite/python/convert_saved_model.py
@@ -19,13 +19,12 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib.lite.python.convert import tensor_name
-from tensorflow.contrib.saved_model.python.saved_model import reader
-from tensorflow.contrib.saved_model.python.saved_model import signature_def_utils
 from tensorflow.core.framework import types_pb2
 from tensorflow.python.client import session
 from tensorflow.python.framework import graph_util as tf_graph_util
 from tensorflow.python.framework import ops
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.saved_model import constants
 from tensorflow.python.saved_model import loader
 
 
@@ -58,21 +57,8 @@ def _get_meta_graph_def(saved_model_dir, tag_set):
   Raises:
     ValueError: No valid MetaGraphDef for given tag_set.
   """
-  saved_model = reader.read_saved_model(saved_model_dir)
-  tag_sets = []
-  result_meta_graph_def = None
-  for meta_graph_def in saved_model.meta_graphs:
-    meta_graph_tag_set = set(meta_graph_def.meta_info_def.tags)
-    tag_sets.append(meta_graph_tag_set)
-    if meta_graph_tag_set == tag_set:
-      result_meta_graph_def = meta_graph_def
-  logging.info("The given saved_model contains the following tags: %s",
-               tag_sets)
-  if result_meta_graph_def is not None:
-    return result_meta_graph_def
-  else:
-    raise ValueError("No valid MetaGraphDef for this tag_set '{}'. Possible "
-                     "values are '{}'. ".format(tag_set, tag_sets))
+  with session.Session(graph=ops.Graph()) as sess:
+    return loader.load(sess, tag_set, saved_model_dir)
 
 
 def _get_signature_def(meta_graph, signature_key):
@@ -97,9 +83,7 @@ def _get_signature_def(meta_graph, signature_key):
     raise ValueError("No '{}' in the SavedModel\'s SignatureDefs. Possible "
                      "values are '{}'.".format(signature_key,
                                                ",".join(signature_def_keys)))
-  signature_def = signature_def_utils.get_signature_def_by_key(
-      meta_graph, signature_key)
-  return signature_def
+  return signature_def_map[signature_key]
 
 
 def _get_inputs_outputs(signature_def):
@@ -247,6 +231,7 @@ def freeze_saved_model(saved_model_dir, input_arrays, input_shapes,
     ValueError:
       SavedModel doesn't contain a MetaGraphDef identified by tag_set.
       signature_key is not in the MetaGraphDef.
+      assets/ directory is in the MetaGraphDef.
       input_shapes does not match the length of input_arrays.
       input_arrays or output_arrays are not valid.
   """
@@ -255,9 +240,13 @@ def freeze_saved_model(saved_model_dir, input_arrays, input_shapes,
   signature_def = _get_signature_def(meta_graph, signature_key)
   inputs, outputs = _get_inputs_outputs(signature_def)
 
+  # Check SavedModel for assets directory.
+  collection_def = meta_graph.collection_def
+  if constants.ASSETS_KEY in collection_def:
+    raise ValueError("SavedModels with assets/ directory are not supported.")
+
   graph = ops.Graph()
   with session.Session(graph=graph) as sess:
-    # TODO(nupurgarg): Throw ValueError if SavedModel has assets/ directory.
     loader.load(sess, meta_graph.meta_info_def.tags, saved_model_dir)
 
     # Gets input and output tensors.
diff --git a/tensorflow/contrib/lite/python/lite.py b/tensorflow/contrib/lite/python/lite.py
index 253e3f72b1..e3a2d19e05 100644
--- a/tensorflow/contrib/lite/python/lite.py
+++ b/tensorflow/contrib/lite/python/lite.py
@@ -207,7 +207,7 @@ class TocoConverter(object):
 
       # Check if graph is frozen.
       if not _is_frozen_graph(sess):
-        raise ValueError("Please freeze the graph using freeze_graph.py")
+        raise ValueError("Please freeze the graph using freeze_graph.py.")
 
       # Create TocoConverter class.
       return cls(sess.graph_def, input_tensors, output_tensors)
diff --git a/tensorflow/contrib/lite/python/lite_test.py b/tensorflow/contrib/lite/python/lite_test.py
index bbb00021f9..b04caaf263 100644
--- a/tensorflow/contrib/lite/python/lite_test.py
+++ b/tensorflow/contrib/lite/python/lite_test.py
@@ -401,7 +401,7 @@ class FromFrozenGraphFile(test_util.TensorFlowTestCase):
     with self.assertRaises(ValueError) as error:
       lite.TocoConverter.from_frozen_graph(graph_def_file, ['Placeholder'],
                                            ['add'])
-    self.assertEqual('Please freeze the graph using freeze_graph.py',
+    self.assertEqual('Please freeze the graph using freeze_graph.py.',
                      str(error.exception))
 
   def testPbtxt(self):
diff --git a/tensorflow/contrib/lite/python/tflite_convert.py b/tensorflow/contrib/lite/python/tflite_convert.py
index 2b7ad29a27..4c215b62b2 100644
--- a/tensorflow/contrib/lite/python/tflite_convert.py
+++ b/tensorflow/contrib/lite/python/tflite_convert.py
@@ -114,7 +114,7 @@ def _convert_model(flags):
                        "--input_arrays must be present when specifying "
                        "--std_dev_values and --mean_values with multiple input "
                        "tensors in order to map between names and "
-                       "values".format(",".join(input_arrays)))
+                       "values.".format(",".join(input_arrays)))
     converter.quantized_input_stats = dict(zip(input_arrays, quant_stats))
   if flags.default_ranges_min and flags.default_ranges_max:
     converter.default_ranges_stats = (flags.default_ranges_min,
-- 
GitLab


From 1da05443167eebcfd31b8d00b2bb84dfceb84812 Mon Sep 17 00:00:00 2001
From: Jacques Pienaar <jpienaar@google.com>
Date: Thu, 7 Jun 2018 10:55:29 -0700
Subject: [PATCH 435/610] Handle tensor array grad only accessed in one branch.

Previously, recompilation due to tensor array gradients in branches wasn't handled correctly.

PiperOrigin-RevId: 199661353
---
 tensorflow/compiler/tf2xla/kernels/if_op.cc | 30 ++++++++++++++++++---
 1 file changed, 27 insertions(+), 3 deletions(-)

diff --git a/tensorflow/compiler/tf2xla/kernels/if_op.cc b/tensorflow/compiler/tf2xla/kernels/if_op.cc
index 8b9b026643..d48c6eea75 100644
--- a/tensorflow/compiler/tf2xla/kernels/if_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/if_op.cc
@@ -48,11 +48,11 @@ void XlaIfOp::Compile(XlaOpKernelContext* ctx) {
 
   VLOG(1) << "Building If: " << input_types_.size() << " inputs";
 
-  std::vector<xla::XlaOp> inputs(input_types_.size());
   std::vector<XlaCompiler::Argument> arguments(input_types_.size());
   for (int i = 0; i < input_types_.size(); ++i) {
     XlaCompiler::Argument& arg = arguments[i];
     DataType type = ctx->input_type(i + 1);
+
     if (type == DT_RESOURCE) {
       XlaResource* resource;
       OP_REQUIRES_OK(ctx, ctx->GetResourceInput(i + 1, &resource));
@@ -60,7 +60,6 @@ void XlaIfOp::Compile(XlaOpKernelContext* ctx) {
       arg.initialized = resource->initialized();
       arg.kind = XlaCompiler::Argument::kResource;
       arg.resource_kind = resource->kind();
-      OP_REQUIRES_OK(ctx, resource->Pack(&inputs[i], b));
 
       arg.type = resource->type();
       arg.shape = resource->shape();
@@ -79,7 +78,6 @@ void XlaIfOp::Compile(XlaOpKernelContext* ctx) {
       arg.kind = XlaCompiler::Argument::kParameter;
       arg.type = input_types_[i];
       arg.shape = ctx->InputShape(i + 1);
-      inputs[i] = ctx->Input(i + 1);
       VLOG(2) << "Arg type: " << DataTypeString(arg.type)
               << " shape: " << arg.shape.DebugString();
     }
@@ -100,6 +98,7 @@ void XlaIfOp::Compile(XlaOpKernelContext* ctx) {
   OP_REQUIRES_OK(ctx, compiler->CompileFunction(options, else_branch_,
                                                 arguments, &else_result));
 
+  bool has_tensor_array_gradients = false;
   for (XlaCompiler::CompilationResult* result : {&then_result, &else_result}) {
     for (const XlaCompiler::ResourceUpdate& update : result->resource_updates) {
       XlaResource* resource;
@@ -121,9 +120,21 @@ void XlaIfOp::Compile(XlaOpKernelContext* ctx) {
       for (const auto& gradient : resource->tensor_array_gradients()) {
         arg.tensor_array_gradients.insert(gradient.first);
       }
+      if (!resource->tensor_array_gradients().empty())
+        has_tensor_array_gradients = true;
     }
   }
 
+  // Recompile the functions to update the argument shapes for tensor arrays.
+  if (has_tensor_array_gradients) {
+    then_result = {};
+    OP_REQUIRES_OK(ctx, compiler->CompileFunction(options, then_branch_,
+                                                  arguments, &then_result));
+    else_result = {};
+    OP_REQUIRES_OK(ctx, compiler->CompileFunction(options, else_branch_,
+                                                  arguments, &else_result));
+  }
+
   // Check that both branches have identical input shapes.
   OP_REQUIRES(ctx, then_result.xla_input_shapes.size() == 1,
               errors::FailedPrecondition("Expected one input shape"));
@@ -175,6 +186,19 @@ void XlaIfOp::Compile(XlaOpKernelContext* ctx) {
             "Mismatch in resource of then and else branch for resource ", i));
   }
 
+  int num_inputs = then_result.input_mapping.size();
+  std::vector<xla::XlaOp> inputs(num_inputs);
+  for (int i = 0; i < num_inputs; ++i) {
+    int input_num = then_result.input_mapping[i] + 1;
+    if (ctx->input_type(input_num) == DT_RESOURCE) {
+      XlaResource* resource;
+      OP_REQUIRES_OK(ctx, ctx->GetResourceInput(input_num, &resource));
+      OP_REQUIRES_OK(ctx, resource->Pack(&inputs[i], b));
+    } else {
+      inputs[i] = ctx->Input(i + 1);
+    }
+  }
+
   xla::XlaOp outputs =
       b->Conditional(ctx->Input(0), b->Tuple(inputs), *then_result.computation,
                      b->Tuple(inputs), *else_result.computation);
-- 
GitLab


From 0ea841d4bb79b0322dccad73728e428854d1aed2 Mon Sep 17 00:00:00 2001
From: Sanjoy Das <sanjoy@google.com>
Date: Thu, 7 Jun 2018 11:00:50 -0700
Subject: [PATCH 436/610] [TF:XLA] Bump open source llvm revision to r334038

PiperOrigin-RevId: 199662287
---
 tensorflow/workspace.bzl | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index e66af3c8bc..b007d3f597 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -452,11 +452,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   tf_http_archive(
       name = "llvm",
       urls = [
-          "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/40c66c3d40377cf85640b3a35e6ec5c5b1cbc41f.tar.gz",
-          "https://github.com/llvm-mirror/llvm/archive/40c66c3d40377cf85640b3a35e6ec5c5b1cbc41f.tar.gz",
+          "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/7488dbc1218de926f3de0e9bb3d465f3bbe5b80e.tar.gz",
+          "https://github.com/llvm-mirror/llvm/archive/7488dbc1218de926f3de0e9bb3d465f3bbe5b80e.tar.gz",
       ],
-      sha256 = "6f782a0d2e9d7946bdf20807e0fcd8f5eaed8afd93bdd610cdefbe9435ca551f",
-      strip_prefix = "llvm-40c66c3d40377cf85640b3a35e6ec5c5b1cbc41f",
+      sha256 = "dd4a2e2a4f21ab69cf99534bcb2739c04fc12d12b63e5e3d8f2b85a2eb55d5d1",
+      strip_prefix = "llvm-7488dbc1218de926f3de0e9bb3d465f3bbe5b80e",
       build_file = clean_dep("//third_party/llvm:llvm.BUILD"),
   )
 
-- 
GitLab


From 1485d75eb98d40d3770f0d3a850bc349e274b099 Mon Sep 17 00:00:00 2001
From: Sanjoy Das <sanjoy@google.com>
Date: Thu, 7 Jun 2018 11:09:08 -0700
Subject: [PATCH 437/610] Iterate over the K dimension in the innermost loop
 nest in the LLVM IR GEMM

This by itself does not improve performance with the current tile sizes, but it
shows improvement with larger tiles (CL upcoming).

PiperOrigin-RevId: 199663960
---
 tensorflow/compiler/xla/service/cpu/BUILD     |  1 +
 .../xla/service/cpu/dot_op_emitter.cc         | 62 +++++++++++--------
 .../xla/service/cpu/vector_support_library.cc | 22 +++++++
 .../xla/service/cpu/vector_support_library.h  | 16 +++++
 4 files changed, 74 insertions(+), 27 deletions(-)

diff --git a/tensorflow/compiler/xla/service/cpu/BUILD b/tensorflow/compiler/xla/service/cpu/BUILD
index f10d71fdba..d82922a359 100644
--- a/tensorflow/compiler/xla/service/cpu/BUILD
+++ b/tensorflow/compiler/xla/service/cpu/BUILD
@@ -882,6 +882,7 @@ cc_library(
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/service/llvm_ir:llvm_util",
+        "//tensorflow/core:lib",
         "@llvm//:core",
         "@llvm//:support",
     ],
diff --git a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
index cda623f8e8..fe4ba2a070 100644
--- a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
@@ -740,7 +740,7 @@ class MatrixMatrixBlockPanelEmitter {
  private:
   // The HandleResiduesOnX helpers split the iteration space for dimension X
   // into a multiple of the tile size on dimension X and an epilogue.  These
-  // helpers ultimately call into `EmitTiledReductionLoop` for emitting the
+  // helpers ultimately call into `EmitTiledGemm` for emitting the
   // tiled GEMM kernel.
 
   void HandleResiduesOnN();
@@ -750,15 +750,13 @@ class MatrixMatrixBlockPanelEmitter {
                          llvm::Value* k_start, llvm::Value* k_end,
                          llvm::Value* n_start, llvm::Value* n_end);
 
-  // This emits the inner reduction loop.  This inner reduction loop multiplies
-  // a tile from the LHS of size [tile_size_m,tile_size_k] and a tile from the
-  // RHS of size [`tile_size_k`, vls->vector_width()] to update a tile of size
-  // [`tile_size_m`, vls->vector_width()] in the result.
-  void EmitTiledReductionLoop(VectorSupportLibrary* vsl, int64 tile_size_k,
-                              llvm::Value* k_start, llvm::Value* k_end,
-                              llvm::Value* n_start, llvm::Value* n_end,
-                              int64 tile_size_m, llvm::Value* m_start,
-                              llvm::Value* m_end);
+  // This emits a tiled GEMM kernel.  For a detailed description see the comment
+  // on the implementation.
+  void EmitTiledGemm(VectorSupportLibrary* vsl, int64 tile_size_k,
+                     llvm::Value* k_start, llvm::Value* k_end,
+                     llvm::Value* n_start, llvm::Value* n_end,
+                     int64 tile_size_m, llvm::Value* m_start,
+                     llvm::Value* m_end);
 
   llvm::Value* GetInt64(int64 value) { return ir_builder_->getInt64(value); }
 
@@ -848,16 +846,24 @@ void MatrixMatrixBlockPanelEmitter::HandleResiduesOnM(
     VectorSupportLibrary* vsl, int64 tile_size_k, llvm::Value* k_start,
     llvm::Value* k_end, llvm::Value* n_start, llvm::Value* n_end) {
   const int64 m_end = dims().m() - dims().m() % tile_size_m();
-  EmitTiledReductionLoop(vsl, tile_size_k, k_start, k_end, n_start, n_end,
-                         tile_size_m(), GetInt64(0), GetInt64(m_end));
+  EmitTiledGemm(vsl, tile_size_k, k_start, k_end, n_start, n_end, tile_size_m(),
+                GetInt64(0), GetInt64(m_end));
 
   if (m_end != dims().m()) {
-    EmitTiledReductionLoop(vsl, tile_size_k, k_start, k_end, n_start, n_end,
-                           dims().m() - m_end, GetInt64(m_end),
-                           GetInt64(dims().m()));
+    EmitTiledGemm(vsl, tile_size_k, k_start, k_end, n_start, n_end,
+                  dims().m() - m_end, GetInt64(m_end), GetInt64(dims().m()));
   }
 }
 
+// The loop structure is:
+//
+// Iterate over dimension M as m:
+//   Iterate over dimension N as n:
+//     Iterate over dimension K as k:
+//       OutputTile[m,n] += Dot(LhsTile[m,k], RhsTile[k,n])
+//
+// I.e. a just a tiled version of a "naive" GEMM.
+//
 // The tiling scheme is as follows:
 //
 // Let the LHS be:
@@ -919,7 +925,7 @@ void MatrixMatrixBlockPanelEmitter::HandleResiduesOnM(
 //   +-------------------+-------------------+-------------------+---------
 //   | a0*p0+b0*q0+c0*r0 | a0*p1+b0*q1+c0*r1 | a0*p2+b0*q2+c0*r2 |  ...
 //   +-------------------+-------------------+-------------------+---------
-void MatrixMatrixBlockPanelEmitter::EmitTiledReductionLoop(
+void MatrixMatrixBlockPanelEmitter::EmitTiledGemm(
     VectorSupportLibrary* vsl, int64 tile_size_k, llvm::Value* k_start,
     llvm::Value* k_end, llvm::Value* n_start, llvm::Value* n_end,
     int64 tile_size_m, llvm::Value* m_start, llvm::Value* m_end) {
@@ -933,16 +939,16 @@ void MatrixMatrixBlockPanelEmitter::EmitTiledReductionLoop(
                                /*major_dim_offset=*/m_i,
                                /*tile_size_along_major_dim=*/tile_size_m);
 
-    ksl_.For("dot.k", k_start, k_end, tile_size_k, [&](llvm::Value* k_i) {
-      MemoryTile rhs_memory_tile(vsl, ir_builder_, rhs_, dims().n(), k_i,
-                                 tile_size_k);
-      std::vector<std::vector<llvm::Value*>> lhs_tile =
-          lhs_memory_tile.LoadBroadcastTile(k_i, tile_size_k);
-      ksl_.For(
-          "dot.n", n_start, n_end, vsl->vector_size(), [&](llvm::Value* n_i) {
+    ksl_.For(
+        "dot.n", n_start, n_end, vsl->vector_size(), [&](llvm::Value* n_i) {
+          TileVariable result_tile_var(vsl, result_memory_tile.LoadTile(n_i));
+          ksl_.For("dot.k", k_start, k_end, tile_size_k, [&](llvm::Value* k_i) {
+            MemoryTile rhs_memory_tile(vsl, ir_builder_, rhs_, dims().n(), k_i,
+                                       tile_size_k);
+            std::vector<std::vector<llvm::Value*>> lhs_tile =
+                lhs_memory_tile.LoadBroadcastTile(k_i, tile_size_k);
             std::vector<llvm::Value*> rhs_tile = rhs_memory_tile.LoadTile(n_i);
-            std::vector<llvm::Value*> result_tile =
-                result_memory_tile.LoadTile(n_i);
+            std::vector<llvm::Value*> result_tile = result_tile_var.Get();
             for (int64 r_m_i = 0; r_m_i < tile_size_m; r_m_i++) {
               for (int64 r_k_i = 0; r_k_i < tile_size_k; r_k_i++) {
                 result_tile[r_m_i] =
@@ -950,9 +956,11 @@ void MatrixMatrixBlockPanelEmitter::EmitTiledReductionLoop(
                                 result_tile[r_m_i]);
               }
             }
-            result_memory_tile.StoreTile(result_tile, n_i);
+            result_tile_var.Set(result_tile);
           });
-    });
+
+          result_memory_tile.StoreTile(result_tile_var.Get(), n_i);
+        });
   });
 }
 
diff --git a/tensorflow/compiler/xla/service/cpu/vector_support_library.cc b/tensorflow/compiler/xla/service/cpu/vector_support_library.cc
index cd1165e238..c444d15185 100644
--- a/tensorflow/compiler/xla/service/cpu/vector_support_library.cc
+++ b/tensorflow/compiler/xla/service/cpu/vector_support_library.cc
@@ -427,5 +427,27 @@ llvm::Value* LlvmVariable::Get() const {
 void LlvmVariable::Set(llvm::Value* new_value) {
   ir_builder_->CreateStore(new_value, alloca_);
 }
+
+TileVariable::TileVariable(VectorSupportLibrary* vector_support,
+                           std::vector<llvm::Value*> initial_value) {
+  for (llvm::Value* initial_vector_value : initial_value) {
+    storage_.emplace_back(vector_support, initial_vector_value);
+  }
+}
+
+std::vector<llvm::Value*> TileVariable::Get() const {
+  std::vector<llvm::Value*> result;
+  c_transform(storage_, std::back_inserter(result),
+              [&](VectorVariable vect_var) { return vect_var.Get(); });
+  return result;
+}
+
+void TileVariable::Set(tensorflow::gtl::ArraySlice<llvm::Value*> value) {
+  CHECK_EQ(value.size(), storage_.size());
+  for (int64 i = 0, e = value.size(); i < e; i++) {
+    storage_[i].Set(value[i]);
+  }
+}
+
 }  // namespace cpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/vector_support_library.h b/tensorflow/compiler/xla/service/cpu/vector_support_library.h
index edcaec5849..49c2a4e2f4 100644
--- a/tensorflow/compiler/xla/service/cpu/vector_support_library.h
+++ b/tensorflow/compiler/xla/service/cpu/vector_support_library.h
@@ -23,6 +23,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/primitive_util.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
 
 namespace xla {
 namespace cpu {
@@ -317,6 +318,21 @@ class ScalarVariable : public LlvmVariable {
     Set(initial_value);
   }
 };
+
+// This wraps a set of alloca-backed stack variables that can, as a whole, store
+// a tile.  A "tile" is a sequence of vectors that is typically used as a 2D
+// grid of scalar values (e.g. for tiled GEMMs).
+class TileVariable {
+ public:
+  TileVariable(VectorSupportLibrary* vector_support,
+               std::vector<llvm::Value*> initial_value);
+
+  std::vector<llvm::Value*> Get() const;
+  void Set(tensorflow::gtl::ArraySlice<llvm::Value*> value);
+
+ private:
+  std::vector<VectorVariable> storage_;
+};
 }  // namespace cpu
 }  // namespace xla
 
-- 
GitLab


From 4d47e9bc927ed29918a5524bfebe6075a4dccfb9 Mon Sep 17 00:00:00 2001
From: Sanjoy Das <sanjoy@google.com>
Date: Thu, 7 Jun 2018 11:34:34 -0700
Subject: [PATCH 438/610] Tune the GEMM tile size for broadwell

PiperOrigin-RevId: 199668758
---
 tensorflow/compiler/xla/service/cpu/dot_op_emitter.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.h b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.h
index 2effb7fc36..ed2a18976a 100644
--- a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.h
+++ b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.h
@@ -144,8 +144,12 @@ class DotOpEmitter {
   }
 
   std::tuple<int64, int64, int64> GetGemmTileSize() const {
+    // Tuned for broadwell - Intel(R) Xeon(R) CPU E5-2690 v4 @ 2.60GHz
+    //
+    // TODO(b/80093688): Tune for other architectures and centralize this
+    // information in one place.
     const std::tuple<int64, int64, int64> kDefaultTileSize =
-        std::tuple<int64, int64, int64>(3, 5, 1);
+        std::tuple<int64, int64, int64>(11, 9, 1);
     return options::LlvmIrGemmTileSize(hlo_module_config_)
         .value_or(kDefaultTileSize);
   }
-- 
GitLab


From e343b8072833765c85a5685b0f56b1b3d6add275 Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Thu, 7 Jun 2018 11:36:47 -0700
Subject: [PATCH 439/610] Don't use `std::move()` on `const ...&` arguments.

PiperOrigin-RevId: 199669177
---
 tensorflow/core/kernels/data/iterator_ops.cc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tensorflow/core/kernels/data/iterator_ops.cc b/tensorflow/core/kernels/data/iterator_ops.cc
index 9d9e74adba..d71cac4ebc 100644
--- a/tensorflow/core/kernels/data/iterator_ops.cc
+++ b/tensorflow/core/kernels/data/iterator_ops.cc
@@ -782,7 +782,7 @@ class OneShotIteratorOp : public AsyncOpKernel {
         return;
       }
     }
-    ProduceOutput(ctx, std::move(done));
+    ProduceOutput(ctx, done);
   }
 
  private:
@@ -803,9 +803,9 @@ class OneShotIteratorOp : public AsyncOpKernel {
     }
 
     for (auto&& ctx_done : callbacks_to_run) {
-      ProduceOutput(ctx_done.first, std::move(ctx_done.second));
+      ProduceOutput(ctx_done.first, ctx_done.second);
     }
-    ProduceOutput(ctx, std::move(done));
+    ProduceOutput(ctx, done);
   }
 
   Status TryInit(OpKernelContext* ctx, IteratorResource** iterator,
-- 
GitLab


From 642dc96bd4627a4f6305cf61b8553324054d9122 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 7 Jun 2018 11:45:01 -0700
Subject: [PATCH 440/610] Add FillTriangular Bijector to create triangular
 matrices.

PiperOrigin-RevId: 199670547
---
 tensorflow/contrib/distributions/BUILD        |  19 +++
 .../bijectors/fill_triangular_test.py         |  98 ++++++++++++
 .../python/ops/bijectors/__init__.py          |   2 +
 .../python/ops/bijectors/fill_triangular.py   | 148 ++++++++++++++++++
 4 files changed, 267 insertions(+)
 create mode 100644 tensorflow/contrib/distributions/python/kernel_tests/bijectors/fill_triangular_test.py
 create mode 100644 tensorflow/contrib/distributions/python/ops/bijectors/fill_triangular.py

diff --git a/tensorflow/contrib/distributions/BUILD b/tensorflow/contrib/distributions/BUILD
index 23d9dbcd91..d8baf49e81 100644
--- a/tensorflow/contrib/distributions/BUILD
+++ b/tensorflow/contrib/distributions/BUILD
@@ -940,6 +940,25 @@ cuda_py_test(
     ],
 )
 
+cuda_py_test(
+    name = "fill_triangular_test",
+    size = "small",
+    srcs = ["python/kernel_tests/bijectors/fill_triangular_test.py"],
+    additional_deps = [
+        ":bijectors_py",
+        ":distributions_py",
+        "//third_party/py/numpy",
+        "@six_archive//:six",
+        "//tensorflow/contrib/linalg:linalg_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
 cuda_py_test(
     name = "gumbel_test",
     size = "small",
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/fill_triangular_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/fill_triangular_test.py
new file mode 100644
index 0000000000..caeaf2a0c6
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/fill_triangular_test.py
@@ -0,0 +1,98 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for FillTriangular bijector."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.distributions.python.ops import bijectors
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+
+
+class FillTriangularBijectorTest(test.TestCase):
+  """Tests the correctness of the FillTriangular bijector."""
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testBijector(self):
+    x = np.float32(np.array([1., 2., 3.]))
+    y = np.float32(np.array([[3., 0.],
+                             [2., 1.]]))
+
+    b = bijectors.FillTriangular()
+
+    y_ = self.evaluate(b.forward(x))
+    self.assertAllClose(y, y_)
+
+    x_ = self.evaluate(b.inverse(y))
+    self.assertAllClose(x, x_)
+
+    fldj = self.evaluate(b.forward_log_det_jacobian(x, event_ndims=1))
+    self.assertAllClose(fldj, 0.)
+
+    ildj = self.evaluate(b.inverse_log_det_jacobian(y, event_ndims=2))
+    self.assertAllClose(ildj, 0.)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testShape(self):
+    x_shape = tensor_shape.TensorShape([5, 4, 6])
+    y_shape = tensor_shape.TensorShape([5, 4, 3, 3])
+
+    b = bijectors.FillTriangular(validate_args=True)
+
+    x = array_ops.ones(shape=x_shape, dtype=dtypes.float32)
+    y_ = b.forward(x)
+    self.assertAllEqual(y_.shape.as_list(), y_shape.as_list())
+    x_ = b.inverse(y_)
+    self.assertAllEqual(x_.shape.as_list(), x_shape.as_list())
+
+    y_shape_ = b.forward_event_shape(x_shape)
+    self.assertAllEqual(y_shape_.as_list(), y_shape.as_list())
+    x_shape_ = b.inverse_event_shape(y_shape)
+    self.assertAllEqual(x_shape_.as_list(), x_shape.as_list())
+
+    y_shape_tensor = self.evaluate(
+        b.forward_event_shape_tensor(x_shape.as_list()))
+    self.assertAllEqual(y_shape_tensor, y_shape.as_list())
+    x_shape_tensor = self.evaluate(
+        b.inverse_event_shape_tensor(y_shape.as_list()))
+    self.assertAllEqual(x_shape_tensor, x_shape.as_list())
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testShapeError(self):
+
+    b = bijectors.FillTriangular(validate_args=True)
+
+    x_shape_bad = tensor_shape.TensorShape([5, 4, 7])
+    with self.assertRaisesRegexp(ValueError, "is not a triangular number"):
+      b.forward_event_shape(x_shape_bad)
+    with self.assertRaisesOpError("is not a triangular number"):
+      self.evaluate(b.forward_event_shape_tensor(x_shape_bad.as_list()))
+
+    y_shape_bad = tensor_shape.TensorShape([5, 4, 3, 2])
+    with self.assertRaisesRegexp(ValueError, "Matrix must be square"):
+      b.inverse_event_shape(y_shape_bad)
+    with self.assertRaisesOpError("Matrix must be square"):
+      self.evaluate(b.inverse_event_shape_tensor(y_shape_bad.as_list()))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/__init__.py b/tensorflow/contrib/distributions/python/ops/bijectors/__init__.py
index 4965381ef3..59b8cf1bb2 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/__init__.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/__init__.py
@@ -24,6 +24,7 @@
 @@CholeskyOuterProduct
 @@ConditionalBijector
 @@Exp
+@@FillTriangular
 @@Gumbel
 @@Identity
 @@Inline
@@ -64,6 +65,7 @@ from tensorflow.contrib.distributions.python.ops.bijectors.chain import *
 from tensorflow.contrib.distributions.python.ops.bijectors.cholesky_outer_product import *
 from tensorflow.contrib.distributions.python.ops.bijectors.conditional_bijector import *
 from tensorflow.contrib.distributions.python.ops.bijectors.exp import *
+from tensorflow.contrib.distributions.python.ops.bijectors.fill_triangular import *
 from tensorflow.contrib.distributions.python.ops.bijectors.gumbel import *
 from tensorflow.contrib.distributions.python.ops.bijectors.inline import *
 from tensorflow.contrib.distributions.python.ops.bijectors.invert import *
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/fill_triangular.py b/tensorflow/contrib/distributions/python/ops/bijectors/fill_triangular.py
new file mode 100644
index 0000000000..7b06325ead
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/fill_triangular.py
@@ -0,0 +1,148 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""FillTriangular bijector."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.distributions import bijector
+from tensorflow.python.ops.distributions import util as dist_util
+
+
+__all__ = [
+    "FillTriangular",
+]
+
+
+class FillTriangular(bijector.Bijector):
+  """Transforms vectors to triangular matrices.
+
+  Triangular matrix elements are filled in a clockwise spiral.
+
+  Given input with shape `batch_shape + [d]`, produces output with
+  shape `batch_shape + [n, n]`, where
+   `n = (-1 + sqrt(1 + 8 * d))/2`.
+  This follows by solving the quadratic equation
+   `d = 1 + 2 + ... + n = n * (n + 1)/2`.
+
+  #### Example
+
+  ```python
+  b = tfb.FillTriangular(upper=False)
+  b.forward([1, 2, 3, 4, 5, 6])
+  # ==> [[4, 0, 0],
+  #      [6, 5, 0],
+  #      [3, 2, 1]]
+
+  b = tfb.FillTriangular(upper=True)
+  b.forward([1, 2, 3, 4, 5, 6])
+  # ==> [[1, 2, 3],
+  #      [0, 5, 6],
+  #      [0, 0, 4]]
+
+  ```
+  """
+
+  def __init__(self,
+               upper=False,
+               validate_args=False,
+               name="fill_triangular"):
+    """Instantiates the `FillTriangular` bijector.
+
+    Args:
+      upper: Python `bool` representing whether output matrix should be upper
+        triangular (`True`) or lower triangular (`False`, default).
+      validate_args: Python `bool` indicating whether arguments should be
+        checked for correctness.
+      name: Python `str` name given to ops managed by this object.
+    """
+    self._upper = upper
+    super(FillTriangular, self).__init__(
+        forward_min_event_ndims=1,
+        inverse_min_event_ndims=2,
+        validate_args=validate_args,
+        name=name)
+
+  def _forward(self, x):
+    return dist_util.fill_triangular(x, upper=self._upper)
+
+  def _inverse(self, y):
+    return dist_util.fill_triangular_inverse(y, upper=self._upper)
+
+  def _forward_log_det_jacobian(self, x):
+    return array_ops.zeros_like(x[..., 0])  # One zero per input vector.
+
+  def _inverse_log_det_jacobian(self, y):
+    return array_ops.zeros_like(y[..., 0, 0])  # One zero per input matrix.
+
+  def _forward_event_shape(self, input_shape):
+    batch_shape, d = input_shape[:-1], input_shape[-1].value
+    if d is None:
+      n = None
+    else:
+      n = vector_size_to_square_matrix_size(d, self.validate_args)
+    return batch_shape.concatenate([n, n])
+
+  def _inverse_event_shape(self, output_shape):
+    batch_shape, n1, n2 = (output_shape[:-2],
+                           output_shape[-2].value,
+                           output_shape[-1].value)
+    if n1 is None or n2 is None:
+      m = None
+    elif n1 != n2:
+      raise ValueError("Matrix must be square. (saw [{}, {}])".format(n1, n2))
+    else:
+      m = n1 * (n1 + 1) // 2  # n-th triangular number; `//` keeps it an int.
+    return batch_shape.concatenate([m])
+
+  def _forward_event_shape_tensor(self, input_shape_tensor):
+    batch_shape, d = input_shape_tensor[:-1], input_shape_tensor[-1]
+    n = vector_size_to_square_matrix_size(d, self.validate_args)
+    return array_ops.concat([batch_shape, [n, n]], axis=0)
+
+  def _inverse_event_shape_tensor(self, output_shape_tensor):
+    batch_shape, n = output_shape_tensor[:-2], output_shape_tensor[-1]
+    if self.validate_args:
+      is_square_matrix = check_ops.assert_equal(
+          n, output_shape_tensor[-2], message="Matrix must be square.")
+      with ops.control_dependencies([is_square_matrix]):
+        n = array_ops.identity(n)
+    d = math_ops.cast(n * (n + 1) / 2, output_shape_tensor.dtype)
+    return array_ops.concat([batch_shape, [d]], axis=0)
+
+
+def vector_size_to_square_matrix_size(d, validate_args, name=None):
+  """Convert a triangular-number vector size `d` to its matrix size `n`."""
+  if isinstance(d, (float, int, np.generic, np.ndarray)):
+    n = (-1 + np.sqrt(1 + 8 * d)) / 2.  # Positive root of n * (n + 1) / 2 = d.
+    if float(int(n)) != n:
+      raise ValueError("Vector length is not a triangular number.")
+    return int(n)
+  else:
+    with ops.name_scope(name, "vector_size_to_square_matrix_size", [d]) as name:
+      n = (-1. + math_ops.sqrt(1 + 8. * math_ops.to_float(d))) / 2.
+      if validate_args:
+        with ops.control_dependencies([check_ops.assert_equal(
+            math_ops.to_float(math_ops.to_int32(n)), n,
+            message="Vector length is not a triangular number")]):
+          n = array_ops.identity(n)
+      return math_ops.cast(n, d.dtype)
-- 
GitLab


From f9acd2548a508fc90357e93ad2b5efb2611ccb98 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 7 Jun 2018 12:03:44 -0700
Subject: [PATCH 441/610] [XLA] Redesign: delete versioned_computation_handle
 and compilation_cache.

PiperOrigin-RevId: 199673573
---
 tensorflow/compiler/xla/service/BUILD         | 32 --------
 .../compiler/xla/service/channel_tracker.h    |  1 -
 .../compiler/xla/service/compilation_cache.cc | 78 -------------------
 .../compiler/xla/service/compilation_cache.h  | 78 -------------------
 .../xla/service/copy_insertion_test.cc        |  9 +--
 tensorflow/compiler/xla/service/executable.h  |  7 --
 .../xla/service/gpu/hlo_schedule_test.cc      |  3 +-
 .../xla/service/gpu/stream_assignment_test.cc |  3 +-
 .../xla/service/hlo_evaluator_test.cc         |  2 +-
 tensorflow/compiler/xla/service/hlo_module.cc | 17 +---
 tensorflow/compiler/xla/service/hlo_module.h  | 17 +---
 .../compiler/xla/service/local_service.cc     |  1 -
 tensorflow/compiler/xla/service/service.h     |  5 --
 .../service/versioned_computation_handle.cc   | 32 --------
 .../service/versioned_computation_handle.h    | 55 -------------
 .../compiler/xla/tests/hlo_test_base.cc       |  3 +-
 .../compiler/xla/tests/llvm_compiler_test.cc  |  3 +-
 17 files changed, 11 insertions(+), 335 deletions(-)
 delete mode 100644 tensorflow/compiler/xla/service/compilation_cache.cc
 delete mode 100644 tensorflow/compiler/xla/service/compilation_cache.h
 delete mode 100644 tensorflow/compiler/xla/service/versioned_computation_handle.cc
 delete mode 100644 tensorflow/compiler/xla/service/versioned_computation_handle.h

diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index 20cc671ba3..89de302f4d 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -292,7 +292,6 @@ cc_library(
         ":hlo_proto",
         ":hlo_reachability",
         ":name_uniquer",
-        ":versioned_computation_handle",
         "//tensorflow/compiler/xla:array",
         "//tensorflow/compiler/xla:literal_util",
         "//tensorflow/compiler/xla:protobuf_util",
@@ -401,17 +400,6 @@ tf_cc_test(
     ],
 )
 
-cc_library(
-    name = "versioned_computation_handle",
-    srcs = ["versioned_computation_handle.cc"],
-    hdrs = ["versioned_computation_handle.h"],
-    deps = [
-        "//tensorflow/compiler/xla:types",
-        "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/core:lib",
-    ],
-)
-
 tf_cc_test(
     name = "hlo_instruction_test",
     srcs = ["hlo_instruction_test.cc"],
@@ -591,7 +579,6 @@ cc_library(
         ":allocation_tracker",
         ":backend",
         ":channel_tracker",
-        ":compilation_cache",
         ":compiler",
         ":computation_layout",
         ":device_memory_allocator",
@@ -606,7 +593,6 @@ cc_library(
         ":platform_util",
         ":source_map_util",
         ":transfer_manager",
-        ":versioned_computation_handle",
         "//tensorflow/compiler/xla:executable_run_options",
         "//tensorflow/compiler/xla:execution_options_util",
         "//tensorflow/compiler/xla:service_interface",
@@ -641,7 +627,6 @@ cc_library(
         ":platform_util",
         ":service",
         ":shaped_buffer",
-        ":versioned_computation_handle",
         "//tensorflow/compiler/xla:execution_options_util",
         "//tensorflow/compiler/xla:shape_layout",
         "//tensorflow/compiler/xla:shape_util",
@@ -762,7 +747,6 @@ cc_library(
         ":hlo_proto",
         ":pool",
         ":shaped_buffer",
-        ":versioned_computation_handle",
         "//tensorflow/compiler/xla:executable_run_options",
         "//tensorflow/compiler/xla:status",
         "//tensorflow/compiler/xla:status_macros",
@@ -864,7 +848,6 @@ cc_library(
     hdrs = ["channel_tracker.h"],
     deps = [
         ":hlo",
-        ":versioned_computation_handle",
         "//tensorflow/compiler/xla:status",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
@@ -1646,7 +1629,6 @@ tf_cc_test(
         ":hlo_cost_analysis",
         ":local_service",
         ":service",
-        ":versioned_computation_handle",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:test_helpers",
@@ -1987,20 +1969,6 @@ tf_cc_test(
     ],
 )
 
-cc_library(
-    name = "compilation_cache",
-    srcs = ["compilation_cache.cc"],
-    hdrs = ["compilation_cache.h"],
-    deps = [
-        ":executable",
-        ":hlo_module_config",
-        ":versioned_computation_handle",
-        "//tensorflow/compiler/xla:types",
-        "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/core:lib",
-    ],
-)
-
 cc_library(
     name = "layout_assignment",
     srcs = [
diff --git a/tensorflow/compiler/xla/service/channel_tracker.h b/tensorflow/compiler/xla/service/channel_tracker.h
index 52f33a1318..fac0afd672 100644
--- a/tensorflow/compiler/xla/service/channel_tracker.h
+++ b/tensorflow/compiler/xla/service/channel_tracker.h
@@ -19,7 +19,6 @@ limitations under the License.
 #include <map>
 
 #include "tensorflow/compiler/xla/service/hlo_module.h"
-#include "tensorflow/compiler/xla/service/versioned_computation_handle.h"
 #include "tensorflow/compiler/xla/status.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
diff --git a/tensorflow/compiler/xla/service/compilation_cache.cc b/tensorflow/compiler/xla/service/compilation_cache.cc
deleted file mode 100644
index b16907da9e..0000000000
--- a/tensorflow/compiler/xla/service/compilation_cache.cc
+++ /dev/null
@@ -1,78 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/compiler/xla/service/compilation_cache.h"
-
-#include <utility>
-
-#include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
-#include "tensorflow/core/lib/strings/strcat.h"
-#include "tensorflow/core/platform/logging.h"
-
-namespace xla {
-
-std::shared_ptr<Executable> CompilationCache::Insert(
-    std::unique_ptr<Executable> executable,
-    const HloModuleConfig& module_config) {
-  tensorflow::mutex_lock lock(mutex_);
-
-  CacheKey key =
-      BuildKey(executable->entry_computation_handle(), module_config);
-  VLOG(2) << "inserting cache key: " << key;
-  if (cache_.count(key) == 0) {
-    cache_.emplace(key, std::move(executable));
-  } else {
-    // Executable already exists in the cache. This can happen if two Execute
-    // calls for a new computation are received simultaneously by the
-    // service. In this case, we discard the Executable given as a parameter and
-    // return what is in the cache. This is necessary because the service relies
-    // on the cache to keep ownership of the Executable. We only want to store
-    // one Executable for a given computation version and we can't discard the
-    // executable which is in the cache because it may be in use.
-    executable.reset();
-  }
-  return cache_.at(key);
-}
-
-std::shared_ptr<Executable> CompilationCache::LookUp(
-    const VersionedComputationHandle& versioned_handle,
-    const HloModuleConfig& module_config) const {
-  tensorflow::mutex_lock lock(mutex_);
-
-  CacheKey key = BuildKey(versioned_handle, module_config);
-  VLOG(2) << "looking up cache key: " << key;
-  if (cache_.count(key) == 0) {
-    VLOG(2) << "cache key not found: " << key;
-    return nullptr;
-  } else {
-    std::shared_ptr<Executable> result = cache_.at(key);
-    VLOG(2) << "hit executable with module config: "
-            << result->module_config().compilation_cache_key();
-    return result;
-  }
-}
-
-CompilationCache::CacheKey CompilationCache::BuildKey(
-    const VersionedComputationHandle& versioned_handle,
-    const HloModuleConfig& module_config) const {
-  // The computation shape is represented entirely by its ProgramShape member,
-  // so just serialize the proto as part of the key.
-  return tensorflow::strings::StrCat(versioned_handle.handle.handle(), "::",
-                                     versioned_handle.version, "::",
-                                     module_config.compilation_cache_key());
-}
-
-}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/compilation_cache.h b/tensorflow/compiler/xla/service/compilation_cache.h
deleted file mode 100644
index 09989726ae..0000000000
--- a/tensorflow/compiler/xla/service/compilation_cache.h
+++ /dev/null
@@ -1,78 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_COMPILATION_CACHE_H_
-#define TENSORFLOW_COMPILER_XLA_SERVICE_COMPILATION_CACHE_H_
-
-#include <map>
-#include <memory>
-#include <string>
-
-#include "tensorflow/compiler/xla/service/executable.h"
-#include "tensorflow/compiler/xla/service/hlo_module_config.h"
-#include "tensorflow/compiler/xla/service/versioned_computation_handle.h"
-#include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/core/platform/macros.h"
-#include "tensorflow/core/platform/mutex.h"
-#include "tensorflow/core/platform/thread_annotations.h"
-
-namespace xla {
-
-// A cache which stores Executables indexed by computation handle and version.
-class CompilationCache {
- public:
-  CompilationCache() {}
-
-  // Insert the given Executable into the cache. Return a bare Executable
-  // pointer for the caller to use. Note: the returned pointer will *not* be the
-  // same as the given unique pointer if the computation already exists in the
-  // cache. See comments in the .cc implementation for details of this case.
-  //
-  // module_config is provided by the caller, instead of being taken from the
-  // executable, so that we can insert keys into the compilation cache that are
-  // devoid of layout (where XLA gets to choose what layout to compile).
-  //
-  // A shared_ptr is returned so the caller can keep the Executable from being
-  // destructed in the event that the Executable is evicted from the
-  // computation cache (and the cache's shared_ptr to the Executable is
-  // destructed).
-  std::shared_ptr<Executable> Insert(std::unique_ptr<Executable> executable,
-                                     const HloModuleConfig& module_config);
-
-  // Lookup the Executable for the specified versioned computation in the cache.
-  // Return a shared_ptr to the Executable if it exists in the cache. Return
-  // nullptr otherwise.
-  std::shared_ptr<Executable> LookUp(
-      const VersionedComputationHandle& versioned_handle,
-      const HloModuleConfig& module_config) const;
-
- protected:
-  mutable tensorflow::mutex mutex_;
-
-  // Map from versioned handle with program layout to Executable built
-  // for that computation version and program layout.
-  using CacheKey = string;
-
-  CacheKey BuildKey(const VersionedComputationHandle& versioned_handle,
-                    const HloModuleConfig& module_config) const;
-  std::map<CacheKey, std::shared_ptr<Executable>> cache_ GUARDED_BY(mutex_);
-
- private:
-  TF_DISALLOW_COPY_AND_ASSIGN(CompilationCache);
-};
-
-}  // namespace xla
-
-#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_COMPILATION_CACHE_H_
diff --git a/tensorflow/compiler/xla/service/copy_insertion_test.cc b/tensorflow/compiler/xla/service/copy_insertion_test.cc
index 153f062d01..684fff8a6f 100644
--- a/tensorflow/compiler/xla/service/copy_insertion_test.cc
+++ b/tensorflow/compiler/xla/service/copy_insertion_test.cc
@@ -1636,8 +1636,7 @@ void BM_SequentialWhiles(int num_iters, int num_whiles) {
   for (int i = 0; i < num_iters; ++i) {
     HloModuleConfig config;
     config.set_debug_options(legacy_flags::GetDebugOptionsFromFlags());
-    HloModule module("BM_SequentialWhiles", VersionedComputationHandle(),
-                     config);
+    HloModule module("BM_SequentialWhiles", config);
 
     auto builder = HloComputation::Builder("BM_SequentialWhiles");
     HloInstruction* x = builder.AddInstruction(HloInstruction::CreateParameter(
@@ -1677,8 +1676,7 @@ void BM_ParallelWhiles(int num_iters, int num_whiles) {
   for (int i = 0; i < num_iters; ++i) {
     HloModuleConfig config;
     config.set_debug_options(legacy_flags::GetDebugOptionsFromFlags());
-    HloModule module("BM_SequentialWhiles", VersionedComputationHandle(),
-                     config);
+    HloModule module("BM_SequentialWhiles", config);
 
     auto builder = HloComputation::Builder("BM_ParallelWhiles");
     HloInstruction* x = builder.AddInstruction(HloInstruction::CreateParameter(
@@ -1750,8 +1748,7 @@ void BM_ManyElementTuple(int num_iters, const int num_tuple_inputs) {
   std::vector<HloInstruction*> tuple_params(num_tuple_inputs);
   for (int i = 0; i < num_iters; ++i) {
     auto builder = HloComputation::Builder("BM_ParallelWhiles");
-    HloModule module("BM_ManyElementTuple", VersionedComputationHandle(),
-                     config);
+    HloModule module("BM_ManyElementTuple", config);
     for (int j = 0; j < num_tuple_inputs; ++j) {
       tuple_params[j] = builder.AddInstruction(
           HloInstruction::CreateParameter(j, element_shape, ""));
diff --git a/tensorflow/compiler/xla/service/executable.h b/tensorflow/compiler/xla/service/executable.h
index 087bd14329..dc1f26ea65 100644
--- a/tensorflow/compiler/xla/service/executable.h
+++ b/tensorflow/compiler/xla/service/executable.h
@@ -28,7 +28,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/service_executable_run_options.h"
 #include "tensorflow/compiler/xla/service/shaped_buffer.h"
-#include "tensorflow/compiler/xla/service/versioned_computation_handle.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
@@ -131,12 +130,6 @@ class Executable {
 
   const HloModuleConfig& module_config() const { return hlo_module_->config(); }
 
-  // Returns the versioned computation handle of the computation computed by
-  // this executable.
-  const VersionedComputationHandle& entry_computation_handle() const {
-    return hlo_module_->entry_computation_handle();
-  }
-
   // The shape (including layout) that results from this execution. This is the
   // shape of the DeviceMemoryBase result value in ExecuteOnStream above.
   const Shape& host_result_shape() const {
diff --git a/tensorflow/compiler/xla/service/gpu/hlo_schedule_test.cc b/tensorflow/compiler/xla/service/gpu/hlo_schedule_test.cc
index e230d538cc..45f0a1c645 100644
--- a/tensorflow/compiler/xla/service/gpu/hlo_schedule_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/hlo_schedule_test.cc
@@ -47,8 +47,7 @@ class HloScheduleTest : public HloTestBase {
     auto debug_options = GetDebugOptionsForTest();
     debug_options.set_xla_gpu_disable_multi_streaming(false);
     config.set_debug_options(debug_options);
-    return MakeUnique<HloModule>("test_module", VersionedComputationHandle(),
-                                 config);
+    return MakeUnique<HloModule>("test_module", config);
   }
 
   HloVec RemoveHlo(const HloVec& input,
diff --git a/tensorflow/compiler/xla/service/gpu/stream_assignment_test.cc b/tensorflow/compiler/xla/service/gpu/stream_assignment_test.cc
index 696fa7e019..6f4bb0580e 100644
--- a/tensorflow/compiler/xla/service/gpu/stream_assignment_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/stream_assignment_test.cc
@@ -33,8 +33,7 @@ class StreamAssignmentTest : public HloTestBase {
     auto debug_options = GetDebugOptionsForTest();
     debug_options.set_xla_gpu_disable_multi_streaming(false);
     config.set_debug_options(debug_options);
-    return MakeUnique<HloModule>("test_module", VersionedComputationHandle(),
-                                 config);
+    return MakeUnique<HloModule>("test_module", config);
   }
 
   // Pre-canned shapes.
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_test.cc b/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
index 84b4ead2dd..72eb9930e9 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
@@ -1248,7 +1248,7 @@ void BM_ReducePrecisely(int num_iters) {
   HloComputation::Builder b("BM_ReducePrecisely");
   HloModuleConfig config;
   config.set_debug_options(legacy_flags::GetDebugOptionsFromFlags());
-  HloModule module("BM_ReducePrecisely", VersionedComputationHandle(), config);
+  HloModule module("BM_ReducePrecisely", config);
 
   constexpr int kNumElements = 1 << 25;  // float += 1 saturates at 1<<24
   std::vector<float> v(kNumElements, 1.0f);
diff --git a/tensorflow/compiler/xla/service/hlo_module.cc b/tensorflow/compiler/xla/service/hlo_module.cc
index e63424c2df..ab60258677 100644
--- a/tensorflow/compiler/xla/service/hlo_module.cc
+++ b/tensorflow/compiler/xla/service/hlo_module.cc
@@ -32,15 +32,6 @@ limitations under the License.
 
 namespace xla {
 
-HloModule::HloModule(const string& name,
-                     const VersionedComputationHandle& entry_computation_handle,
-                     const HloModuleConfig& config)
-    : name_(NameUniquer::GetSanitizedName(name)),
-      config_(config),
-      has_entry_computation_handle_(true),
-      entry_computation_handle_(entry_computation_handle),
-      unique_id_(next_unique_module_id_++) {}
-
 HloModule::HloModule(const string& name, const HloModuleConfig& config)
     : name_(NameUniquer::GetSanitizedName(name)),
       config_(config),
@@ -234,8 +225,7 @@ HloModuleProto HloModule::ToProto() const {
 
 /* static */
 StatusOr<std::unique_ptr<HloModule>> HloModule::CreateFromProto(
-    const HloModuleProto& proto, const HloModuleConfig& module_config,
-    const VersionedComputationHandle& entry_computation_handle) {
+    const HloModuleProto& proto, const HloModuleConfig& module_config) {
   // The ProgramShape in the passed in module config must match the shapes of
   // the entry parameters and root.
   TF_RET_CHECK(proto.has_program_shape())
@@ -287,8 +277,7 @@ StatusOr<std::unique_ptr<HloModule>> HloModule::CreateFromProto(
   }
   TF_RET_CHECK(entry != nullptr);
 
-  auto module = MakeUnique<HloModule>(proto.name(), entry_computation_handle,
-                                      module_config);
+  auto module = MakeUnique<HloModule>(proto.name(), module_config);
 
   // Sort the computations in the proto id's order.
   std::sort(computations.begin(), computations.end(),
@@ -525,8 +514,6 @@ std::vector<HloComputation*> HloModule::MakeNonfusionComputations() const {
 std::unique_ptr<HloModule> HloModule::Clone(const string& suffix) const {
   VLOG(1) << "Cloning module :" << name_ << " --> " << suffix << "\n";
   auto module = MakeUnique<HloModule>(name_ + "-" + suffix, config_);
-  module->entry_computation_handle_ = entry_computation_handle_;
-  module->has_entry_computation_handle_ = has_entry_computation_handle_;
 
   HloCloneContext context(module.get(), suffix);
   auto cloned_computation = entry_computation_->Clone(suffix, &context);
diff --git a/tensorflow/compiler/xla/service/hlo_module.h b/tensorflow/compiler/xla/service/hlo_module.h
index c93c74d34a..757e65bda2 100644
--- a/tensorflow/compiler/xla/service/hlo_module.h
+++ b/tensorflow/compiler/xla/service/hlo_module.h
@@ -31,7 +31,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_module_config.h"
 #include "tensorflow/compiler/xla/service/name_uniquer.h"
-#include "tensorflow/compiler/xla/service/versioned_computation_handle.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
@@ -57,10 +56,6 @@ namespace xla {
 // attached to.
 class HloModule {
  public:
-  HloModule(const string& name,
-            const VersionedComputationHandle& entry_computation_handle,
-            const HloModuleConfig& config);
-
   // Constructor without a versioned computation handle. This constructor should
   // only be used for HloModules used outside of the XLA service (eg
   // tests). The versioned handle is used by the service in the compilation
@@ -126,10 +121,6 @@ class HloModule {
     return config_.device_entry_computation_layout();
   }
 
-  const VersionedComputationHandle& entry_computation_handle() const {
-    return entry_computation_handle_;
-  }
-
   // Gets the computations in this module.
   //
   // Returns a view of HloComputation*s, so you can iterate over this in the
@@ -188,9 +179,7 @@ class HloModule {
   // Convert an HloModule to or from a proto.
   HloModuleProto ToProto() const;
   static StatusOr<std::unique_ptr<HloModule>> CreateFromProto(
-      const HloModuleProto& proto, const HloModuleConfig& module_config,
-      const VersionedComputationHandle& entry_computation_handle =
-          VersionedComputationHandle());
+      const HloModuleProto& proto, const HloModuleConfig& module_config);
 
   // Creates and returns an HloModuleConfig with an appropriate program shape
   // for the HLO module in the given proto.
@@ -264,10 +253,6 @@ class HloModule {
   mutable std::mt19937_64 rng_{42};
   mutable tensorflow::mutex rng_mutex_;
 
-  // Versioned handle of the entry computation of the module.
-  bool has_entry_computation_handle_ = false;
-  VersionedComputationHandle entry_computation_handle_;
-
   // Unique name generator for computation and instruction names, which are
   // unique per module.
   NameUniquer computation_name_uniquer_{/*separator=*/"."};
diff --git a/tensorflow/compiler/xla/service/local_service.cc b/tensorflow/compiler/xla/service/local_service.cc
index 1d9c9e0678..296d04d436 100644
--- a/tensorflow/compiler/xla/service/local_service.cc
+++ b/tensorflow/compiler/xla/service/local_service.cc
@@ -30,7 +30,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_module_config.h"
 #include "tensorflow/compiler/xla/service/platform_util.h"
-#include "tensorflow/compiler/xla/service/versioned_computation_handle.h"
 #include "tensorflow/compiler/xla/shape_layout.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
diff --git a/tensorflow/compiler/xla/service/service.h b/tensorflow/compiler/xla/service/service.h
index d64b2b4d0a..8748a4c144 100644
--- a/tensorflow/compiler/xla/service/service.h
+++ b/tensorflow/compiler/xla/service/service.h
@@ -26,14 +26,12 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/allocation_tracker.h"
 #include "tensorflow/compiler/xla/service/backend.h"
 #include "tensorflow/compiler/xla/service/channel_tracker.h"
-#include "tensorflow/compiler/xla/service/compilation_cache.h"
 #include "tensorflow/compiler/xla/service/device_memory_allocator.h"
 #include "tensorflow/compiler/xla/service/executable.h"
 #include "tensorflow/compiler/xla/service/execution_tracker.h"
 #include "tensorflow/compiler/xla/service/hlo_execution_profile.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_module_config.h"
-#include "tensorflow/compiler/xla/service/versioned_computation_handle.h"
 #include "tensorflow/compiler/xla/service_interface.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
@@ -297,9 +295,6 @@ class Service : public ServiceInterface {
   // Tracks asynchronously launched executions via the API.
   ExecutionTracker execution_tracker_;
 
-  // Cache containing previously built Executables.
-  CompilationCache compilation_cache_;
-
   // Backend to compile and execute computations on.
   std::unique_ptr<Backend> execute_backend_;
 
diff --git a/tensorflow/compiler/xla/service/versioned_computation_handle.cc b/tensorflow/compiler/xla/service/versioned_computation_handle.cc
deleted file mode 100644
index a693c4695f..0000000000
--- a/tensorflow/compiler/xla/service/versioned_computation_handle.cc
+++ /dev/null
@@ -1,32 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/compiler/xla/service/versioned_computation_handle.h"
-
-#include "tensorflow/core/lib/strings/strcat.h"
-
-namespace xla {
-
-string VersionedComputationHandle::ToString() const {
-  return tensorflow::strings::StrCat(handle.handle(), ":v", version);
-}
-
-std::ostream& operator<<(std::ostream& out,
-                         const VersionedComputationHandle& versioned_handle) {
-  out << versioned_handle.ToString();
-  return out;
-}
-
-}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/versioned_computation_handle.h b/tensorflow/compiler/xla/service/versioned_computation_handle.h
deleted file mode 100644
index 5732a56caf..0000000000
--- a/tensorflow/compiler/xla/service/versioned_computation_handle.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_VERSIONED_COMPUTATION_HANDLE_H_
-#define TENSORFLOW_COMPILER_XLA_SERVICE_VERSIONED_COMPUTATION_HANDLE_H_
-
-#include <ostream>
-
-#include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
-
-namespace xla {
-
-// A data structure encapsulating a ComputationHandle and version value of that
-// computation. This object is used to unambiguously refer to a particular
-// computation in the service.
-struct VersionedComputationHandle {
-  // A version value unambiguously specifying the state of the computation at a
-  // particular point in time as it is being built. This value is the
-  // ComputationDataHandle of the current root instruction.
-  using Version = int64;
-
-  ComputationHandle handle;
-  Version version;
-
-  string ToString() const;
-  bool operator==(const VersionedComputationHandle& other) const {
-    return (handle.handle() == other.handle.handle()) &&
-           (version == other.version);
-  }
-  bool operator<(const VersionedComputationHandle& other) const {
-    return ((handle.handle() < other.handle.handle()) ||
-            ((handle.handle() == other.handle.handle()) &&
-             (version < other.version)));
-  }
-};
-
-std::ostream& operator<<(std::ostream& out,
-                         const VersionedComputationHandle& versioned_handle);
-
-}  // namespace xla
-
-#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_VERSIONED_COMPUTATION_HANDLE_H_
diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.cc b/tensorflow/compiler/xla/tests/hlo_test_base.cc
index 08ed826c80..242cc5db11 100644
--- a/tensorflow/compiler/xla/tests/hlo_test_base.cc
+++ b/tensorflow/compiler/xla/tests/hlo_test_base.cc
@@ -94,8 +94,7 @@ HloTestBase::HloTestBase(se::Platform* test_platform,
 
 /* static */
 std::unique_ptr<HloModule> HloTestBase::CreateNewModule(const string& name) {
-  return MakeUnique<HloModule>(name, VersionedComputationHandle(),
-                               GetModuleConfigForTest());
+  return MakeUnique<HloModule>(name, GetModuleConfigForTest());
 }
 
 /*static*/ DebugOptions HloTestBase::GetDebugOptionsForTest() {
diff --git a/tensorflow/compiler/xla/tests/llvm_compiler_test.cc b/tensorflow/compiler/xla/tests/llvm_compiler_test.cc
index 2f46ee0be2..082bc34136 100644
--- a/tensorflow/compiler/xla/tests/llvm_compiler_test.cc
+++ b/tensorflow/compiler/xla/tests/llvm_compiler_test.cc
@@ -124,8 +124,7 @@ class LLVMCompilerTest : public ::testing::Test {
   static std::unique_ptr<HloModule> CreateNewModule() {
     HloModuleConfig config;
     config.set_debug_options(legacy_flags::GetDebugOptionsFromFlags());
-    return MakeUnique<HloModule>(TestName(), VersionedComputationHandle(),
-                                 config);
+    return MakeUnique<HloModule>(TestName(), config);
   }
 };
 
-- 
GitLab


From 4d0d60a82c52c6c71650db33bf826f03559d91fc Mon Sep 17 00:00:00 2001
From: Priya Gupta <priyag@google.com>
Date: Thu, 7 Jun 2018 12:03:52 -0700
Subject: [PATCH 442/610] Expand DistributionStrategy.group to address single
 variable case properly as well, in addition to a single Tensor case.

PiperOrigin-RevId: 199673590
---
 tensorflow/python/training/distribute.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/training/distribute.py b/tensorflow/python/training/distribute.py
index ab8b37bb65..7cd175f25b 100644
--- a/tensorflow/python/training/distribute.py
+++ b/tensorflow/python/training/distribute.py
@@ -946,7 +946,7 @@ class DistributionStrategy(object):
       return control_flow_ops.group(value, name=name)
     # Special handling for the common case of one op.
     v, = value
-    if isinstance(v, ops.Tensor):
+    if hasattr(v, "op"):
       v = v.op
     return v
 
-- 
GitLab


From 501cf726cbee2ee13efef43884a6552ca211979d Mon Sep 17 00:00:00 2001
From: Michael Case <mikecase@google.com>
Date: Thu, 7 Jun 2018 12:05:24 -0700
Subject: [PATCH 443/610] Internal Change.

PiperOrigin-RevId: 199673803
---
 tensorflow/BUILD                              |  7 ++-
 tensorflow/api_template.__init__.py           | 17 +++++-
 tensorflow/contrib/cmake/tf_python.cmake      | 45 ++++++++++++++
 tensorflow/python/estimator/BUILD             |  4 ++
 tensorflow/python/estimator/api/BUILD         | 17 ++++++
 .../python/estimator/canned/baseline.py       |  6 +-
 .../python/estimator/canned/boosted_trees.py  |  6 +-
 tensorflow/python/estimator/canned/dnn.py     |  6 +-
 .../estimator/canned/dnn_linear_combined.py   |  6 +-
 tensorflow/python/estimator/canned/linear.py  |  6 +-
 .../python/estimator/canned/parsing_utils.py  |  6 +-
 tensorflow/python/estimator/estimator.py      | 12 ++--
 tensorflow/python/estimator/export/export.py  | 10 ++--
 .../python/estimator/export/export_output.py  | 10 ++--
 tensorflow/python/estimator/exporter.py       | 10 ++--
 .../python/estimator/inputs/numpy_io.py       |  4 +-
 .../python/estimator/inputs/pandas_io.py      |  4 +-
 tensorflow/python/estimator/model_fn.py       |  6 +-
 tensorflow/python/estimator/run_config.py     |  4 +-
 tensorflow/python/estimator/training.py       |  8 +--
 tensorflow/python/util/tf_export.py           | 58 ++++++++++++-------
 tensorflow/python/util/tf_export_test.py      |  7 ---
 tensorflow/tools/api/generator/api_gen.bzl    | 20 +++++--
 .../tools/api/generator/create_python_api.py  | 35 ++++++-----
 .../api/generator/create_python_api_test.py   |  9 ++-
 25 files changed, 218 insertions(+), 105 deletions(-)
 create mode 100644 tensorflow/python/estimator/api/BUILD

diff --git a/tensorflow/BUILD b/tensorflow/BUILD
index e0bce820d1..a73c4ca3aa 100644
--- a/tensorflow/BUILD
+++ b/tensorflow/BUILD
@@ -541,14 +541,17 @@ exports_files(
 )
 
 gen_api_init_files(
-    name = "python_api_gen",
+    name = "tensorflow_python_api_gen",
     srcs = ["api_template.__init__.py"],
     root_init_template = "api_template.__init__.py",
 )
 
 py_library(
     name = "tensorflow_py",
-    srcs = [":python_api_gen"],
+    srcs = [
+        ":tensorflow_python_api_gen",
+        "//tensorflow/python/estimator/api:estimator_python_api_gen",
+    ],
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
     deps = ["//tensorflow/python"],
diff --git a/tensorflow/api_template.__init__.py b/tensorflow/api_template.__init__.py
index 9b0d7d48af..9662d7b478 100644
--- a/tensorflow/api_template.__init__.py
+++ b/tensorflow/api_template.__init__.py
@@ -22,7 +22,22 @@ from __future__ import print_function
 from tensorflow.python import pywrap_tensorflow  # pylint: disable=unused-import
 # API IMPORTS PLACEHOLDER
 
-from tensorflow.python.util.lazy_loader import LazyLoader
+try:
+  import os  # pylint: disable=g-import-not-at-top
+  # Add `estimator` attribute to allow access to estimator APIs via
+  # "tf.estimator..."
+  from tensorflow.python.estimator.api import estimator  # pylint: disable=g-import-not-at-top
+
+  # Add `estimator` to the __path__ to allow "from tensorflow.estimator..."
+  # style imports.
+  from tensorflow.python.estimator import api as estimator_api  # pylint: disable=g-import-not-at-top
+  __path__ += [os.path.dirname(estimator_api.__file__)]
+  del estimator_api
+  del os
+except (ImportError, AttributeError):
+  print('tf.estimator package not installed.')
+
+from tensorflow.python.util.lazy_loader import LazyLoader  # pylint: disable=g-import-not-at-top
 contrib = LazyLoader('contrib', globals(), 'tensorflow.contrib')
 del LazyLoader
 
diff --git a/tensorflow/contrib/cmake/tf_python.cmake b/tensorflow/contrib/cmake/tf_python.cmake
index d019dd48f2..a0c3ddd28b 100755
--- a/tensorflow/contrib/cmake/tf_python.cmake
+++ b/tensorflow/contrib/cmake/tf_python.cmake
@@ -756,6 +756,8 @@ add_custom_command(
               "${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/tools/api/generator/create_python_api.py"
               "--root_init_template=${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/api_template.__init__.py"
               "--apidir=${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow"
+              "--package=tensorflow.python"
+              "--apiname=tensorflow"
               "${api_init_list_file}"
 
       COMMENT "Generating __init__.py files for Python API."
@@ -765,7 +767,49 @@ add_custom_command(
 add_custom_target(tf_python_api SOURCES ${api_init_files})
 add_dependencies(tf_python_api tf_python_ops)
 
+# TODO(mikecase): This can be removed once tf.estimator is moved
+# out of TensorFlow.
+########################################################
+# Generate API __init__.py files for tf.estimator.
+########################################################
+
+# Parse tensorflow/tools/api/generator/api_gen.bzl to get the list of generated files.
+FILE(READ ${tensorflow_source_dir}/tensorflow/tools/api/generator/api_gen.bzl api_generator_BUILD_text)
+STRING(REGEX MATCH "# BEGIN GENERATED ESTIMATOR FILES.*# END GENERATED ESTIMATOR FILES" api_init_files_text ${api_generator_BUILD_text})
+string(REPLACE "# BEGIN GENERATED ESTIMATOR FILES" "" api_init_files_text ${api_init_files_text})
+string(REPLACE "# END GENERATED ESTIMATOR FILES" "" api_init_files_text ${api_init_files_text})
+string(REPLACE "," ";" api_init_files_list ${api_init_files_text})
+
+set(api_init_files "")
+foreach(api_init_file ${api_init_files_list})
+    string(STRIP "${api_init_file}" api_init_file)
+    if(api_init_file)
+        string(REPLACE "\"" "" api_init_file "${api_init_file}")  # Remove quotes
+        list(APPEND api_init_files "${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/estimator/api/${api_init_file}")
+    endif()
+endforeach(api_init_file)
+set(estimator_api_init_list_file "${tensorflow_source_dir}/estimator_api_init_files_list.txt")
+file(WRITE "${estimator_api_init_list_file}" "${api_init_files}")
+
+# Run create_python_api.py to generate __init__.py files.
+add_custom_command(
+      OUTPUT ${api_init_files}
+      DEPENDS tf_python_ops tf_python_copy_scripts_to_destination pywrap_tensorflow_internal tf_python_touchup_modules tf_extension_ops
+
+      # Run create_python_api.py to generate API init files.
+      COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${CMAKE_CURRENT_BINARY_DIR}/tf_python ${PYTHON_EXECUTABLE}
+              "${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/tools/api/generator/create_python_api.py"
+              "--apidir=${CMAKE_CURRENT_BINARY_DIR}/tf_python/tensorflow/python/estimator/api"
+              "--package=tensorflow.python.estimator"
+              "--apiname=estimator"
+              "${estimator_api_init_list_file}"
+
+      COMMENT "Generating __init__.py files for Python API."
+      WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/tf_python"
+)
 
+add_custom_target(estimator_python_api SOURCES ${api_init_files})
+add_dependencies(estimator_python_api tf_python_ops)
 ############################################################
 # Build a PIP package containing the TensorFlow runtime.
 ############################################################
@@ -776,6 +820,7 @@ add_dependencies(tf_python_build_pip_package
     tf_python_touchup_modules
     tf_python_ops
     tf_python_api
+    estimator_python_api
     tf_extension_ops)
 
 # Fix-up Python files that were not included by the add_python_module() macros.
diff --git a/tensorflow/python/estimator/BUILD b/tensorflow/python/estimator/BUILD
index d538c6c415..c0d63b79a6 100644
--- a/tensorflow/python/estimator/BUILD
+++ b/tensorflow/python/estimator/BUILD
@@ -12,6 +12,10 @@ py_library(
     name = "estimator_py",
     srcs = ["estimator_lib.py"],
     srcs_version = "PY2AND3",
+    visibility = [
+        "//tensorflow:__pkg__",
+        "//tensorflow:internal",
+    ],
     deps = [
         ":baseline",
         ":boosted_trees",
diff --git a/tensorflow/python/estimator/api/BUILD b/tensorflow/python/estimator/api/BUILD
new file mode 100644
index 0000000000..cddee9b8f3
--- /dev/null
+++ b/tensorflow/python/estimator/api/BUILD
@@ -0,0 +1,17 @@
+package(
+    default_visibility = [
+        "//tensorflow:internal",
+    ],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow/tools/api/generator:api_gen.bzl", "gen_api_init_files")
+load("//tensorflow/tools/api/generator:api_gen.bzl", "ESTIMATOR_API_INIT_FILES")
+
+gen_api_init_files(
+    name = "estimator_python_api_gen",
+    api_name = "estimator",
+    output_files = ESTIMATOR_API_INIT_FILES,
+    package = "tensorflow.python.estimator",
+)
diff --git a/tensorflow/python/estimator/canned/baseline.py b/tensorflow/python/estimator/canned/baseline.py
index 980c057372..3c6816cb03 100644
--- a/tensorflow/python/estimator/canned/baseline.py
+++ b/tensorflow/python/estimator/canned/baseline.py
@@ -59,7 +59,7 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops.losses import losses
 from tensorflow.python.training import training_util
-from tensorflow.python.util.tf_export import tf_export
+from tensorflow.python.util.tf_export import estimator_export
 
 # The default learning rate of 0.3 is a historical artifact of the initial
 # implementation, but seems a reasonable choice.
@@ -174,7 +174,7 @@ def _baseline_model_fn(features, labels, mode, head, optimizer,
       train_op_fn=train_op_fn)
 
 
-@tf_export('estimator.BaselineClassifier')
+@estimator_export('estimator.BaselineClassifier')
 class BaselineClassifier(estimator.Estimator):
   """A classifier that can establish a simple baseline.
 
@@ -277,7 +277,7 @@ class BaselineClassifier(estimator.Estimator):
         config=config)
 
 
-@tf_export('estimator.BaselineRegressor')
+@estimator_export('estimator.BaselineRegressor')
 class BaselineRegressor(estimator.Estimator):
   """A regressor that can establish a simple baseline.
 
diff --git a/tensorflow/python/estimator/canned/boosted_trees.py b/tensorflow/python/estimator/canned/boosted_trees.py
index 4e6010a162..6b54f51ca6 100644
--- a/tensorflow/python/estimator/canned/boosted_trees.py
+++ b/tensorflow/python/estimator/canned/boosted_trees.py
@@ -39,7 +39,7 @@ from tensorflow.python.summary import summary
 from tensorflow.python.training import distribute as distribute_lib
 from tensorflow.python.training import session_run_hook
 from tensorflow.python.training import training_util
-from tensorflow.python.util.tf_export import tf_export
+from tensorflow.python.util.tf_export import estimator_export
 
 # TODO(nponomareva): Reveal pruning params here.
 _TreeHParams = collections.namedtuple('TreeHParams', [
@@ -712,7 +712,7 @@ def _create_regression_head(label_dimension, weight_column=None):
   # pylint: enable=protected-access
 
 
-@tf_export('estimator.BoostedTreesClassifier')
+@estimator_export('estimator.BoostedTreesClassifier')
 class BoostedTreesClassifier(estimator.Estimator):
   """A Classifier for Tensorflow Boosted Trees models."""
 
@@ -830,7 +830,7 @@ class BoostedTreesClassifier(estimator.Estimator):
         model_fn=_model_fn, model_dir=model_dir, config=config)
 
 
-@tf_export('estimator.BoostedTreesRegressor')
+@estimator_export('estimator.BoostedTreesRegressor')
 class BoostedTreesRegressor(estimator.Estimator):
   """A Regressor for Tensorflow Boosted Trees models."""
 
diff --git a/tensorflow/python/estimator/canned/dnn.py b/tensorflow/python/estimator/canned/dnn.py
index 1feac36f35..b924ad5df4 100644
--- a/tensorflow/python/estimator/canned/dnn.py
+++ b/tensorflow/python/estimator/canned/dnn.py
@@ -32,7 +32,7 @@ from tensorflow.python.ops import partitioned_variables
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops.losses import losses
 from tensorflow.python.summary import summary
-from tensorflow.python.util.tf_export import tf_export
+from tensorflow.python.util.tf_export import estimator_export
 
 # The default learning rate of 0.05 is a historical artifact of the initial
 # implementation, but seems a reasonable choice.
@@ -201,7 +201,7 @@ def _dnn_model_fn(features,
           logits=logits)
 
 
-@tf_export('estimator.DNNClassifier')
+@estimator_export('estimator.DNNClassifier')
 class DNNClassifier(estimator.Estimator):
   """A classifier for TensorFlow DNN models.
 
@@ -353,7 +353,7 @@ class DNNClassifier(estimator.Estimator):
         warm_start_from=warm_start_from)
 
 
-@tf_export('estimator.DNNRegressor')
+@estimator_export('estimator.DNNRegressor')
 class DNNRegressor(estimator.Estimator):
   """A regressor for TensorFlow DNN models.
 
diff --git a/tensorflow/python/estimator/canned/dnn_linear_combined.py b/tensorflow/python/estimator/canned/dnn_linear_combined.py
index 95efc0a028..64d81c46ce 100644
--- a/tensorflow/python/estimator/canned/dnn_linear_combined.py
+++ b/tensorflow/python/estimator/canned/dnn_linear_combined.py
@@ -37,7 +37,7 @@ from tensorflow.python.summary import summary
 from tensorflow.python.training import distribute as distribute_lib
 from tensorflow.python.training import sync_replicas_optimizer
 from tensorflow.python.training import training_util
-from tensorflow.python.util.tf_export import tf_export
+from tensorflow.python.util.tf_export import estimator_export
 
 # The default learning rates are a historical artifact of the initial
 # implementation.
@@ -225,7 +225,7 @@ def _dnn_linear_combined_model_fn(features,
       logits=logits)
 
 
-@tf_export('estimator.DNNLinearCombinedClassifier')
+@estimator_export('estimator.DNNLinearCombinedClassifier')
 class DNNLinearCombinedClassifier(estimator.Estimator):
   """An estimator for TensorFlow Linear and DNN joined classification models.
 
@@ -406,7 +406,7 @@ class DNNLinearCombinedClassifier(estimator.Estimator):
         warm_start_from=warm_start_from)
 
 
-@tf_export('estimator.DNNLinearCombinedRegressor')
+@estimator_export('estimator.DNNLinearCombinedRegressor')
 class DNNLinearCombinedRegressor(estimator.Estimator):
   """An estimator for TensorFlow Linear and DNN joined models for regression.
 
diff --git a/tensorflow/python/estimator/canned/linear.py b/tensorflow/python/estimator/canned/linear.py
index 81657f0c01..705fc3ce06 100644
--- a/tensorflow/python/estimator/canned/linear.py
+++ b/tensorflow/python/estimator/canned/linear.py
@@ -33,7 +33,7 @@ from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops.losses import losses
 from tensorflow.python.summary import summary
 from tensorflow.python.training import ftrl
-from tensorflow.python.util.tf_export import tf_export
+from tensorflow.python.util.tf_export import estimator_export
 
 
 # The default learning rate of 0.2 is a historical artifact of the initial
@@ -164,7 +164,7 @@ def _linear_model_fn(features, labels, mode, head, feature_columns, optimizer,
         logits=logits)
 
 
-@tf_export('estimator.LinearClassifier')
+@estimator_export('estimator.LinearClassifier')
 class LinearClassifier(estimator.Estimator):
   """Linear classifier model.
 
@@ -317,7 +317,7 @@ class LinearClassifier(estimator.Estimator):
         warm_start_from=warm_start_from)
 
 
-@tf_export('estimator.LinearRegressor')
+@estimator_export('estimator.LinearRegressor')
 class LinearRegressor(estimator.Estimator):
   """An estimator for TensorFlow Linear regression problems.
 
diff --git a/tensorflow/python/estimator/canned/parsing_utils.py b/tensorflow/python/estimator/canned/parsing_utils.py
index 74e5e5a1be..1ae0f1e9f7 100644
--- a/tensorflow/python/estimator/canned/parsing_utils.py
+++ b/tensorflow/python/estimator/canned/parsing_utils.py
@@ -23,10 +23,10 @@ import six
 from tensorflow.python.feature_column import feature_column as fc
 from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import parsing_ops
-from tensorflow.python.util.tf_export import tf_export
+from tensorflow.python.util.tf_export import estimator_export
 
 
-@tf_export('estimator.classifier_parse_example_spec')
+@estimator_export('estimator.classifier_parse_example_spec')
 def classifier_parse_example_spec(feature_columns,
                                   label_key,
                                   label_dtype=dtypes.int64,
@@ -166,7 +166,7 @@ def classifier_parse_example_spec(feature_columns,
   return parsing_spec
 
 
-@tf_export('estimator.regressor_parse_example_spec')
+@estimator_export('estimator.regressor_parse_example_spec')
 def regressor_parse_example_spec(feature_columns,
                                  label_key,
                                  label_dtype=dtypes.float32,
diff --git a/tensorflow/python/estimator/estimator.py b/tensorflow/python/estimator/estimator.py
index 4be1af1e66..41c25f1c73 100644
--- a/tensorflow/python/estimator/estimator.py
+++ b/tensorflow/python/estimator/estimator.py
@@ -66,14 +66,14 @@ from tensorflow.python.util import compat
 from tensorflow.python.util import compat_internal
 from tensorflow.python.util import function_utils
 from tensorflow.python.util import nest
-from tensorflow.python.util.tf_export import tf_export
+from tensorflow.python.util.tf_export import estimator_export
 
 
 _VALID_MODEL_FN_ARGS = set(
     ['features', 'labels', 'mode', 'params', 'self', 'config'])
 
 
-@tf_export('estimator.Estimator')
+@estimator_export('estimator.Estimator')
 class Estimator(object):
   """Estimator class to train and evaluate TensorFlow models.
 
@@ -566,7 +566,8 @@ class Estimator(object):
     allowed_overrides = set([
         '_call_input_fn', '_create_global_step',
         '_convert_train_steps_to_hooks', '_convert_eval_steps_to_hooks',
-        '_tf_api_names', '_validate_features_in_predict_input',
+        '_tf_api_names', '_estimator_api_names', '_estimator_api_constants',
+        '_validate_features_in_predict_input',
         '_call_model_fn', '_add_meta_graph_for_mode'
     ])
     estimator_members = set([m for m in Estimator.__dict__.keys()
@@ -1634,11 +1635,12 @@ def _has_dataset_or_queue_runner(maybe_tensor):
   # Now, check queue.
   return ops.get_default_graph().get_collection(ops.GraphKeys.QUEUE_RUNNERS)
 
+
 VocabInfo = warm_starting_util.VocabInfo  # pylint: disable=invalid-name
-tf_export('estimator.VocabInfo', allow_multiple_exports=True)(VocabInfo)
+estimator_export('estimator.VocabInfo')(VocabInfo)
 
 
-@tf_export('estimator.WarmStartSettings')
+@estimator_export('estimator.WarmStartSettings')
 class WarmStartSettings(
     collections.namedtuple('WarmStartSettings', [
         'ckpt_to_initialize_from',
diff --git a/tensorflow/python/estimator/export/export.py b/tensorflow/python/estimator/export/export.py
index ff19a0a7f4..010c0f3f59 100644
--- a/tensorflow/python/estimator/export/export.py
+++ b/tensorflow/python/estimator/export/export.py
@@ -34,7 +34,7 @@ from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.saved_model import signature_constants
 from tensorflow.python.saved_model import signature_def_utils
 from tensorflow.python.util import compat
-from tensorflow.python.util.tf_export import tf_export
+from tensorflow.python.util.tf_export import estimator_export
 
 _SINGLE_FEATURE_DEFAULT_NAME = 'feature'
 _SINGLE_RECEIVER_DEFAULT_NAME = 'input'
@@ -93,7 +93,7 @@ def _check_tensor_key(name, error_label='feature'):
     raise ValueError('{} keys must be strings: {}.'.format(error_label, name))
 
 
-@tf_export('estimator.export.ServingInputReceiver')
+@estimator_export('estimator.export.ServingInputReceiver')
 class ServingInputReceiver(
     collections.namedtuple(
         'ServingInputReceiver',
@@ -161,7 +161,7 @@ class ServingInputReceiver(
         receiver_tensors_alternatives=receiver_tensors_alternatives)
 
 
-@tf_export('estimator.export.TensorServingInputReceiver')
+@estimator_export('estimator.export.TensorServingInputReceiver')
 class TensorServingInputReceiver(
     collections.namedtuple(
         'TensorServingInputReceiver',
@@ -263,7 +263,7 @@ class SupervisedInputReceiver(
         receiver_tensors=receiver_tensors)
 
 
-@tf_export('estimator.export.build_parsing_serving_input_receiver_fn')
+@estimator_export('estimator.export.build_parsing_serving_input_receiver_fn')
 def build_parsing_serving_input_receiver_fn(feature_spec,
                                             default_batch_size=None):
   """Build a serving_input_receiver_fn expecting fed tf.Examples.
@@ -313,7 +313,7 @@ def _placeholders_from_receiver_tensors_dict(input_vals,
   }
 
 
-@tf_export('estimator.export.build_raw_serving_input_receiver_fn')
+@estimator_export('estimator.export.build_raw_serving_input_receiver_fn')
 def build_raw_serving_input_receiver_fn(features, default_batch_size=None):
   """Build a serving_input_receiver_fn expecting feature Tensors.
 
diff --git a/tensorflow/python/estimator/export/export_output.py b/tensorflow/python/estimator/export/export_output.py
index d387ea2940..6c26d29985 100644
--- a/tensorflow/python/estimator/export/export_output.py
+++ b/tensorflow/python/estimator/export/export_output.py
@@ -26,10 +26,10 @@ import six
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.saved_model import signature_def_utils
-from tensorflow.python.util.tf_export import tf_export
+from tensorflow.python.util.tf_export import estimator_export
 
 
-@tf_export('estimator.export.ExportOutput')
+@estimator_export('estimator.export.ExportOutput')
 class ExportOutput(object):
   """Represents an output of a model that can be served.
 
@@ -100,7 +100,7 @@ class ExportOutput(object):
     return output_dict
 
 
-@tf_export('estimator.export.ClassificationOutput')
+@estimator_export('estimator.export.ClassificationOutput')
 class ClassificationOutput(ExportOutput):
   """Represents the output of a classification head.
 
@@ -169,7 +169,7 @@ class ClassificationOutput(ExportOutput):
         examples, self.classes, self.scores)
 
 
-@tf_export('estimator.export.RegressionOutput')
+@estimator_export('estimator.export.RegressionOutput')
 class RegressionOutput(ExportOutput):
   """Represents the output of a regression head."""
 
@@ -202,7 +202,7 @@ class RegressionOutput(ExportOutput):
     return signature_def_utils.regression_signature_def(examples, self.value)
 
 
-@tf_export('estimator.export.PredictOutput')
+@estimator_export('estimator.export.PredictOutput')
 class PredictOutput(ExportOutput):
   """Represents the output of a generic prediction head.
 
diff --git a/tensorflow/python/estimator/exporter.py b/tensorflow/python/estimator/exporter.py
index 5981fa59b7..7cdf840c97 100644
--- a/tensorflow/python/estimator/exporter.py
+++ b/tensorflow/python/estimator/exporter.py
@@ -28,10 +28,10 @@ from tensorflow.python.framework import errors_impl
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import tf_logging
 from tensorflow.python.summary import summary_iterator
-from tensorflow.python.util.tf_export import tf_export
+from tensorflow.python.util.tf_export import estimator_export
 
 
-@tf_export('estimator.Exporter')
+@estimator_export('estimator.Exporter')
 class Exporter(object):
   """A class representing a type of model export."""
 
@@ -172,7 +172,7 @@ def _verify_compre_fn_args(compare_fn):
                      (compare_fn, non_valid_args))
 
 
-@tf_export('estimator.BestExporter')
+@estimator_export('estimator.BestExporter')
 class BestExporter(Exporter):
   """This class exports the serving graph and checkpoints of the best models.
 
@@ -367,7 +367,7 @@ class BestExporter(Exporter):
     return best_eval_result
 
 
-@tf_export('estimator.FinalExporter')
+@estimator_export('estimator.FinalExporter')
 class FinalExporter(Exporter):
   """This class exports the serving graph and checkpoints in the end.
 
@@ -418,7 +418,7 @@ class FinalExporter(Exporter):
                                              is_the_final_export)
 
 
-@tf_export('estimator.LatestExporter')
+@estimator_export('estimator.LatestExporter')
 class LatestExporter(Exporter):
   """This class regularly exports the serving graph and checkpoints.
 
diff --git a/tensorflow/python/estimator/inputs/numpy_io.py b/tensorflow/python/estimator/inputs/numpy_io.py
index a6f4712910..035c7c148c 100644
--- a/tensorflow/python/estimator/inputs/numpy_io.py
+++ b/tensorflow/python/estimator/inputs/numpy_io.py
@@ -24,7 +24,7 @@ import numpy as np
 from six import string_types
 
 from tensorflow.python.estimator.inputs.queues import feeding_functions
-from tensorflow.python.util.tf_export import tf_export
+from tensorflow.python.util.tf_export import estimator_export
 
 # Key name to pack the target into dict of `features`. See
 # `_get_unique_target_key` for details.
@@ -87,7 +87,7 @@ def _validate_and_convert_features(x):
   return ordered_dict_data
 
 
-@tf_export('estimator.inputs.numpy_input_fn')
+@estimator_export('estimator.inputs.numpy_input_fn')
 def numpy_input_fn(x,
                    y=None,
                    batch_size=128,
diff --git a/tensorflow/python/estimator/inputs/pandas_io.py b/tensorflow/python/estimator/inputs/pandas_io.py
index bd06843021..938e244fb3 100644
--- a/tensorflow/python/estimator/inputs/pandas_io.py
+++ b/tensorflow/python/estimator/inputs/pandas_io.py
@@ -21,7 +21,7 @@ from __future__ import print_function
 
 import numpy as np
 from tensorflow.python.estimator.inputs.queues import feeding_functions
-from tensorflow.python.util.tf_export import tf_export
+from tensorflow.python.util.tf_export import estimator_export
 
 try:
   # pylint: disable=g-import-not-at-top
@@ -35,7 +35,7 @@ except ImportError:
   HAS_PANDAS = False
 
 
-@tf_export('estimator.inputs.pandas_input_fn')
+@estimator_export('estimator.inputs.pandas_input_fn')
 def pandas_input_fn(x,
                     y=None,
                     batch_size=128,
diff --git a/tensorflow/python/estimator/model_fn.py b/tensorflow/python/estimator/model_fn.py
index 3edf9fe940..c60c7f63ba 100644
--- a/tensorflow/python/estimator/model_fn.py
+++ b/tensorflow/python/estimator/model_fn.py
@@ -32,10 +32,10 @@ from tensorflow.python.saved_model import tag_constants
 from tensorflow.python.training import monitored_session
 from tensorflow.python.training import session_run_hook
 from tensorflow.python.util import nest
-from tensorflow.python.util.tf_export import tf_export
+from tensorflow.python.util.tf_export import estimator_export
 
 
-@tf_export('estimator.ModeKeys')
+@estimator_export('estimator.ModeKeys')
 class ModeKeys(object):
   """Standard names for model modes.
 
@@ -62,7 +62,7 @@ EXPORT_TAG_MAP = {
 }
 
 
-@tf_export('estimator.EstimatorSpec')
+@estimator_export('estimator.EstimatorSpec')
 class EstimatorSpec(
     collections.namedtuple('EstimatorSpec', [
         'mode', 'predictions', 'loss', 'train_op', 'eval_metric_ops',
diff --git a/tensorflow/python/estimator/run_config.py b/tensorflow/python/estimator/run_config.py
index c7707be839..b948ce96e0 100644
--- a/tensorflow/python/estimator/run_config.py
+++ b/tensorflow/python/estimator/run_config.py
@@ -29,7 +29,7 @@ from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import server_lib
 from tensorflow.python.util import compat_internal
 from tensorflow.python.util import function_utils
-from tensorflow.python.util.tf_export import tf_export
+from tensorflow.python.util.tf_export import estimator_export
 
 
 _USE_DEFAULT = object()
@@ -296,7 +296,7 @@ class TaskType(object):
   EVALUATOR = 'evaluator'
 
 
-@tf_export('estimator.RunConfig')
+@estimator_export('estimator.RunConfig')
 class RunConfig(object):
   """This class specifies the configurations for an `Estimator` run."""
 
diff --git a/tensorflow/python/estimator/training.py b/tensorflow/python/estimator/training.py
index fb6a68b4f7..1572af579b 100644
--- a/tensorflow/python/estimator/training.py
+++ b/tensorflow/python/estimator/training.py
@@ -35,7 +35,7 @@ from tensorflow.python.training import basic_session_run_hooks
 from tensorflow.python.training import server_lib
 from tensorflow.python.training import session_run_hook
 from tensorflow.python.util import compat
-from tensorflow.python.util.tf_export import tf_export
+from tensorflow.python.util.tf_export import estimator_export
 
 _MAX_DELAY_SECS = 60
 _DELAY_SECS_PER_WORKER = 5
@@ -115,7 +115,7 @@ def _is_google_env():
   return tf_config.get(_ENVIRONMENT_KEY) == _ENVIRONMENT_GOOGLE_VALUE
 
 
-@tf_export('estimator.TrainSpec')
+@estimator_export('estimator.TrainSpec')
 class TrainSpec(
     collections.namedtuple('TrainSpec', ['input_fn', 'max_steps', 'hooks'])):
   """Configuration for the "train" part for the `train_and_evaluate` call.
@@ -167,7 +167,7 @@ class TrainSpec(
         cls, input_fn=input_fn, max_steps=max_steps, hooks=hooks)
 
 
-@tf_export('estimator.EvalSpec')
+@estimator_export('estimator.EvalSpec')
 class EvalSpec(
     collections.namedtuple('EvalSpec', [
         'input_fn', 'steps', 'name', 'hooks', 'exporters', 'start_delay_secs',
@@ -263,7 +263,7 @@ class EvalSpec(
         throttle_secs=throttle_secs)
 
 
-@tf_export('estimator.train_and_evaluate')
+@estimator_export('estimator.train_and_evaluate')
 def train_and_evaluate(estimator, train_spec, eval_spec):
   """Train and evaluate the `estimator`.
 
diff --git a/tensorflow/python/util/tf_export.py b/tensorflow/python/util/tf_export.py
index bf3961c692..e154ffb68a 100644
--- a/tensorflow/python/util/tf_export.py
+++ b/tensorflow/python/util/tf_export.py
@@ -41,17 +41,35 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import collections
+import functools
 import sys
 
 from tensorflow.python.util import tf_decorator
 
+ESTIMATOR_API_NAME = 'estimator'
+TENSORFLOW_API_NAME = 'tensorflow'
+
+_Attributes = collections.namedtuple(
+    'ExportedApiAttributes', ['names', 'constants'])
+
+# Attribute values must be unique to each API.
+API_ATTRS = {
+    TENSORFLOW_API_NAME: _Attributes(
+        '_tf_api_names',
+        '_tf_api_constants'),
+    ESTIMATOR_API_NAME: _Attributes(
+        '_estimator_api_names',
+        '_estimator_api_constants')
+}
+
 
 class SymbolAlreadyExposedError(Exception):
   """Raised when adding API names to symbol that already has API names."""
   pass
 
 
-class tf_export(object):  # pylint: disable=invalid-name
+class api_export(object):  # pylint: disable=invalid-name
   """Provides ways to export symbols to the TensorFlow API."""
 
   def __init__(self, *args, **kwargs):
@@ -63,15 +81,12 @@ class tf_export(object):  # pylint: disable=invalid-name
           overrides: List of symbols that this is overriding
           (those overrided api exports will be removed). Note: passing overrides
           has no effect on exporting a constant.
-          allow_multiple_exports: Allows exporting the same symbol multiple
-          times with multiple `tf_export` usages. Prefer however, to list all
-          of the exported names in a single `tf_export` usage when possible.
-
+          api_name: Name of the API you want to generate (e.g. `tensorflow` or
+          `estimator`). Default is `tensorflow`.
     """
     self._names = args
+    self._api_name = kwargs.get('api_name', TENSORFLOW_API_NAME)
     self._overrides = kwargs.get('overrides', [])
-    self._allow_multiple_exports = kwargs.get(
-        'allow_multiple_exports', False)
 
   def __call__(self, func):
     """Calls this decorator.
@@ -86,25 +101,24 @@ class tf_export(object):  # pylint: disable=invalid-name
       SymbolAlreadyExposedError: Raised when a symbol already has API names
         and kwarg `allow_multiple_exports` not set.
     """
+    api_names_attr = API_ATTRS[self._api_name].names
+
     # Undecorate overridden names
     for f in self._overrides:
       _, undecorated_f = tf_decorator.unwrap(f)
-      del undecorated_f._tf_api_names  # pylint: disable=protected-access
+      delattr(undecorated_f, api_names_attr)
 
     _, undecorated_func = tf_decorator.unwrap(func)
 
     # Check for an existing api. We check if attribute name is in
     # __dict__ instead of using hasattr to verify that subclasses have
     # their own _tf_api_names as opposed to just inheriting it.
-    if '_tf_api_names' in undecorated_func.__dict__:
-      if self._allow_multiple_exports:
-        undecorated_func._tf_api_names += self._names  # pylint: disable=protected-access
-      else:
-        raise SymbolAlreadyExposedError(
-            'Symbol %s is already exposed as %s.' %
-            (undecorated_func.__name__, undecorated_func._tf_api_names))  # pylint: disable=protected-access
-    else:
-      undecorated_func._tf_api_names = self._names  # pylint: disable=protected-access
+    if api_names_attr in undecorated_func.__dict__:
+      raise SymbolAlreadyExposedError(
+          'Symbol %s is already exposed as %s.' %
+          (undecorated_func.__name__, getattr(
+              undecorated_func, api_names_attr)))  # pylint: disable=protected-access
+    setattr(undecorated_func, api_names_attr, self._names)
     return func
 
   def export_constant(self, module_name, name):
@@ -126,8 +140,12 @@ class tf_export(object):  # pylint: disable=invalid-name
       name: (string) Current constant name.
     """
     module = sys.modules[module_name]
-    if not hasattr(module, '_tf_api_constants'):
-      module._tf_api_constants = []  # pylint: disable=protected-access
+    if not hasattr(module, API_ATTRS[self._api_name].constants):
+      setattr(module, API_ATTRS[self._api_name].constants, [])
     # pylint: disable=protected-access
-    module._tf_api_constants.append((self._names, name))
+    getattr(module, API_ATTRS[self._api_name].constants).append(
+        (self._names, name))
+
 
+tf_export = functools.partial(api_export, api_name=TENSORFLOW_API_NAME)
+estimator_export = functools.partial(tf_export, api_name=ESTIMATOR_API_NAME)
diff --git a/tensorflow/python/util/tf_export_test.py b/tensorflow/python/util/tf_export_test.py
index ace3f054ba..b9e26ecb33 100644
--- a/tensorflow/python/util/tf_export_test.py
+++ b/tensorflow/python/util/tf_export_test.py
@@ -128,13 +128,6 @@ class ValidateExportTest(test.TestCase):
     with self.assertRaises(tf_export.SymbolAlreadyExposedError):
       export_decorator(_test_function)
 
-  def testEAllowMultipleExports(self):
-    _test_function._tf_api_names = ['name1', 'name2']
-    tf_export.tf_export('nameRed', 'nameBlue', allow_multiple_exports=True)(
-        _test_function)
-    self.assertEquals(['name1', 'name2', 'nameRed', 'nameBlue'],
-                      _test_function._tf_api_names)
-
   def testOverridesFunction(self):
     _test_function2._tf_api_names = ['abc']
 
diff --git a/tensorflow/tools/api/generator/api_gen.bzl b/tensorflow/tools/api/generator/api_gen.bzl
index fe3e4d1434..41713a94ec 100644
--- a/tensorflow/tools/api/generator/api_gen.bzl
+++ b/tensorflow/tools/api/generator/api_gen.bzl
@@ -11,9 +11,6 @@ TENSORFLOW_API_INIT_FILES = [
     "distributions/__init__.py",
     "distributions/bijectors/__init__.py",
     "errors/__init__.py",
-    "estimator/__init__.py",
-    "estimator/export/__init__.py",
-    "estimator/inputs/__init__.py",
     "feature_column/__init__.py",
     "gfile/__init__.py",
     "graph_util/__init__.py",
@@ -91,6 +88,16 @@ TENSORFLOW_API_INIT_FILES = [
     # END GENERATED FILES
 ]
 
+# keep sorted
+ESTIMATOR_API_INIT_FILES = [
+    # BEGIN GENERATED ESTIMATOR FILES
+    "__init__.py",
+    "estimator/__init__.py",
+    "estimator/export/__init__.py",
+    "estimator/inputs/__init__.py",
+    # END GENERATED ESTIMATOR FILES
+]
+
 # Creates a genrule that generates a directory structure with __init__.py
 # files that import all exported modules (i.e. modules with tf_export
 # decorators).
@@ -110,7 +117,9 @@ TENSORFLOW_API_INIT_FILES = [
 def gen_api_init_files(name,
                        output_files=TENSORFLOW_API_INIT_FILES,
                        root_init_template=None,
-                       srcs=[]):
+                       srcs=[],
+                       api_name="tensorflow",
+                       package="tensorflow.python"):
   root_init_template_flag = ""
   if root_init_template:
     root_init_template_flag = "--root_init_template=$(location " + root_init_template + ")"
@@ -119,7 +128,8 @@ def gen_api_init_files(name,
       outs = output_files,
       cmd = (
           "$(location //tensorflow/tools/api/generator:create_python_api) " +
-          root_init_template_flag + " --apidir=$(@D) $(OUTS)"),
+          root_init_template_flag + " --apidir=$(@D) --apiname=" + api_name + " --package=" + package + " $(OUTS)"),
       srcs = srcs,
       tools = ["//tensorflow/tools/api/generator:create_python_api"],
+      visibility = ["//tensorflow:__pkg__"],
   )
diff --git a/tensorflow/tools/api/generator/create_python_api.py b/tensorflow/tools/api/generator/create_python_api.py
index de0a50ab44..972bdc84ae 100644
--- a/tensorflow/tools/api/generator/create_python_api.py
+++ b/tensorflow/tools/api/generator/create_python_api.py
@@ -25,10 +25,10 @@ import os
 import sys
 
 from tensorflow.python.util import tf_decorator
+from tensorflow.python.util import tf_export
 
+API_ATTRS = tf_export.API_ATTRS
 
-_API_CONSTANTS_ATTR = '_tf_api_constants'
-_API_NAMES_ATTR = '_tf_api_names'
 _DEFAULT_PACKAGE = 'tensorflow.python'
 _GENFILES_DIR_SUFFIX = 'genfiles/'
 _SYMBOLS_TO_SKIP_EXPLICITLY = {
@@ -154,12 +154,13 @@ __all__.extend([_s for _s in _names_with_underscore])
     return module_text_map
 
 
-def get_api_init_text(package):
+def get_api_init_text(package, api_name):
   """Get a map from destination module to __init__.py code for that module.
 
   Args:
     package: Base python package containing python with target tf_export
       decorators.
+    api_name: API you want to generate (e.g. `tensorflow` or `estimator`).
 
   Returns:
     A dictionary where
@@ -187,7 +188,7 @@ def get_api_init_text(package):
       attr = getattr(module, module_contents_name)
 
       # If attr is _tf_api_constants attribute, then add the constants.
-      if module_contents_name == _API_CONSTANTS_ATTR:
+      if module_contents_name == API_ATTRS[api_name].constants:
         for exports, value in attr:
           for export in exports:
             names = export.split('.')
@@ -196,15 +197,12 @@ def get_api_init_text(package):
                 -1, dest_module, module.__name__, value, names[-1])
         continue
 
-      try:
-        _, attr = tf_decorator.unwrap(attr)
-      except Exception as e:
-        print('5555: %s %s' % (module, module_contents_name), file=sys.stderr)
-        raise e
+      _, attr = tf_decorator.unwrap(attr)
       # If attr is a symbol with _tf_api_names attribute, then
       # add import for it.
-      if hasattr(attr, '__dict__') and _API_NAMES_ATTR in attr.__dict__:
-        for export in attr._tf_api_names:  # pylint: disable=protected-access
+      if (hasattr(attr, '__dict__') and
+          API_ATTRS[api_name].names in attr.__dict__):
+        for export in getattr(attr, API_ATTRS[api_name].names):  # pylint: disable=protected-access
           names = export.split('.')
           dest_module = '.'.join(names[:-1])
           module_code_builder.add_import(
@@ -241,7 +239,7 @@ def get_module(dir_path, relative_to_dir):
     relative_to_dir: Get module relative to this directory.
 
   Returns:
-    module that corresponds to the given directory.
+    Name of module that corresponds to the given directory.
   """
   dir_path = dir_path[len(relative_to_dir):]
   # Convert path separators to '/' for easier parsing below.
@@ -250,7 +248,7 @@ def get_module(dir_path, relative_to_dir):
 
 
 def create_api_files(
-    output_files, package, root_init_template, output_dir):
+    output_files, package, root_init_template, output_dir, api_name):
   """Creates __init__.py files for the Python API.
 
   Args:
@@ -262,6 +260,7 @@ def create_api_files(
       "#API IMPORTS PLACEHOLDER" comment in the template file will be replaced
       with imports.
     output_dir: output API root directory.
+    api_name: API you want to generate (e.g. `tensorflow` or `estimator`).
 
   Raises:
     ValueError: if an output file is not under api/ directory,
@@ -278,7 +277,7 @@ def create_api_files(
       os.makedirs(os.path.dirname(file_path))
     open(file_path, 'a').close()
 
-  module_text_map = get_api_init_text(package)
+  module_text_map = get_api_init_text(package, api_name)
 
   # Add imports to output files.
   missing_output_files = []
@@ -329,6 +328,10 @@ def main():
       help='Directory where generated output files are placed. '
            'gendir should be a prefix of apidir. Also, apidir '
            'should be a prefix of every directory in outputs.')
+  parser.add_argument(
+      '--apiname', required=True, type=str,
+      choices=API_ATTRS.keys(),
+      help='The API you want to generate.')
 
   args = parser.parse_args()
 
@@ -342,8 +345,8 @@ def main():
 
   # Populate `sys.modules` with modules containing tf_export().
   importlib.import_module(args.package)
-  create_api_files(
-      outputs, args.package, args.root_init_template, args.apidir)
+  create_api_files(outputs, args.package, args.root_init_template,
+                   args.apidir, args.apiname)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/tools/api/generator/create_python_api_test.py b/tensorflow/tools/api/generator/create_python_api_test.py
index 986340cf6d..651ec9d040 100644
--- a/tensorflow/tools/api/generator/create_python_api_test.py
+++ b/tensorflow/tools/api/generator/create_python_api_test.py
@@ -57,7 +57,8 @@ class CreatePythonApiTest(test.TestCase):
 
   def testFunctionImportIsAdded(self):
     imports = create_python_api.get_api_init_text(
-        package=create_python_api._DEFAULT_PACKAGE)
+        package=create_python_api._DEFAULT_PACKAGE,
+        api_name='tensorflow')
     expected_import = (
         'from tensorflow.python.test_module '
         'import test_op as test_op1')
@@ -73,7 +74,8 @@ class CreatePythonApiTest(test.TestCase):
 
   def testClassImportIsAdded(self):
     imports = create_python_api.get_api_init_text(
-        package=create_python_api._DEFAULT_PACKAGE)
+        package=create_python_api._DEFAULT_PACKAGE,
+        api_name='tensorflow')
     expected_import = ('from tensorflow.python.test_module '
                        'import TestClass')
     self.assertTrue(
@@ -82,7 +84,8 @@ class CreatePythonApiTest(test.TestCase):
 
   def testConstantIsAdded(self):
     imports = create_python_api.get_api_init_text(
-        package=create_python_api._DEFAULT_PACKAGE)
+        package=create_python_api._DEFAULT_PACKAGE,
+        api_name='tensorflow')
     expected = ('from tensorflow.python.test_module '
                 'import _TEST_CONSTANT')
     self.assertTrue(expected in str(imports),
-- 
GitLab


From 0dab0f538b78b0a0f1ec4f7dc5fb3005b5efdc94 Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Thu, 7 Jun 2018 12:07:18 -0700
Subject: [PATCH 444/610] Avoid unnecessary `DoneCallback` copies in
 functional_ops.cc.

PiperOrigin-RevId: 199674121
---
 tensorflow/core/kernels/functional_ops.cc | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tensorflow/core/kernels/functional_ops.cc b/tensorflow/core/kernels/functional_ops.cc
index e0d594fa25..e0be57f972 100644
--- a/tensorflow/core/kernels/functional_ops.cc
+++ b/tensorflow/core/kernels/functional_ops.cc
@@ -152,7 +152,7 @@ class IfOp : public AsyncOpKernel {
         : kernel_(kernel),
           ctx_(ctx),
           cond_(cond),
-          done_(done),
+          done_(std::move(done)),
           lib_(CHECK_NOTNULL(ctx_->function_library())) {
       SetRunOptions(ctx_, &opts_, true /* always_collect_stats */);
       for (int i = 1; i < ctx_->num_inputs(); ++i) {
@@ -174,9 +174,9 @@ class IfOp : public AsyncOpKernel {
               s = SetOutputs(kernel_, ctx_, rets_);
             }
             ctx_->SetStatus(s);
-            auto done = done_;
+            DoneCallback captured_done(std::move(done_));
             delete this;
-            done();
+            captured_done();
           });
     }
 
@@ -257,7 +257,7 @@ class WhileOp : public AsyncOpKernel {
           ctx_(ctx),
           cond_handle_(cond_handle),
           body_handle_(body_handle),
-          done_(done),
+          done_(std::move(done)),
           lib_(CHECK_NOTNULL(ctx_->function_library())) {
       SetRunOptions(ctx_, &opts_, false /* always_collect_stats */);
       for (int i = 0; i < ctx_->num_inputs(); ++i) {
-- 
GitLab


From 5c74172fa5bd9f2ae6275d536f70971810a40548 Mon Sep 17 00:00:00 2001
From: Nupur Garg <nupurgarg@google.com>
Date: Thu, 7 Jun 2018 12:20:28 -0700
Subject: [PATCH 445/610] Add features to TOCO Python API.

PiperOrigin-RevId: 199676295
---
 tensorflow/contrib/lite/python/convert.py     | 13 ++++++-
 tensorflow/contrib/lite/python/lite.py        | 12 ++++++-
 tensorflow/contrib/lite/python/lite_test.py   | 34 +++++++++++++++++++
 .../contrib/lite/python/tflite_convert.py     | 22 ++++++++++++
 tensorflow/contrib/lite/toco/python/BUILD     |  1 +
 .../lite/toco/python/toco_python_api.cc       | 13 ++++++-
 tensorflow/contrib/lite/toco/toco_flags.proto |  9 +++++
 7 files changed, 101 insertions(+), 3 deletions(-)

diff --git a/tensorflow/contrib/lite/python/convert.py b/tensorflow/contrib/lite/python/convert.py
index 08f3f8bf32..fce8ffb54a 100644
--- a/tensorflow/contrib/lite/python/convert.py
+++ b/tensorflow/contrib/lite/python/convert.py
@@ -124,7 +124,9 @@ def toco_convert(input_data,
                  reorder_across_fake_quant=False,
                  allow_custom_ops=False,
                  change_concat_input_ranges=False,
-                 quantize_weights=False):
+                 quantize_weights=False,
+                 dump_graphviz_dir=None,
+                 dump_graphviz_video=False):
   """Convert a model using TOCO from `input_format` to `output_format`.
 
   Typically this is to convert from TensorFlow GraphDef to TFLite, in which
@@ -170,6 +172,12 @@ def toco_convert(input_data,
       weights followed by dequantize operations. Computation is still done in
       float, but reduces model size (at the cost of accuracy and latency).
       (default False)
+    dump_graphviz_dir: Full filepath of folder to dump the graphs at various
+      stages of processing GraphViz .dot files. Preferred over
+      --output_format=GRAPHVIZ_DOT in order to keep the requirements of the
+      output file. (default None)
+    dump_graphviz_video: Boolean indicating whether to dump the graph after
+      every graph transformation. (default False)
 
   Returns:
     The converted data. For example if TFLite was the destination, then
@@ -193,6 +201,9 @@ def toco_convert(input_data,
   if default_ranges_stats:
     toco.default_ranges_min = default_ranges_stats[0]
     toco.default_ranges_max = default_ranges_stats[1]
+  if dump_graphviz_dir:
+    toco.dump_graphviz_dir = dump_graphviz_dir
+  toco.dump_graphviz_include_video = dump_graphviz_video
 
   model = _model_flags_pb2.ModelFlags()
   model.change_concat_input_ranges = change_concat_input_ranges
diff --git a/tensorflow/contrib/lite/python/lite.py b/tensorflow/contrib/lite/python/lite.py
index e3a2d19e05..4fb88c1ad6 100644
--- a/tensorflow/contrib/lite/python/lite.py
+++ b/tensorflow/contrib/lite/python/lite.py
@@ -96,6 +96,12 @@ class TocoConverter(object):
       weights followed by dequantize operations. Computation is still done in
       float, but reduces model size (at the cost of accuracy and latency).
       (default False)
+    dump_graphviz_dir: Full filepath of folder to dump the graphs at various
+      stages of processing GraphViz .dot files. Preferred over
+      --output_format=GRAPHVIZ_DOT in order to keep the requirements of the
+      output file. (default None)
+    dump_graphviz_video: Boolean indicating whether to dump the graph after
+      every graph transformation. (default False)
 
   Example usage:
 
@@ -138,6 +144,8 @@ class TocoConverter(object):
     self.change_concat_input_ranges = False
     self.allow_custom_ops = False
     self.quantize_weights = False
+    self.dump_graphviz_dir = None
+    self.dump_graphviz_video = False
 
   @classmethod
   def from_session(cls, sess, input_tensors, output_tensors):
@@ -308,7 +316,9 @@ class TocoConverter(object):
         reorder_across_fake_quant=self.reorder_across_fake_quant,
         change_concat_input_ranges=self.change_concat_input_ranges,
         allow_custom_ops=self.allow_custom_ops,
-        quantize_weights=self.quantize_weights)
+        quantize_weights=self.quantize_weights,
+        dump_graphviz_dir=self.dump_graphviz_dir,
+        dump_graphviz_video=self.dump_graphviz_video)
     return result
 
   def get_input_arrays(self):
diff --git a/tensorflow/contrib/lite/python/lite_test.py b/tensorflow/contrib/lite/python/lite_test.py
index b04caaf263..8c9d2c1651 100644
--- a/tensorflow/contrib/lite/python/lite_test.py
+++ b/tensorflow/contrib/lite/python/lite_test.py
@@ -220,6 +220,7 @@ class FromSessionTest(test_util.TensorFlowTestCase):
     self.assertTrue(([1, 16, 16, 3] == output_details[0]['shape']).all())
     self.assertEqual((0., 0.), output_details[0]['quantization'])
 
+  # TODO(nupurgarg): Verify value of contents in GraphViz.
   def testGraphviz(self):
     in_tensor = array_ops.placeholder(
         shape=[1, 16, 16, 3], dtype=dtypes.float32)
@@ -232,6 +233,39 @@ class FromSessionTest(test_util.TensorFlowTestCase):
     graphviz_output = converter.convert()
     self.assertTrue(graphviz_output)
 
+  # TODO(nupurgarg): Verify value of contents in GraphViz.
+  def testDumpGraphviz(self):
+    in_tensor = array_ops.placeholder(
+        shape=[1, 16, 16, 3], dtype=dtypes.float32)
+    out_tensor = in_tensor + in_tensor
+    sess = session.Session()
+
+    # Convert model and ensure model is not None.
+    converter = lite.TocoConverter.from_session(sess, [in_tensor], [out_tensor])
+    graphviz_dir = self.get_temp_dir()
+    converter.dump_graphviz_dir = graphviz_dir
+    tflite_model = converter.convert()
+    self.assertTrue(tflite_model)
+
+    # Ensure interpreter is able to allocate and check graphviz data.
+    interpreter = Interpreter(model_content=tflite_model)
+    interpreter.allocate_tensors()
+
+    num_items_graphviz = len(os.listdir(graphviz_dir))
+    self.assertTrue(num_items_graphviz)
+
+    # Convert model and ensure model is not None.
+    converter = lite.TocoConverter.from_session(sess, [in_tensor], [out_tensor])
+    graphviz_dir = self.get_temp_dir()
+    converter.dump_graphviz_dir = graphviz_dir
+    converter.dump_graphviz_video = True
+    tflite_model = converter.convert()
+    self.assertTrue(tflite_model)
+
+    # Ensure graphviz folder has more data after using video flag.
+    num_items_graphviz_video = len(os.listdir(graphviz_dir))
+    self.assertTrue(num_items_graphviz_video > num_items_graphviz)
+
   def testInferenceInputType(self):
     in_tensor = array_ops.placeholder(shape=[1, 16, 16, 3], dtype=dtypes.uint8)
     out_tensor = in_tensor + in_tensor
diff --git a/tensorflow/contrib/lite/python/tflite_convert.py b/tensorflow/contrib/lite/python/tflite_convert.py
index 4c215b62b2..492d2632fe 100644
--- a/tensorflow/contrib/lite/python/tflite_convert.py
+++ b/tensorflow/contrib/lite/python/tflite_convert.py
@@ -130,6 +130,10 @@ def _convert_model(flags):
     converter.allow_custom_ops = flags.allow_custom_ops
   if flags.quantize_weights:
     converter.quantize_weights = flags.quantize_weights
+  if flags.dump_graphviz_dir:
+    converter.dump_graphviz_dir = flags.dump_graphviz_dir
+  if flags.dump_graphviz_video:
+    converter.dump_graphviz_video = flags.dump_graphviz_video
 
   # Convert model.
   output_data = converter.convert()
@@ -161,8 +165,12 @@ def _check_flags(flags, unparsed):
     output = ""
     for flag in unparsed:
       output += _get_message_unparsed(flag, "--input_file", "--graph_def_file")
+      output += _get_message_unparsed(flag, "--savedmodel_directory",
+                                      "--saved_model_dir")
       output += _get_message_unparsed(flag, "--std_value", "--std_dev_values")
       output += _get_message_unparsed(flag, "--batch_size", "--input_shapes")
+      output += _get_message_unparsed(flag, "--dump_graphviz",
+                                      "--dump_graphviz_dir")
     if output:
       raise ValueError(output)
 
@@ -322,6 +330,20 @@ def run_main(_):
             "provide these to the TensorFlow Lite runtime with a custom "
             "resolver. (default False)"))
 
+  # Logging flags.
+  parser.add_argument(
+      "--dump_graphviz_dir",
+      type=str,
+      help=("Full filepath of folder to dump the graphs at various stages of "
+            "processing GraphViz .dot files. Preferred over --output_format="
+            "GRAPHVIZ_DOT in order to keep the requirements of the output "
+            "file."))
+  parser.add_argument(
+      "--dump_graphviz_video",
+      action="store_true",
+      help=("Boolean indicating whether to dump the graph after every graph "
+            "transformation"))
+
   tflite_flags, unparsed = parser.parse_known_args(args=sys.argv[1:])
   try:
     _check_flags(tflite_flags, unparsed)
diff --git a/tensorflow/contrib/lite/toco/python/BUILD b/tensorflow/contrib/lite/toco/python/BUILD
index a954f1d6ba..93fe756a55 100644
--- a/tensorflow/contrib/lite/toco/python/BUILD
+++ b/tensorflow/contrib/lite/toco/python/BUILD
@@ -12,6 +12,7 @@ cc_library(
     deps = [
         "//tensorflow/contrib/lite/toco:model_flags_proto_cc",
         "//tensorflow/contrib/lite/toco:toco_flags_proto_cc",
+        "//tensorflow/contrib/lite/toco:toco_graphviz_dump_options",
         "//tensorflow/contrib/lite/toco:toco_port",
         "//tensorflow/contrib/lite/toco:toco_tooling",
         "//tensorflow/core:lib",
diff --git a/tensorflow/contrib/lite/toco/python/toco_python_api.cc b/tensorflow/contrib/lite/toco/python/toco_python_api.cc
index 5b1db852b4..d93e104038 100644
--- a/tensorflow/contrib/lite/toco/python/toco_python_api.cc
+++ b/tensorflow/contrib/lite/toco/python/toco_python_api.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include "tensorflow/contrib/lite/toco/model_flags.pb.h"
 #include "tensorflow/contrib/lite/toco/python/toco_python_api.h"
 #include "tensorflow/contrib/lite/toco/toco_flags.pb.h"
+#include "tensorflow/contrib/lite/toco/toco_graphviz_dump_options.h"
 #include "tensorflow/contrib/lite/toco/toco_port.h"
 #include "tensorflow/contrib/lite/toco/toco_tooling.h"
 #include "tensorflow/contrib/lite/toco/toco_types.h"
@@ -62,7 +63,7 @@ PyObject* TocoConvert(PyObject* model_flags_proto_txt_raw,
   std::string input_contents_txt = ConvertArg(input_contents_txt_raw, &error);
   if (error) return nullptr;
 
-  // Use toco to produce new outputs
+  // Use TOCO to produce new outputs.
   toco::ModelFlags model_flags;
   if (!model_flags.ParseFromString(model_flags_proto_txt)) {
     LOG(FATAL) << "Model proto failed to parse." << std::endl;
@@ -71,6 +72,16 @@ PyObject* TocoConvert(PyObject* model_flags_proto_txt_raw,
   if (!toco_flags.ParseFromString(toco_flags_proto_txt)) {
     LOG(FATAL) << "Toco proto failed to parse." << std::endl;
   }
+
+  auto& dump_options = *GraphVizDumpOptions::singleton();
+  if (toco_flags.has_dump_graphviz_dir()) {
+    dump_options.dump_graphviz = toco_flags.dump_graphviz_dir();
+  }
+  if (toco_flags.has_dump_graphviz_include_video()) {
+    dump_options.dump_graphviz_video = toco_flags.dump_graphviz_include_video();
+  }
+
+  // Convert model.
   std::unique_ptr<toco::Model> model =
       toco::Import(toco_flags, model_flags, input_contents_txt);
   toco::Transform(toco_flags, model.get());
diff --git a/tensorflow/contrib/lite/toco/toco_flags.proto b/tensorflow/contrib/lite/toco/toco_flags.proto
index 4fe57879fb..ad4e94ded9 100644
--- a/tensorflow/contrib/lite/toco/toco_flags.proto
+++ b/tensorflow/contrib/lite/toco/toco_flags.proto
@@ -174,4 +174,13 @@ message TocoFlags {
   // Computation is still done in float, but reduces model size (at the cost of
   // accuracy and latency).
   optional bool quantize_weights = 20 [default = false];
+
+  // Full filepath of folder to dump the graphs at various stages of processing
+  // GraphViz .dot files. Preferred over --output_format=GRAPHVIZ_DOT in order
+  // to keep the requirements of the output file.
+  optional string dump_graphviz_dir = 24;
+
+  // Boolean indicating whether to dump the graph after every graph
+  // transformation.
+  optional bool dump_graphviz_include_video = 25;
 }
-- 
GitLab


From 6f20926fb7a181c44cca6191eec8961040d83cd1 Mon Sep 17 00:00:00 2001
From: Justin Lebar <jlebar@google.com>
Date: Thu, 7 Jun 2018 12:21:29 -0700
Subject: [PATCH 446/610] [XLA] Don't de-emphasize copy nodes in graph dumps.

PiperOrigin-RevId: 199676435
---
 tensorflow/compiler/xla/service/hlo_graph_dumper.cc | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
index 61612bebd1..a6750460e5 100644
--- a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
+++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
@@ -975,7 +975,6 @@ ColorScheme HloDotDumper::GetInstructionColor(const HloInstruction* instr) {
       }
       return kGreen;
     case HloOpcode::kConcatenate:
-    case HloOpcode::kCopy:
     case HloOpcode::kDynamicSlice:
     case HloOpcode::kGather:
     case HloOpcode::kPad:
@@ -997,6 +996,10 @@ ColorScheme HloDotDumper::GetInstructionColor(const HloInstruction* instr) {
         return kWhite;
       }
       return kGreen;
+    case HloOpcode::kCopy:
+      // Emphasize copy nodes, which are either physical transposes (and thus
+      // significant), or copies of read-only buffers (and thus dead weight).
+      return kGreen;
     case HloOpcode::kConvolution:
     case HloOpcode::kDot:
     case HloOpcode::kFft:
-- 
GitLab


From 2857228ba6c7b357185e7a0af346f4fc93a10f74 Mon Sep 17 00:00:00 2001
From: Shashi Shekhar <shashishekhar@google.com>
Date: Thu, 7 Jun 2018 12:23:10 -0700
Subject: [PATCH 447/610] Misc fixes to benchmarks.

PiperOrigin-RevId: 199676652
---
 .../contrib/lite/profiling/profile_summarizer.cc      | 11 +++++++++--
 tensorflow/contrib/lite/tools/benchmark/BUILD         |  3 ++-
 2 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/tensorflow/contrib/lite/profiling/profile_summarizer.cc b/tensorflow/contrib/lite/profiling/profile_summarizer.cc
index 6f2c9cd2b3..45388b500c 100644
--- a/tensorflow/contrib/lite/profiling/profile_summarizer.cc
+++ b/tensorflow/contrib/lite/profiling/profile_summarizer.cc
@@ -85,11 +85,18 @@ OperatorDetails GetOperatorDetails(const tflite::Interpreter& interpreter,
   return details;
 }
 
+tensorflow::StatSummarizerOptions GetProfileSummarizerOptions() {
+  auto options = tensorflow::StatSummarizerOptions();
+  options.show_summary = true;
+  options.show_memory = false;
+  return options;
+}
+
 }  // namespace
 
 ProfileSummarizer::ProfileSummarizer()
-    : stats_calculator_(new ::tensorflow::StatsCalculator(
-          tensorflow::StatSummarizerOptions())) {}
+    : stats_calculator_(
+          new ::tensorflow::StatsCalculator(GetProfileSummarizerOptions())) {}
 
 void ProfileSummarizer::ProcessProfiles(
     const std::vector<const ProfileEvent*>& profile_stats,
diff --git a/tensorflow/contrib/lite/tools/benchmark/BUILD b/tensorflow/contrib/lite/tools/benchmark/BUILD
index c5aa27d07c..f918010e2b 100644
--- a/tensorflow/contrib/lite/tools/benchmark/BUILD
+++ b/tensorflow/contrib/lite/tools/benchmark/BUILD
@@ -6,6 +6,7 @@ licenses(["notice"])  # Apache 2.0
 
 load("//tensorflow/contrib/lite:special_rules.bzl", "tflite_portable_test_suite")
 load("//tensorflow/contrib/lite:build_def.bzl", "tflite_linkopts")
+load("//tensorflow/contrib/lite:build_def.bzl", "tflite_copts")
 
 common_copts = ["-Wall"]
 
@@ -15,7 +16,7 @@ cc_binary(
         "benchmark_main.cc",
         "logging.h",
     ],
-    copts = common_copts,
+    copts = tflite_copts() + common_copts,
     linkopts = select({
         "//tensorflow:android": [
             "-pie",
-- 
GitLab


From 9639db8d18d979e98061504a2c6ee4bba0f74610 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 7 Jun 2018 12:52:35 -0700
Subject: [PATCH 448/610] Add TransformDiagonal higher-order bijector to
 transform only the diagonal of a matrix.

PiperOrigin-RevId: 199680859
---
 tensorflow/contrib/distributions/BUILD        |  19 ++++
 .../bijectors/transform_diagonal_test.py      |  66 ++++++++++++
 .../python/ops/bijectors/__init__.py          |   2 +
 .../ops/bijectors/transform_diagonal.py       | 102 ++++++++++++++++++
 4 files changed, 189 insertions(+)
 create mode 100644 tensorflow/contrib/distributions/python/kernel_tests/bijectors/transform_diagonal_test.py
 create mode 100644 tensorflow/contrib/distributions/python/ops/bijectors/transform_diagonal.py

diff --git a/tensorflow/contrib/distributions/BUILD b/tensorflow/contrib/distributions/BUILD
index d8baf49e81..61d4e90ea2 100644
--- a/tensorflow/contrib/distributions/BUILD
+++ b/tensorflow/contrib/distributions/BUILD
@@ -1254,6 +1254,25 @@ cuda_py_test(
     ],
 )
 
+cuda_py_test(
+    name = "transform_diagonal_test",
+    size = "small",
+    srcs = ["python/kernel_tests/bijectors/transform_diagonal_test.py"],
+    additional_deps = [
+        ":bijectors_py",
+        ":distributions_py",
+        "//third_party/py/numpy",
+        "@six_archive//:six",
+        "//tensorflow/contrib/linalg:linalg_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
 cuda_py_test(
     name = "weibull_test",
     size = "small",
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/transform_diagonal_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/transform_diagonal_test.py
new file mode 100644
index 0000000000..6428a68702
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/transform_diagonal_test.py
@@ -0,0 +1,66 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for TransformDiagonal bijector."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.distributions.python.ops import bijectors
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import test
+
+
+class TransformDiagonalBijectorTest(test.TestCase):
+  """Tests correctness of the TransformDiagonal bijector."""
+
+  def setUp(self):
+    self._rng = np.random.RandomState(42)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testBijector(self):
+    x = np.float32(self._rng.randn(3, 4, 4))  # Seeded RNG from setUp.
+
+    y = x.copy()
+    for i in range(x.shape[0]):
+      np.fill_diagonal(y[i, :, :], np.exp(np.diag(x[i, :, :])))
+
+    exp = bijectors.Exp()
+    b = bijectors.TransformDiagonal(diag_bijector=exp)
+
+    y_ = self.evaluate(b.forward(x))
+    self.assertAllClose(y, y_)
+
+    x_ = self.evaluate(b.inverse(y))
+    self.assertAllClose(x, x_)
+
+    fldj = self.evaluate(b.forward_log_det_jacobian(x, event_ndims=2))
+    ildj = self.evaluate(b.inverse_log_det_jacobian(y, event_ndims=2))
+    self.assertAllEqual(
+        fldj,
+        self.evaluate(exp.forward_log_det_jacobian(
+            np.array([np.diag(x_mat) for x_mat in x]),
+            event_ndims=1)))
+    self.assertAllEqual(
+        ildj,
+        self.evaluate(exp.inverse_log_det_jacobian(
+            np.array([np.diag(y_mat) for y_mat in y]),
+            event_ndims=1)))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/__init__.py b/tensorflow/contrib/distributions/python/ops/bijectors/__init__.py
index 59b8cf1bb2..d97a1f0d30 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/__init__.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/__init__.py
@@ -43,6 +43,7 @@
 @@Softplus
 @@Softsign
 @@Square
+@@TransformDiagonal
 @@Weibull
 
 @@masked_autoregressive_default_template
@@ -83,6 +84,7 @@ from tensorflow.contrib.distributions.python.ops.bijectors.softmax_centered impo
 from tensorflow.contrib.distributions.python.ops.bijectors.softplus import *
 from tensorflow.contrib.distributions.python.ops.bijectors.softsign import *
 from tensorflow.contrib.distributions.python.ops.bijectors.square import *
+from tensorflow.contrib.distributions.python.ops.bijectors.transform_diagonal import *
 from tensorflow.python.ops.distributions.bijector import *
 from tensorflow.python.ops.distributions.identity_bijector import Identity
 
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/transform_diagonal.py b/tensorflow/contrib/distributions/python/ops/bijectors/transform_diagonal.py
new file mode 100644
index 0000000000..65669fc2bf
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/transform_diagonal.py
@@ -0,0 +1,102 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""TransformDiagonal bijector."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops.distributions import bijector
+
+__all__ = [
+    "TransformDiagonal",
+]
+
+
+class TransformDiagonal(bijector.Bijector):
+  """Applies a Bijector to the diagonal of a matrix.
+
+  #### Example
+
+  ```python
+  b = tfb.TransformDiagonal(diag_bijector=tfb.Exp())
+
+  b.forward([[1., 0.],
+             [0., 1.]])
+  # ==> [[2.718, 0.],
+  #      [0., 2.718]]
+  ```
+
+  """
+
+  def __init__(self,
+               diag_bijector,
+               validate_args=False,
+               name="transform_diagonal"):
+    """Instantiates the `TransformDiagonal` bijector.
+
+    Args:
+      diag_bijector: `Bijector` instance used to transform the diagonal.
+      validate_args: Python `bool` indicating whether arguments should be
+        checked for correctness.
+      name: Python `str` name given to ops managed by this object.
+    """
+    self._diag_bijector = diag_bijector
+    super(TransformDiagonal, self).__init__(
+        forward_min_event_ndims=2,
+        inverse_min_event_ndims=2,
+        validate_args=validate_args,
+        name=name)
+
+  def _forward(self, x):
+    diag = self._diag_bijector.forward(array_ops.matrix_diag_part(x))
+    return array_ops.matrix_set_diag(x, diag)
+
+  def _inverse(self, y):
+    diag = self._diag_bijector.inverse(array_ops.matrix_diag_part(y))
+    return array_ops.matrix_set_diag(y, diag)
+
+  def _forward_log_det_jacobian(self, x):
+    # We formulate the Jacobian with respect to the flattened matrices
+    # `vec(x)` and `vec(y)`. Suppose for notational convenience that
+    # the first `n` entries of `vec(x)` are the diagonal of `x`, and
+    # the remaining `n**2-n` entries are the off-diagonals in
+    # arbitrary order. Then the Jacobian is a block-diagonal matrix,
+    # with the Jacobian of the diagonal bijector in the first block,
+    # and the identity Jacobian for the remaining entries (since this
+    # bijector acts as the identity on non-diagonal entries):
+    #
+    # J_vec(x) (vec(y)) =
+    # -------------------------------
+    # | J_diag(x) (diag(y))      0  | n entries
+    # |                             |
+    # | 0                        I  | n**2-n entries
+    # -------------------------------
+    #   n                     n**2-n
+    #
+    # Since the log-det of the second (identity) block is zero, the
+    # overall log-det-jacobian is just the log-det of first block,
+    # from the diagonal bijector.
+    #
+    # Note that for elementwise operations (exp, softplus, etc) the
+    # first block of the Jacobian will itself be a diagonal matrix,
+    # but our implementation does not require this to be true.
+    return self._diag_bijector.forward_log_det_jacobian(
+        array_ops.matrix_diag_part(x), event_ndims=1)
+
+  def _inverse_log_det_jacobian(self, y):
+    return self._diag_bijector.inverse_log_det_jacobian(
+        array_ops.matrix_diag_part(y), event_ndims=1)
-- 
GitLab


From 09c25a87cf321f317662f67d1b08deb3585e9abe Mon Sep 17 00:00:00 2001
From: Shashi Shekhar <shashishekhar@google.com>
Date: Thu, 7 Jun 2018 12:55:59 -0700
Subject: [PATCH 449/610] Update documentation.

PiperOrigin-RevId: 199681316
---
 .../contrib/lite/tools/benchmark/README.md    | 104 ++++++++----------
 1 file changed, 45 insertions(+), 59 deletions(-)

diff --git a/tensorflow/contrib/lite/tools/benchmark/README.md b/tensorflow/contrib/lite/tools/benchmark/README.md
index e6f333aa5b..2788f76faf 100644
--- a/tensorflow/contrib/lite/tools/benchmark/README.md
+++ b/tensorflow/contrib/lite/tools/benchmark/README.md
@@ -93,80 +93,66 @@ This compiles TFLite with profiling enabled, now you can run the benchmark binar
 
 ============================== Run Order ==============================
 	             [node type]	  [start]	  [first]	 [avg ms]	     [%]	  [cdf%]	  [mem KB]	[times called]	[Name]
-	                 CONV_2D	    0.000	    9.132	    9.132	  0.121%	  0.121%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_0/Relu6]
-	       DEPTHWISE_CONV_2D	    9.135	    3.280	    3.280	  0.043%	  0.165%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_1_depthwise/Relu6]
-	                 CONV_2D	   12.419	    6.877	    6.877	  0.091%	  0.256%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_1_pointwise/Relu6]
-	       DEPTHWISE_CONV_2D	   19.299	    1.708	    1.708	  0.023%	  0.278%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_2_depthwise/Relu6]
-	                 CONV_2D	   21.012	    4.162	    4.162	  0.055%	  0.334%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_2_pointwise/Relu6]
-	       DEPTHWISE_CONV_2D	   25.177	    3.520	    3.520	  0.047%	  0.380%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_3_depthwise/Relu6]
-	                 CONV_2D	   28.701	   10.218	   10.218	  0.136%	  0.516%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_3_pointwise/Relu6]
-	       DEPTHWISE_CONV_2D	   38.922	    0.827	    0.827	  0.011%	  0.527%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_4_depthwise/Relu6]
-	                 CONV_2D	   39.752	    1.401	    1.401	  0.019%	  0.545%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_4_pointwise/Relu6]
-	       DEPTHWISE_CONV_2D	   41.156	    1.290	    1.290	  0.017%	  0.563%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_5_depthwise/Relu6]
-	                 CONV_2D	   42.448	    5.995	    5.995	  0.080%	  0.642%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_5_pointwise/Relu6]
-	       DEPTHWISE_CONV_2D	   48.445	    0.409	    0.409	  0.005%	  0.647%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_6_depthwise/Relu6]
-	                 CONV_2D	   48.856	    6.167	    6.167	  0.082%	  0.729%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_6_pointwise/Relu6]
-	       DEPTHWISE_CONV_2D	   55.026	    0.629	    0.629	  0.008%	  0.738%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_7_depthwise/Relu6]
-	                 CONV_2D	   55.656	    6.464	    6.464	  0.086%	  0.823%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_7_pointwise/Relu6]
-	       DEPTHWISE_CONV_2D	   62.124	    0.647	    0.647	  0.009%	  0.832%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_8_depthwise/Relu6]
-	                 CONV_2D	   62.774	   14.666	   14.666	  0.195%	  1.026%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_8_pointwise/Relu6]
-	       DEPTHWISE_CONV_2D	   77.444	    0.635	    0.635	  0.008%	  1.035%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_9_depthwise/Relu6]
-	                 CONV_2D	   78.081	    7.186	    7.186	  0.095%	  1.130%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_9_pointwise/Relu6]
-	       DEPTHWISE_CONV_2D	   85.270	    0.646	    0.646	  0.009%	  1.139%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_10_depthwise/Relu6]
-	                 CONV_2D	   85.918	    9.529	    9.529	  0.126%	  1.265%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_10_pointwise/Relu6]
-	       DEPTHWISE_CONV_2D	   95.451	    0.628	    0.628	  0.008%	  1.273%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_11_depthwise/Relu6]
-	                 CONV_2D	   96.081	    2.077	    2.077	  0.028%	  1.301%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_11_pointwise/Relu6]
-	       DEPTHWISE_CONV_2D	   98.162	    0.168	    0.168	  0.002%	  1.303%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_12_depthwise/Relu6]
-	                 CONV_2D	   98.332	    1.007	    1.007	  0.013%	  1.317%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_12_pointwise/Relu6]
-	       DEPTHWISE_CONV_2D	   99.342	    0.288	    0.288	  0.004%	  1.320%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_13_depthwise/Relu6]
-	                 CONV_2D	   99.632	    8.197	    8.197	  0.109%	  1.429%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_13_pointwise/Relu6]
-	         AVERAGE_POOL_2D	  107.832	    0.045	    0.045	  0.001%	  1.430%	     0.000	        0	[MobilenetV1/Logits/AvgPool_1a/AvgPool]
-	                 CONV_2D	  107.878	    0.325	    0.325	  0.004%	  1.434%	     0.000	        0	[MobilenetV1/Logits/Conv2d_1c_1x1/BiasAdd]
-	                 RESHAPE	  108.206	    0.003	    0.003	  0.000%	  1.434%	     0.000	        0	[MobilenetV1/Predictions/Reshape]
-	                 SOFTMAX	  108.211	    0.038	    0.038	  0.001%	  1.434%	     0.000	        0	[MobilenetV1/Predictions/Softmax]
+	                 CONV_2D	    0.000	    4.269	    4.269	  0.107%	  0.107%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_0/Relu6]
+	       DEPTHWISE_CONV_2D	    4.270	    2.150	    2.150	  0.054%	  0.161%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_1_depthwise/Relu6]
+	                 CONV_2D	    6.421	    6.107	    6.107	  0.153%	  0.314%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_1_pointwise/Relu6]
+	       DEPTHWISE_CONV_2D	   12.528	    1.366	    1.366	  0.034%	  0.348%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_2_depthwise/Relu6]
+	                 CONV_2D	   13.895	    4.195	    4.195	  0.105%	  0.454%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_2_pointwise/Relu6]
+	       DEPTHWISE_CONV_2D	   18.091	    1.260	    1.260	  0.032%	  0.485%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_3_depthwise/Relu6]
+	                 CONV_2D	   19.352	    6.652	    6.652	  0.167%	  0.652%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_3_pointwise/Relu6]
+	       DEPTHWISE_CONV_2D	   26.005	    0.698	    0.698	  0.018%	  0.670%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_4_depthwise/Relu6]
+	                 CONV_2D	   26.703	    3.344	    3.344	  0.084%	  0.754%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_4_pointwise/Relu6]
+	       DEPTHWISE_CONV_2D	   30.047	    0.646	    0.646	  0.016%	  0.770%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_5_depthwise/Relu6]
+	                 CONV_2D	   30.694	    5.800	    5.800	  0.145%	  0.915%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_5_pointwise/Relu6]
+	       DEPTHWISE_CONV_2D	   36.495	    0.331	    0.331	  0.008%	  0.924%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_6_depthwise/Relu6]
+	                 CONV_2D	   36.826	    2.838	    2.838	  0.071%	  0.995%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_6_pointwise/Relu6]
+	       DEPTHWISE_CONV_2D	   39.665	    0.439	    0.439	  0.011%	  1.006%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_7_depthwise/Relu6]
+	                 CONV_2D	   40.105	    5.293	    5.293	  0.133%	  1.139%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_7_pointwise/Relu6]
+	       DEPTHWISE_CONV_2D	   45.399	    0.352	    0.352	  0.009%	  1.147%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_8_depthwise/Relu6]
+	                 CONV_2D	   45.752	    5.322	    5.322	  0.133%	  1.281%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_8_pointwise/Relu6]
+	       DEPTHWISE_CONV_2D	   51.075	    0.357	    0.357	  0.009%	  1.290%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_9_depthwise/Relu6]
+	                 CONV_2D	   51.432	    5.693	    5.693	  0.143%	  1.433%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_9_pointwise/Relu6]
+	       DEPTHWISE_CONV_2D	   57.126	    0.366	    0.366	  0.009%	  1.442%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_10_depthwise/Relu6]
+	                 CONV_2D	   57.493	    5.472	    5.472	  0.137%	  1.579%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_10_pointwise/Relu6]
+	       DEPTHWISE_CONV_2D	   62.966	    0.364	    0.364	  0.009%	  1.588%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_11_depthwise/Relu6]
+	                 CONV_2D	   63.330	    5.404	    5.404	  0.136%	  1.724%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_11_pointwise/Relu6]
+	       DEPTHWISE_CONV_2D	   68.735	    0.155	    0.155	  0.004%	  1.728%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_12_depthwise/Relu6]
+	                 CONV_2D	   68.891	    2.970	    2.970	  0.074%	  1.802%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_12_pointwise/Relu6]
+	       DEPTHWISE_CONV_2D	   71.862	    0.206	    0.206	  0.005%	  1.807%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_13_depthwise/Relu6]
+	                 CONV_2D	   72.069	    5.888	    5.888	  0.148%	  1.955%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_13_pointwise/Relu6]
+	         AVERAGE_POOL_2D	   77.958	    0.036	    0.036	  0.001%	  1.956%	     0.000	        0	[MobilenetV1/Logits/AvgPool_1a/AvgPool]
+	                 CONV_2D	   77.994	    1.445	    1.445	  0.036%	  1.992%	     0.000	        0	[MobilenetV1/Logits/Conv2d_1c_1x1/BiasAdd]
+	                 RESHAPE	   79.440	    0.002	    0.002	  0.000%	  1.992%	     0.000	        0	[MobilenetV1/Predictions/Reshape]
+	                 SOFTMAX	   79.443	    0.029	    0.029	  0.001%	  1.993%	     0.000	        0	[MobilenetV1/Predictions/Softmax]
 
 ============================== Top by Computation Time ==============================
 	             [node type]	  [start]	  [first]	 [avg ms]	     [%]	  [cdf%]	  [mem KB]	[times called]	[Name]
-	                 CONV_2D	   62.774	   14.666	   14.666	  0.195%	  0.195%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_8_pointwise/Relu6]
-	                 CONV_2D	   28.701	   10.218	   10.218	  0.136%	  0.330%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_3_pointwise/Relu6]
-	                 CONV_2D	   85.918	    9.529	    9.529	  0.126%	  0.456%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_10_pointwise/Relu6]
-	                 CONV_2D	    0.000	    9.132	    9.132	  0.121%	  0.578%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_0/Relu6]
-	                 CONV_2D	   99.632	    8.197	    8.197	  0.109%	  0.686%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_13_pointwise/Relu6]
-	                 CONV_2D	   78.081	    7.186	    7.186	  0.095%	  0.782%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_9_pointwise/Relu6]
-	                 CONV_2D	   12.419	    6.877	    6.877	  0.091%	  0.873%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_1_pointwise/Relu6]
-	                 CONV_2D	   55.656	    6.464	    6.464	  0.086%	  0.958%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_7_pointwise/Relu6]
-	                 CONV_2D	   48.856	    6.167	    6.167	  0.082%	  1.040%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_6_pointwise/Relu6]
-	                 CONV_2D	   42.448	    5.995	    5.995	  0.080%	  1.120%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_5_pointwise/Relu6]
-
-============================== Top by Memory Use ==============================
-	             [node type]	  [start]	  [first]	 [avg ms]	     [%]	  [cdf%]	  [mem KB]	[times called]	[Name]
-	                 SOFTMAX	  108.211	    0.038	    0.038	  0.001%	  0.001%	     0.000	        0	[MobilenetV1/Predictions/Softmax]
-	                 RESHAPE	  108.206	    0.003	    0.003	  0.000%	  0.001%	     0.000	        0	[MobilenetV1/Predictions/Reshape]
-	                 CONV_2D	   78.081	    7.186	    7.186	  0.095%	  0.096%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_9_pointwise/Relu6]
-	       DEPTHWISE_CONV_2D	   77.444	    0.635	    0.635	  0.008%	  0.104%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_9_depthwise/Relu6]
-	                 CONV_2D	   62.774	   14.666	   14.666	  0.195%	  0.299%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_8_pointwise/Relu6]
-	       DEPTHWISE_CONV_2D	   62.124	    0.647	    0.647	  0.009%	  0.307%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_8_depthwise/Relu6]
-	                 CONV_2D	   55.656	    6.464	    6.464	  0.086%	  0.393%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_7_pointwise/Relu6]
-	       DEPTHWISE_CONV_2D	   55.026	    0.629	    0.629	  0.008%	  0.401%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_7_depthwise/Relu6]
-	                 CONV_2D	   48.856	    6.167	    6.167	  0.082%	  0.483%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_6_pointwise/Relu6]
-	       DEPTHWISE_CONV_2D	   48.445	    0.409	    0.409	  0.005%	  0.489%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_6_depthwise/Relu6]
+	                 CONV_2D	   19.352	    6.652	    6.652	  0.167%	  0.167%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_3_pointwise/Relu6]
+	                 CONV_2D	    6.421	    6.107	    6.107	  0.153%	  0.320%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_1_pointwise/Relu6]
+	                 CONV_2D	   72.069	    5.888	    5.888	  0.148%	  0.468%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_13_pointwise/Relu6]
+	                 CONV_2D	   30.694	    5.800	    5.800	  0.145%	  0.613%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_5_pointwise/Relu6]
+	                 CONV_2D	   51.432	    5.693	    5.693	  0.143%	  0.756%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_9_pointwise/Relu6]
+	                 CONV_2D	   57.493	    5.472	    5.472	  0.137%	  0.893%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_10_pointwise/Relu6]
+	                 CONV_2D	   63.330	    5.404	    5.404	  0.136%	  1.029%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_11_pointwise/Relu6]
+	                 CONV_2D	   45.752	    5.322	    5.322	  0.133%	  1.162%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_8_pointwise/Relu6]
+	                 CONV_2D	   40.105	    5.293	    5.293	  0.133%	  1.295%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_7_pointwise/Relu6]
+	                 CONV_2D	    0.000	    4.269	    4.269	  0.107%	  1.402%	     0.000	        0	[MobilenetV1/MobilenetV1/Conv2d_0/Relu6]
 
 Number of nodes executed: 31
 ============================== Summary by node type ==============================
 	             [Node type]	  [count]	  [avg ms]	    [avg %]	    [cdf %]	  [mem KB]	[times called]
-	                 CONV_2D	       15	     1.861	    86.679%	    86.679%	     0.000	        0
-	       DEPTHWISE_CONV_2D	       13	     0.286	    13.321%	   100.000%	     0.000	        0
+	                 CONV_2D	       15	     1.406	    89.270%	    89.270%	     0.000	        0
+	       DEPTHWISE_CONV_2D	       13	     0.169	    10.730%	   100.000%	     0.000	        0
 	                 SOFTMAX	        1	     0.000	     0.000%	   100.000%	     0.000	        0
 	                 RESHAPE	        1	     0.000	     0.000%	   100.000%	     0.000	        0
 	         AVERAGE_POOL_2D	        1	     0.000	     0.000%	   100.000%	     0.000	        0
 
-Timings (microseconds): count=50 first=108164 curr=128308 min=102850 max=197072 avg=150805 std=24368
+Timings (microseconds): count=50 first=79449 curr=81350 min=77385 max=88213 avg=79732 std=1929
 Memory (bytes): count=0
 31 nodes observed
 
 
-Average inference timings in us: Warmup: 135310, Init: 12123, no stats: 150988
-
+Average inference timings in us: Warmup: 83235, Init: 38467, no stats: 79760.9
 ```
 
 
-- 
GitLab


From 5174b67f70645210429db837df3047c7d52637bf Mon Sep 17 00:00:00 2001
From: Kay Zhu <kayzhu@google.com>
Date: Thu, 7 Jun 2018 13:03:54 -0700
Subject: [PATCH 450/610] [TF:XLA] Introduce a new HostTensorToBorrowingLiteral
 path without the memcpy from Tensor to Literal, and use it in xla_helpers.

PiperOrigin-RevId: 199682452
---
 tensorflow/compiler/tf2xla/literal_util.cc   | 31 ++++++++++++++++++++
 tensorflow/compiler/tf2xla/literal_util.h    | 12 ++++++++
 tensorflow/compiler/tf2xla/xla_helpers.cc    | 11 ++++---
 tensorflow/compiler/xla/literal_util.cc      | 22 +++++++-------
 tensorflow/compiler/xla/literal_util.h       |  6 ++--
 tensorflow/compiler/xla/literal_util_test.cc |  4 +--
 6 files changed, 67 insertions(+), 19 deletions(-)

diff --git a/tensorflow/compiler/tf2xla/literal_util.cc b/tensorflow/compiler/tf2xla/literal_util.cc
index 43e1c1e9fe..db56b12837 100644
--- a/tensorflow/compiler/tf2xla/literal_util.cc
+++ b/tensorflow/compiler/tf2xla/literal_util.cc
@@ -40,6 +40,37 @@ Status HostTensorToLiteral(const Tensor& host_tensor, xla::Literal* literal) {
   return Status::OK();
 }
 
+Status HostTensorToBorrowingLiteral(const Tensor& host_tensor,
+                                    xla::BorrowingLiteral* literal) {
+  xla::Shape xla_shape;
+  TF_RETURN_IF_ERROR(TensorShapeToXLAShape(host_tensor.dtype(),
+                                           host_tensor.shape(), &xla_shape));
+  *literal = xla::BorrowingLiteral(
+      static_cast<const char*>(DMAHelper::base(&host_tensor)), xla_shape);
+  return Status::OK();
+}
+
+Status HostTensorsToBorrowingLiteralTuple(
+    tensorflow::gtl::ArraySlice<Tensor> host_tensors,
+    xla::BorrowingLiteral* literal) {
+  std::vector<const char*> buf_ptrs;
+  buf_ptrs.reserve(host_tensors.size());
+  std::vector<xla::Shape> tensor_shapes(host_tensors.size());
+
+  for (int i = 0; i < host_tensors.size(); i++) {
+    // Record each tensor's buffer and XLA shape; fail if conversion fails.
+    const Tensor* tensor = &host_tensors[i];
+    buf_ptrs.emplace_back(static_cast<const char*>(DMAHelper::base(tensor)));
+    TF_RETURN_IF_ERROR(TensorShapeToXLAShape(tensor->dtype(), tensor->shape(),
+                                             &tensor_shapes[i]));
+  }
+
+  *literal = xla::BorrowingLiteral(
+      buf_ptrs, xla::ShapeUtil::MakeTupleShape(tensor_shapes));
+
+  return Status::OK();
+}
+
 Status CopyLiteralToHostTensor(const xla::LiteralSlice& literal,
                                Tensor* host_tensor) {
   TF_RET_CHECK(xla::ShapeUtil::IsArray(literal.shape()) &&
diff --git a/tensorflow/compiler/tf2xla/literal_util.h b/tensorflow/compiler/tf2xla/literal_util.h
index 220bec1553..74685025c1 100644
--- a/tensorflow/compiler/tf2xla/literal_util.h
+++ b/tensorflow/compiler/tf2xla/literal_util.h
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
 
 namespace tensorflow {
 
@@ -29,6 +30,17 @@ namespace tensorflow {
 // unsupported type.
 Status HostTensorToLiteral(const Tensor& host_tensor, xla::Literal* literal);
 
+// Returns a BorrowingLiteral that utilizes the same underlying buffer owned by
+// 'host_tensor'.
+Status HostTensorToBorrowingLiteral(const Tensor& host_tensor,
+                                    xla::BorrowingLiteral* literal);
+
+// Returns a BorrowingLiteral tuple that utilizes the same underlying buffers
+// owned by 'host_tensors'.
+Status HostTensorsToBorrowingLiteralTuple(
+    tensorflow::gtl::ArraySlice<Tensor> host_tensors,
+    xla::BorrowingLiteral* literal);
+
 // Copies 'literal' to freshly allocated 'host_tensor', which is allocated of
 // type <target_type>.
 // Fails if the literal's primitive type !=
diff --git a/tensorflow/compiler/tf2xla/xla_helpers.cc b/tensorflow/compiler/tf2xla/xla_helpers.cc
index f1594193af..a1da176fe3 100644
--- a/tensorflow/compiler/tf2xla/xla_helpers.cc
+++ b/tensorflow/compiler/tf2xla/xla_helpers.cc
@@ -19,11 +19,13 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/lib/util.h"
 
 #include "tensorflow/compiler/tf2xla/literal_util.h"
+#include "tensorflow/compiler/tf2xla/shape_util.h"
 #include "tensorflow/compiler/tf2xla/type_util.h"
 #include "tensorflow/compiler/tf2xla/xla_context.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
 #include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/core/common_runtime/dma_helper.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/array_slice.h"
@@ -210,8 +212,9 @@ Status XlaHelpers::Iota(xla::XlaBuilder* builder, DataType dtype, int64 size,
       return errors::InvalidArgument("Invalid argument type ",
                                      DataTypeString(dtype));
   }
-  xla::Literal linspace_literal;
-  TF_RETURN_IF_ERROR(HostTensorToLiteral(linspace, &linspace_literal));
+  xla::BorrowingLiteral linspace_literal;
+  TF_RETURN_IF_ERROR(HostTensorToBorrowingLiteral(linspace, &linspace_literal));
+
   *iota = builder->ConstantLiteral(linspace_literal);
   return Status::OK();
 }
@@ -245,8 +248,8 @@ Status XlaHelpers::OneHot(xla::XlaBuilder* builder, int64 depth, int axis,
       return errors::InvalidArgument("Invalid argument type ",
                                      DataTypeString(index_type));
   }
-  xla::Literal linspace_literal;
-  TF_RETURN_IF_ERROR(HostTensorToLiteral(linspace, &linspace_literal));
+  xla::BorrowingLiteral linspace_literal;
+  TF_RETURN_IF_ERROR(HostTensorToBorrowingLiteral(linspace, &linspace_literal));
 
   // Broadcast the linspace constant across the indices along the new axis,
   // and test equality at each position.
diff --git a/tensorflow/compiler/xla/literal_util.cc b/tensorflow/compiler/xla/literal_util.cc
index 61afc311a7..6b29589700 100644
--- a/tensorflow/compiler/xla/literal_util.cc
+++ b/tensorflow/compiler/xla/literal_util.cc
@@ -2341,28 +2341,28 @@ LiteralSlice::LiteralSlice(const LiteralBase& literal,
     : LiteralBase(), root_piece_(&literal.piece(view_root)) {}
 
 BorrowingLiteral::BorrowingLiteral(const char* src_buf_ptr, const Shape& shape)
-    : LiteralBase(), shape_(shape) {
-  CHECK(ShapeUtil::IsArray(shape_));
+    : LiteralBase(), shape_(MakeUnique<Shape>(shape)) {
+  CHECK(ShapeUtil::IsArray(*shape_));
   CHECK_NE(src_buf_ptr, nullptr);
-  CHECK(LayoutUtil::HasLayout(shape_));
+  CHECK(LayoutUtil::HasLayout(*shape_));
 
   root_piece_ = Piece();
   root_piece_.set_buffer(const_cast<char*>(src_buf_ptr));
-  root_piece_.set_subshape(&shape_);
+  root_piece_.set_subshape(shape_.get());
 }
 
 BorrowingLiteral::BorrowingLiteral(
     tensorflow::gtl::ArraySlice<const char*> src_buf_ptrs, const Shape& shape)
-    : LiteralBase(), shape_(shape) {
-  CHECK(ShapeUtil::IsTuple(shape_));
-  CHECK(!ShapeUtil::IsNestedTuple(shape_));
-  CHECK_EQ(src_buf_ptrs.size(), ShapeUtil::TupleElementCount(shape_));
+    : LiteralBase(), shape_(MakeUnique<Shape>(shape)) {
+  CHECK(ShapeUtil::IsTuple(*shape_));
+  CHECK(!ShapeUtil::IsNestedTuple(*shape_));
+  CHECK_EQ(src_buf_ptrs.size(), ShapeUtil::TupleElementCount(*shape_));
   root_piece_ = Piece();
-  root_piece_.set_subshape(&shape_);
-  BuildPieceSubtree(shape_, &root_piece_);
+  root_piece_.set_subshape(shape_.get());
+  BuildPieceSubtree(*shape_, &root_piece_);
 
   for (int i = 0; i < src_buf_ptrs.size(); ++i) {
-    const auto& src_shape = shape_.tuple_shapes(i);
+    const auto& src_shape = shape_->tuple_shapes(i);
     CHECK(ShapeUtil::IsArray(src_shape));
     root_piece_.child(i).set_buffer(const_cast<char*>(src_buf_ptrs[i]));
   }
diff --git a/tensorflow/compiler/xla/literal_util.h b/tensorflow/compiler/xla/literal_util.h
index 1e26eb7ad4..8e4159e360 100644
--- a/tensorflow/compiler/xla/literal_util.h
+++ b/tensorflow/compiler/xla/literal_util.h
@@ -1099,8 +1099,10 @@ class BorrowingLiteral : public LiteralBase {
   const Piece& root_piece() const override { return root_piece_; };
   Piece root_piece_;
 
-  // Shape of this literal.
-  const Shape shape_;
+  // Shape of this literal. Stored as unique_ptr so that the (default) move
+  // construction of this class is trivially correct: the pointer to Shape that
+  // root_piece_ stores will still point to the correct address.
+  std::unique_ptr<Shape> shape_;
 };
 
 template <typename NativeT>
diff --git a/tensorflow/compiler/xla/literal_util_test.cc b/tensorflow/compiler/xla/literal_util_test.cc
index f127cee0fd..53b926163c 100644
--- a/tensorflow/compiler/xla/literal_util_test.cc
+++ b/tensorflow/compiler/xla/literal_util_test.cc
@@ -1431,7 +1431,7 @@ TEST_F(LiteralUtilTest, LiteralSliceOfALiteralSlice) {
   EXPECT_EQ(matrix_view, *Literal::CreateR2<float>({{1.0, 2.0}, {3.0, 4.0}}));
 }
 
-TEST_F(LiteralUtilTest, BorrowingLiteralFromOneBufferPtrTest) {
+TEST_F(LiteralUtilTest, BorrowingLiteralFromOneBufferPtr) {
   std::vector<int64> int64_values = {1, 2, 3};
   const Shape literal_shape = ShapeUtil::MakeShape(S64, {3});
 
@@ -1443,7 +1443,7 @@ TEST_F(LiteralUtilTest, BorrowingLiteralFromOneBufferPtrTest) {
   EXPECT_EQ(literal.Get<int64>({2}), 3);
 }
 
-TEST_F(LiteralUtilTest, BorrowingLiteralFromMultipleBufferPtrsTest) {
+TEST_F(LiteralUtilTest, BorrowingLiteralFromMultipleBufferPtrs) {
   std::vector<int64> one_two_three = {1, 2, 3};
   const Shape one_two_three_shape = ShapeUtil::MakeShape(S64, {3});
 
-- 
GitLab


From d736c6622aec39d874fe77d8b2d03a57bbdcbb78 Mon Sep 17 00:00:00 2001
From: Nupur Garg <nupurgarg@google.com>
Date: Thu, 7 Jun 2018 13:46:56 -0700
Subject: [PATCH 451/610] Make TOCO cmdline inputs case insensitive.

PiperOrigin-RevId: 199689105
---
 tensorflow/contrib/lite/python/tflite_convert.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tensorflow/contrib/lite/python/tflite_convert.py b/tensorflow/contrib/lite/python/tflite_convert.py
index 492d2632fe..32ad84ec3c 100644
--- a/tensorflow/contrib/lite/python/tflite_convert.py
+++ b/tensorflow/contrib/lite/python/tflite_convert.py
@@ -227,17 +227,17 @@ def run_main(_):
   # Model format flags.
   parser.add_argument(
       "--output_format",
-      type=str,
+      type=str.upper,
       choices=["TFLITE", "GRAPHVIZ_DOT"],
       help="Output file format.")
   parser.add_argument(
       "--inference_type",
-      type=str,
+      type=str.upper,
       choices=["FLOAT", "QUANTIZED_UINT8"],
       help="Target data type of arrays in the output file.")
   parser.add_argument(
       "--inference_input_type",
-      type=str,
+      type=str.upper,
       choices=["FLOAT", "QUANTIZED_UINT8"],
       help=("Target data type of input arrays. Allows for a different type for "
             "input arrays in the case of quantization."))
-- 
GitLab


From e33056b35709d9f26f4a13762bc8eddd3bd3eef8 Mon Sep 17 00:00:00 2001
From: Amit Patankar <amitpatankar@google.com>
Date: Thu, 7 Jun 2018 14:15:13 -0700
Subject: [PATCH 452/610] Add a setuptools constraint.

---
 tensorflow/tools/pip_package/setup.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py
index 78d955c637..97f625e7e9 100644
--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@@ -54,6 +54,7 @@ REQUIRED_PACKAGES = [
     'numpy >= 1.13.3',
     'six >= 1.10.0',
     'protobuf >= 3.4.0',
+    'setuptools <= 39.1.0',
     'tensorboard >= 1.8.0, < 1.9.0',
     'termcolor >= 1.1.0',
 ]
-- 
GitLab


From a0dc8144f09da4d0597c423c2d786e206fb462ac Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 7 Jun 2018 14:42:24 -0700
Subject: [PATCH 453/610] Internal change.

PiperOrigin-RevId: 199698515
---
 tensorflow/contrib/lite/kernels/internal/kernel_utils.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/lite/kernels/internal/kernel_utils.cc b/tensorflow/contrib/lite/kernels/internal/kernel_utils.cc
index 6e62183975..09044193c1 100644
--- a/tensorflow/contrib/lite/kernels/internal/kernel_utils.cc
+++ b/tensorflow/contrib/lite/kernels/internal/kernel_utils.cc
@@ -350,7 +350,7 @@ void LstmStep(
 
     for (int b = 0; b < n_batch; ++b) {
       product_scaling_factors[b] =
-          scaling_factors[b] * input_to_cell_weights_scale;
+          scaling_factors[b] * input_to_output_weights_scale;
     }
     tensor_utils::MatrixBatchVectorMultiplyAccumulate(
         input_to_output_weights_ptr, n_cell, n_input, quantized_input_ptr_batch,
-- 
GitLab


From ae6e7c90611903591270f5221c51dca556a4759b Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Thu, 7 Jun 2018 15:02:49 -0700
Subject: [PATCH 454/610] Avoid unintentional copy of a const function when
 capturing it.

PiperOrigin-RevId: 199702086
---
 tensorflow/core/kernels/functional_ops.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/core/kernels/functional_ops.cc b/tensorflow/core/kernels/functional_ops.cc
index e0be57f972..519c475332 100644
--- a/tensorflow/core/kernels/functional_ops.cc
+++ b/tensorflow/core/kernels/functional_ops.cc
@@ -184,7 +184,7 @@ class IfOp : public AsyncOpKernel {
     IfOp* const kernel_;
     OpKernelContext* const ctx_;
     const bool cond_;
-    const DoneCallback done_;
+    DoneCallback done_;
     FunctionLibraryRuntime* const lib_;
     FunctionLibraryRuntime::Options opts_;
     TensorVec args_;
-- 
GitLab


From ed15a7b00f9dd0094cd784a823a65db7aef9d79c Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 7 Jun 2018 15:21:17 -0700
Subject: [PATCH 455/610] Fix and enable TFlite label_image_test

Resolve memory leaks from read_bmp() calls.

PiperOrigin-RevId: 199705513
---
 .../contrib/lite/examples/label_image/BUILD   | 31 +++++++++----------
 .../examples/label_image/bitmap_helpers.cc    | 28 ++++++++---------
 .../examples/label_image/bitmap_helpers.h     |  4 +--
 .../lite/examples/label_image/label_image.cc  | 12 +++----
 .../examples/label_image/label_image_test.cc  | 16 +++++-----
 5 files changed, 45 insertions(+), 46 deletions(-)

diff --git a/tensorflow/contrib/lite/examples/label_image/BUILD b/tensorflow/contrib/lite/examples/label_image/BUILD
index 9322e186a2..c61445114e 100644
--- a/tensorflow/contrib/lite/examples/label_image/BUILD
+++ b/tensorflow/contrib/lite/examples/label_image/BUILD
@@ -53,19 +53,18 @@ cc_library(
     ],
 )
 
-# TODO(ahentz): Test disabled as it has a memory leek from read_bmp
-# cc_test(
-#     name = "label_image_test",
-#     srcs = [
-#         "get_top_n.h",
-#         "get_top_n_impl.h",
-#         "label_image_test.cc",
-#     ],
-#     data = [
-#         "testdata/grace_hopper.bmp",
-#     ],
-#     deps = [
-#         ":bitmap_helpers",
-#         "//testing/base/public:gunit",
-#     ],
-# )
+cc_test(
+    name = "label_image_test",
+    srcs = [
+        "get_top_n.h",
+        "get_top_n_impl.h",
+        "label_image_test.cc",
+    ],
+    data = [
+        "testdata/grace_hopper.bmp",
+    ],
+    deps = [
+        ":bitmap_helpers",
+        "@com_google_googletest//:gtest",
+    ],
+)
diff --git a/tensorflow/contrib/lite/examples/label_image/bitmap_helpers.cc b/tensorflow/contrib/lite/examples/label_image/bitmap_helpers.cc
index 0b38cd38c8..2735d1f5ea 100644
--- a/tensorflow/contrib/lite/examples/label_image/bitmap_helpers.cc
+++ b/tensorflow/contrib/lite/examples/label_image/bitmap_helpers.cc
@@ -28,8 +28,9 @@ limitations under the License.
 namespace tflite {
 namespace label_image {
 
-uint8_t* decode_bmp(const uint8_t* input, int row_size, uint8_t* const output,
-                    int width, int height, int channels, bool top_down) {
+std::vector<uint8_t> decode_bmp(const uint8_t* input, int row_size, int width,
+                                int height, int channels, bool top_down) {
+  std::vector<uint8_t> output(height * width * channels);
   for (int i = 0; i < height; i++) {
     int src_pos;
     int dst_pos;
@@ -66,12 +67,11 @@ uint8_t* decode_bmp(const uint8_t* input, int row_size, uint8_t* const output,
       }
     }
   }
-
   return output;
 }
 
-uint8_t* read_bmp(const std::string& input_bmp_name, int* width, int* height,
-                  int* channels, Settings* s) {
+std::vector<uint8_t> read_bmp(const std::string& input_bmp_name, int* width,
+                              int* height, int* channels, Settings* s) {
   int begin, end;
 
   std::ifstream file(input_bmp_name, std::ios::in | std::ios::binary);
@@ -87,14 +87,15 @@ uint8_t* read_bmp(const std::string& input_bmp_name, int* width, int* height,
 
   if (s->verbose) LOG(INFO) << "len: " << len << "\n";
 
-  const uint8_t* img_bytes = new uint8_t[len];
+  std::vector<uint8_t> img_bytes(len);
   file.seekg(0, std::ios::beg);
-  file.read((char*)img_bytes, len);
+  file.read(reinterpret_cast<char*>(img_bytes.data()), len);
   const int32_t header_size =
-      *(reinterpret_cast<const int32_t*>(img_bytes + 10));
-  *width = *(reinterpret_cast<const int32_t*>(img_bytes + 18));
-  *height = *(reinterpret_cast<const int32_t*>(img_bytes + 22));
-  const int32_t bpp = *(reinterpret_cast<const int32_t*>(img_bytes + 28));
+      *(reinterpret_cast<const int32_t*>(img_bytes.data() + 10));
+  *width = *(reinterpret_cast<const int32_t*>(img_bytes.data() + 18));
+  *height = *(reinterpret_cast<const int32_t*>(img_bytes.data() + 22));
+  const int32_t bpp =
+      *(reinterpret_cast<const int32_t*>(img_bytes.data() + 28));
   *channels = bpp / 8;
 
   if (s->verbose)
@@ -110,10 +111,9 @@ uint8_t* read_bmp(const std::string& input_bmp_name, int* width, int* height,
   bool top_down = (*height < 0);
 
   // Decode image, allocating tensor once the image size is known
-  uint8_t* output = new uint8_t[abs(*height) * *width * *channels];
   const uint8_t* bmp_pixels = &img_bytes[header_size];
-  return decode_bmp(bmp_pixels, row_size, output, *width, abs(*height),
-                    *channels, top_down);
+  return decode_bmp(bmp_pixels, row_size, *width, abs(*height), *channels,
+                    top_down);
 }
 
 }  // namespace label_image
diff --git a/tensorflow/contrib/lite/examples/label_image/bitmap_helpers.h b/tensorflow/contrib/lite/examples/label_image/bitmap_helpers.h
index 97343dde6b..5fc75b1f72 100644
--- a/tensorflow/contrib/lite/examples/label_image/bitmap_helpers.h
+++ b/tensorflow/contrib/lite/examples/label_image/bitmap_helpers.h
@@ -22,8 +22,8 @@ limitations under the License.
 namespace tflite {
 namespace label_image {
 
-uint8_t* read_bmp(const std::string& input_bmp_name, int* width, int* height,
-                  int* channels, Settings* s);
+std::vector<uint8_t> read_bmp(const std::string& input_bmp_name, int* width,
+                              int* height, int* channels, Settings* s);
 
 template <class T>
 void resize(T* out, uint8_t* in, int image_height, int image_width,
diff --git a/tensorflow/contrib/lite/examples/label_image/label_image.cc b/tensorflow/contrib/lite/examples/label_image/label_image.cc
index 966fcd2a31..86d7d1cc4a 100644
--- a/tensorflow/contrib/lite/examples/label_image/label_image.cc
+++ b/tensorflow/contrib/lite/examples/label_image/label_image.cc
@@ -138,8 +138,8 @@ void RunInference(Settings* s) {
   int image_width = 224;
   int image_height = 224;
   int image_channels = 3;
-  uint8_t* in = read_bmp(s->input_bmp_name, &image_width, &image_height,
-                         &image_channels, s);
+  std::vector<uint8_t> in = read_bmp(s->input_bmp_name, &image_width,
+                                     &image_height, &image_channels, s);
 
   int input = interpreter->inputs()[0];
   if (s->verbose) LOG(INFO) << "input: " << input << "\n";
@@ -168,12 +168,12 @@ void RunInference(Settings* s) {
   switch (interpreter->tensor(input)->type) {
     case kTfLiteFloat32:
       s->input_floating = true;
-      resize<float>(interpreter->typed_tensor<float>(input), in, image_height,
-                    image_width, image_channels, wanted_height, wanted_width,
-                    wanted_channels, s);
+      resize<float>(interpreter->typed_tensor<float>(input), in.data(),
+                    image_height, image_width, image_channels, wanted_height,
+                    wanted_width, wanted_channels, s);
       break;
     case kTfLiteUInt8:
-      resize<uint8_t>(interpreter->typed_tensor<uint8_t>(input), in,
+      resize<uint8_t>(interpreter->typed_tensor<uint8_t>(input), in.data(),
                       image_height, image_width, image_channels, wanted_height,
                       wanted_width, wanted_channels, s);
       break;
diff --git a/tensorflow/contrib/lite/examples/label_image/label_image_test.cc b/tensorflow/contrib/lite/examples/label_image/label_image_test.cc
index ce35483f76..de7de21f77 100644
--- a/tensorflow/contrib/lite/examples/label_image/label_image_test.cc
+++ b/tensorflow/contrib/lite/examples/label_image/label_image_test.cc
@@ -27,20 +27,20 @@ namespace label_image {
 
 TEST(LabelImageTest, GraceHopper) {
   std::string lena_file =
-      "tensorflow/contrib/lite/examples/label_image/testdata/grace_hopper.bmp";
+      "tensorflow/contrib/lite/examples/label_image/testdata/"
+      "grace_hopper.bmp";
   int height, width, channels;
   Settings s;
-  uint8_t *data;
-
-  data = read_bmp(lena_file, &width, &height, &channels, &s);
+  std::vector<uint8_t> input =
+      read_bmp(lena_file, &width, &height, &channels, &s);
   ASSERT_EQ(height, 606);
   ASSERT_EQ(width, 517);
   ASSERT_EQ(channels, 3);
 
-  uint8_t *out = new uint8_t[606 * 517 * 3];
-  downsize<uint8_t>(out, data, 606, 517, 3, 214, 214, 3, &s);
-  ASSERT_EQ(out[0], 0x15);
-  ASSERT_EQ(out[214 * 214 * 3 - 1], 0x12);
+  std::vector<uint8_t> output(606 * 517 * 3);
+  resize<uint8_t>(output.data(), input.data(), 606, 517, 3, 214, 214, 3, &s);
+  ASSERT_EQ(output[0], 0x15);
+  ASSERT_EQ(output[214 * 214 * 3 - 1], 0x11);
 }
 
 TEST(LabelImageTest, GetTopN) {
-- 
GitLab


From 9f640dc874dba2e10b634cb7e87837f040fa83dc Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 7 Jun 2018 15:21:40 -0700
Subject: [PATCH 456/610] [TF:XLA] Fix invalid HLO graph in
 hlo_rematerialization_test.

The shape of the while-init did not match the body computation parameter's shape.

Also, invoke the HLO verifier in the test to verify shapes.

PiperOrigin-RevId: 199705580
---
 tensorflow/compiler/xla/service/BUILD         |   1 +
 .../xla/service/hlo_rematerialization_test.cc | 122 +++++++++---------
 2 files changed, 62 insertions(+), 61 deletions(-)

diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index 89de302f4d..29718e057b 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -2139,6 +2139,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:test",
     ],
 )
 
diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc b/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc
index 83de54f3fa..e81334d5a8 100644
--- a/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_rematerialization_test.cc
@@ -27,6 +27,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
 
 namespace xla {
 namespace {
@@ -40,7 +41,8 @@ class HloRematerializationTest : public HloTestBase {
   // Creates and returns a computation which can benefit from
   // rematerialization. The computation looks like:
   //
-  //   F32[] %param = {...}
+  //   F32[1] %param = {...}
+  //   F32[] %reshape = reshape(%param)
   //   F32[1024] %bcast = broadcast(%param)
   //   F32[1024] %negate = negate(%bcast)
   //   F32[2048] %concat_1 = concat({%negate, %negate})
@@ -57,9 +59,11 @@ class HloRematerializationTest : public HloTestBase {
       const string& suffix = "") {
     auto builder = HloComputation::Builder(TestName() + suffix);
     auto param = builder.AddInstruction(
-        HloInstruction::CreateParameter(0, scalar_shape_, "param"));
+        HloInstruction::CreateParameter(0, vec1_shape_, "param"));
+    auto reshape = builder.AddInstruction(
+        HloInstruction::CreateReshape(scalar_shape_, param));
     auto bcast = builder.AddInstruction(
-        HloInstruction::CreateBroadcast(vec1024_shape_, param, {}));
+        HloInstruction::CreateBroadcast(vec1024_shape_, reshape, {}));
     auto negate = builder.AddInstruction(
         HloInstruction::CreateUnary(vec1024_shape_, HloOpcode::kNegate, bcast));
     auto concat_1 = builder.AddInstruction(HloInstruction::CreateConcatenate(
@@ -100,9 +104,11 @@ class HloRematerializationTest : public HloTestBase {
       const string& suffix = "") {
     auto builder = HloComputation::Builder(TestName() + suffix);
     auto param = builder.AddInstruction(
-        HloInstruction::CreateParameter(0, scalar_shape_, "param"));
+        HloInstruction::CreateParameter(0, vec1_shape_, "param"));
+    auto reshape = builder.AddInstruction(
+        HloInstruction::CreateReshape(scalar_shape_, param));
     auto bcast = builder.AddInstruction(
-        HloInstruction::CreateBroadcast(vec1024_shape_, param, {}));
+        HloInstruction::CreateBroadcast(vec1024_shape_, reshape, {}));
     auto slice_1 = builder.AddInstruction(
         HloInstruction::CreateSlice(vec1_shape_, bcast, /*start_indices=*/{0},
                                     /*limit_indices=*/{1},
@@ -135,6 +141,15 @@ class HloRematerializationTest : public HloTestBase {
     return ShapeUtil::ByteSizeOf(shape, sizeof(void*));
   }
 
+  StatusOr<bool> RunHloRematerialization(
+      int64 memory_limit_bytes, HloModule* module,
+      SequentialHloOrdering::HloModuleSequence* sequence) {
+    TF_EXPECT_OK(verifier().Run(module).status());
+    return HloRematerialization::RematerializeAndSchedule(
+        ByteSizeOf, memory_limit_bytes, module, DefaultMemoryScheduler,
+        sequence);
+  }
+
   // Various shapes used in the canned computations.
   const Shape scalar_shape_ = ShapeUtil::MakeShape(xla::F32, {});
   const Shape vec1_shape_ = ShapeUtil::MakeShape(xla::F32, {1});
@@ -158,11 +173,9 @@ TEST_F(HloRematerializationTest, SingleComputation) {
   SequentialHloOrdering::HloModuleSequence sequence;
   // Computation requires 16KB without rematerialization, but uses only 12KB
   // with rematerialization so pick a memory limit between these values (14KB).
-  TF_ASSERT_OK_AND_ASSIGN(bool changed,
-                          HloRematerialization::RematerializeAndSchedule(
-                              ByteSizeOf,
-                              /*memory_limit_bytes=*/14 * 1024, module.get(),
-                              DefaultMemoryScheduler, &sequence));
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, RunHloRematerialization(
+                                            /*memory_limit_bytes=*/14 * 1024,
+                                            module.get(), &sequence));
   EXPECT_TRUE(changed);
 
   // Root should not have changed.
@@ -188,18 +201,16 @@ TEST_F(HloRematerializationTest, SingleComputationNoRematerialization) {
   HloComputation* computation =
       module->AddEntryComputation(MakeRematerializableComputation());
 
-  EXPECT_EQ(computation->instruction_count(), 7);
+  EXPECT_EQ(computation->instruction_count(), 8);
 
   SequentialHloOrdering::HloModuleSequence sequence;
-  TF_ASSERT_OK_AND_ASSIGN(bool changed,
-                          HloRematerialization::RematerializeAndSchedule(
-                              ByteSizeOf,
-                              /*memory_limit_bytes=*/20 * 1024, module.get(),
-                              DefaultMemoryScheduler, &sequence));
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, RunHloRematerialization(
+                                            /*memory_limit_bytes=*/20 * 1024,
+                                            module.get(), &sequence));
 
   // No instructions should have been materialized.
   EXPECT_FALSE(changed);
-  EXPECT_EQ(computation->instruction_count(), 7);
+  EXPECT_EQ(computation->instruction_count(), 8);
 }
 
 // Test rematerialization of a computation which calls another computation via a
@@ -225,23 +236,21 @@ TEST_F(HloRematerializationTest, RematerializeAroundWhile) {
       module->AddEntryComputation(MakeRematerializableWhileComputation(
           while_cond, /*while_body=*/body_computation));
 
-  EXPECT_EQ(entry_computation->instruction_count(), 6);
-  EXPECT_EQ(body_computation->instruction_count(), 7);
+  EXPECT_EQ(entry_computation->instruction_count(), 7);
+  EXPECT_EQ(body_computation->instruction_count(), 8);
 
   // The body computation uses 16KB and the entry computation uses 2KB at the
   // while so the peak memory use of the module is 18KB. Set the memory limit a
   // bit lower (17KB) to force rematerialization of the entry computation.
   SequentialHloOrdering::HloModuleSequence sequence;
-  TF_ASSERT_OK_AND_ASSIGN(bool changed,
-                          HloRematerialization::RematerializeAndSchedule(
-                              ByteSizeOf,
-                              /*memory_limit_bytes=*/17 * 1024, module.get(),
-                              DefaultMemoryScheduler, &sequence));
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, RunHloRematerialization(
+                                            /*memory_limit_bytes=*/17 * 1024,
+                                            module.get(), &sequence));
   EXPECT_TRUE(changed);
 
   // Only the entry computation should have a rematerialized instruction added.
-  EXPECT_EQ(entry_computation->instruction_count(), 7);
-  EXPECT_EQ(body_computation->instruction_count(), 7);
+  EXPECT_EQ(entry_computation->instruction_count(), 8);
+  EXPECT_EQ(body_computation->instruction_count(), 8);
 }
 
 // Test rematerialization of a computation which calls another computation via a
@@ -264,20 +273,18 @@ TEST_F(HloRematerializationTest, RematerializeEntryAndWhileBody) {
       module->AddEntryComputation(MakeRematerializableWhileComputation(
           while_cond, /*while_body=*/body_computation));
 
-  EXPECT_EQ(entry_computation->instruction_count(), 6);
-  EXPECT_EQ(body_computation->instruction_count(), 7);
+  EXPECT_EQ(entry_computation->instruction_count(), 7);
+  EXPECT_EQ(body_computation->instruction_count(), 8);
 
   SequentialHloOrdering::HloModuleSequence sequence;
-  TF_ASSERT_OK_AND_ASSIGN(bool changed,
-                          HloRematerialization::RematerializeAndSchedule(
-                              ByteSizeOf,
-                              /*memory_limit_bytes=*/15 * 1024, module.get(),
-                              DefaultMemoryScheduler, &sequence));
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, RunHloRematerialization(
+                                            /*memory_limit_bytes=*/15 * 1024,
+                                            module.get(), &sequence));
   EXPECT_TRUE(changed);
 
-  // Both computations should have a rematerialized instruction added.
-  EXPECT_EQ(entry_computation->instruction_count(), 7);
-  EXPECT_EQ(body_computation->instruction_count(), 8);
+  // Both computations should have rematerialized instructions added.
+  EXPECT_EQ(entry_computation->instruction_count(), 9);
+  EXPECT_EQ(body_computation->instruction_count(), 9);
 }
 
 // Test rematerialization of a doubly nested computation. All computations
@@ -303,24 +310,22 @@ TEST_F(HloRematerializationTest, RematerializeNestedComputations) {
       module->AddEntryComputation(MakeRematerializableWhileComputation(
           while_cond, /*while_body=*/middle_computation));
 
-  EXPECT_EQ(entry_computation->instruction_count(), 6);
-  EXPECT_EQ(middle_computation->instruction_count(), 6);
-  EXPECT_EQ(inner_computation->instruction_count(), 7);
+  EXPECT_EQ(entry_computation->instruction_count(), 7);
+  EXPECT_EQ(middle_computation->instruction_count(), 7);
+  EXPECT_EQ(inner_computation->instruction_count(), 8);
 
   // If all computations are maximally rematerialized then peak memory usage is
   // ~12K so pick something slightly larger.
   SequentialHloOrdering::HloModuleSequence sequence;
-  TF_ASSERT_OK_AND_ASSIGN(bool changed,
-                          HloRematerialization::RematerializeAndSchedule(
-                              ByteSizeOf,
-                              /*memory_limit_bytes=*/13 * 1024, module.get(),
-                              DefaultMemoryScheduler, &sequence));
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, RunHloRematerialization(
+                                            /*memory_limit_bytes=*/13 * 1024,
+                                            module.get(), &sequence));
   EXPECT_TRUE(changed);
 
-  // All computations should have a rematerialized instruction added.
-  EXPECT_EQ(entry_computation->instruction_count(), 7);
-  EXPECT_EQ(middle_computation->instruction_count(), 7);
-  EXPECT_EQ(inner_computation->instruction_count(), 8);
+  // All computations should have rematerialized instructions added.
+  EXPECT_EQ(entry_computation->instruction_count(), 9);
+  EXPECT_EQ(middle_computation->instruction_count(), 9);
+  EXPECT_EQ(inner_computation->instruction_count(), 9);
 }
 
 TEST_F(HloRematerializationTest, RngNotRematerialized) {
@@ -382,10 +387,9 @@ TEST_F(HloRematerializationTest, RngNotRematerialized) {
   // parameter and output) and 20KB (peak memory possible with
   // rematerialization).
   TF_ASSERT_OK_AND_ASSIGN(
-      bool changed, HloRematerialization::RematerializeAndSchedule(
-                        ByteSizeOf,
+      bool changed, RunHloRematerialization(
                         /*memory_limit_bytes=*/4 * ByteSizeOf(vec1024_shape_),
-                        module.get(), DefaultMemoryScheduler, &sequence));
+                        module.get(), &sequence));
   EXPECT_TRUE(changed);
   // The rng should not have been rematerialized.
   EXPECT_EQ(count_rngs(entry_computation), 1);
@@ -476,11 +480,9 @@ TEST_F(HloRematerializationTest, InstructionRematerializedMultipleTimes) {
   // Pick a memory limit some where between 24KB (initial peak memory including
   // parameter and output) and 20KB (peak memory possible with
   // rematerialization).
-  TF_ASSERT_OK_AND_ASSIGN(bool changed,
-                          HloRematerialization::RematerializeAndSchedule(
-                              ByteSizeOf,
-                              /*memory_limit_bytes=*/22 * 1024, module.get(),
-                              DefaultMemoryScheduler, &sequence));
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, RunHloRematerialization(
+                                            /*memory_limit_bytes=*/22 * 1024,
+                                            module.get(), &sequence));
   EXPECT_TRUE(changed);
 
   // The broadcast should have been rematerialized 3 times.
@@ -573,11 +575,9 @@ TEST_P(IndirectUseTest, IndirectUseNotRematerialized) {
   // Pick a memory limit some where between 24KB (initial peak memory including
   // parameter and output) and 20KB (peak memory possible with
   // rematerialization).
-  TF_ASSERT_OK_AND_ASSIGN(bool changed,
-                          HloRematerialization::RematerializeAndSchedule(
-                              ByteSizeOf,
-                              /*memory_limit_bytes=*/22 * 1024, module.get(),
-                              DefaultMemoryScheduler, &sequence));
+  TF_ASSERT_OK_AND_ASSIGN(bool changed, RunHloRematerialization(
+                                            /*memory_limit_bytes=*/22 * 1024,
+                                            module.get(), &sequence));
   // Rematerialization should only occur if the rematerializable instruction has
   // no indirect uses.
   if (indirectly_used) {
-- 
GitLab


From e73c66f8152690b9f2466bfcca887283ed380980 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 7 Jun 2018 15:28:16 -0700
Subject: [PATCH 457/610] Add ScaleTriL Bijector to enable transformed
 distributions over PSD matrices.

PiperOrigin-RevId: 199706732
---
 tensorflow/contrib/distributions/BUILD        |  19 +++
 .../kernel_tests/bijectors/scale_tril_test.py |  69 +++++++++++
 .../python/ops/bijectors/__init__.py          |   2 +
 .../python/ops/bijectors/scale_tril.py        | 114 ++++++++++++++++++
 4 files changed, 204 insertions(+)
 create mode 100644 tensorflow/contrib/distributions/python/kernel_tests/bijectors/scale_tril_test.py
 create mode 100644 tensorflow/contrib/distributions/python/ops/bijectors/scale_tril.py

diff --git a/tensorflow/contrib/distributions/BUILD b/tensorflow/contrib/distributions/BUILD
index 61d4e90ea2..51f7028566 100644
--- a/tensorflow/contrib/distributions/BUILD
+++ b/tensorflow/contrib/distributions/BUILD
@@ -1137,6 +1137,25 @@ cuda_py_test(
     ],
 )
 
+cuda_py_test(
+    name = "scale_tril_test",
+    size = "small",
+    srcs = ["python/kernel_tests/bijectors/scale_tril_test.py"],
+    additional_deps = [
+        ":bijectors_py",
+        ":distributions_py",
+        "//third_party/py/numpy",
+        "@six_archive//:six",
+        "//tensorflow/contrib/linalg:linalg_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
 cuda_py_test(
     name = "sigmoid_test",
     size = "small",
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/scale_tril_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/scale_tril_test.py
new file mode 100644
index 0000000000..566a7b3dff
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/scale_tril_test.py
@@ -0,0 +1,69 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for ScaleTriL bijector."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.distributions.python.ops import bijectors
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import test
+
+
+class ScaleTriLBijectorTest(test.TestCase):
+  """Tests the correctness of the ScaleTriL bijector."""
+
+  def setUp(self):
+    self._rng = np.random.RandomState(42)
+
+  def testComputesCorrectValues(self):
+    shift = 1.61803398875
+    x = np.float32(np.array([-1, .5, 2]))
+    y = np.float32(np.array([[np.exp(2) + shift, 0.],
+                             [.5, np.exp(-1) + shift]]))
+
+    b = bijectors.ScaleTriL(diag_bijector=bijectors.Exp(),
+                            diag_shift=shift)
+
+    y_ = self.evaluate(b.forward(x))
+    self.assertAllClose(y, y_)
+
+    x_ = self.evaluate(b.inverse(y))
+    self.assertAllClose(x, x_)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testInvertible(self):
+
+    # Generate random inputs from an unconstrained space, with
+    # event size 6 to specify 3x3 triangular matrices.
+    batch_shape = [2, 1]
+    x = np.float32(np.random.randn(*(batch_shape + [6])))
+    b = bijectors.ScaleTriL(diag_bijector=bijectors.Softplus(),
+                            diag_shift=3.14159)
+    y = self.evaluate(b.forward(x))
+    self.assertAllEqual(y.shape, batch_shape + [3, 3])
+
+    x_ = self.evaluate(b.inverse(y))
+    self.assertAllClose(x, x_)
+
+    fldj = self.evaluate(b.forward_log_det_jacobian(x, event_ndims=1))
+    ildj = self.evaluate(b.inverse_log_det_jacobian(y, event_ndims=2))
+    self.assertAllClose(fldj, -ildj)
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/__init__.py b/tensorflow/contrib/distributions/python/ops/bijectors/__init__.py
index d97a1f0d30..e141f8b5c6 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/__init__.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/__init__.py
@@ -37,6 +37,7 @@
 @@PowerTransform
 @@RealNVP
 @@Reshape
+@@ScaleTriL
 @@Sigmoid
 @@SinhArcsinh
 @@SoftmaxCentered
@@ -78,6 +79,7 @@ from tensorflow.contrib.distributions.python.ops.bijectors.permute import *
 from tensorflow.contrib.distributions.python.ops.bijectors.power_transform import *
 from tensorflow.contrib.distributions.python.ops.bijectors.real_nvp import *
 from tensorflow.contrib.distributions.python.ops.bijectors.reshape import *
+from tensorflow.contrib.distributions.python.ops.bijectors.scale_tril import *
 from tensorflow.contrib.distributions.python.ops.bijectors.sigmoid import *
 from tensorflow.contrib.distributions.python.ops.bijectors.sinh_arcsinh import *
 from tensorflow.contrib.distributions.python.ops.bijectors.softmax_centered import *
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/scale_tril.py b/tensorflow/contrib/distributions/python/ops/bijectors/scale_tril.py
new file mode 100644
index 0000000000..96bd242c63
--- /dev/null
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/scale_tril.py
@@ -0,0 +1,114 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""ScaleTriL bijector."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.distributions.python.ops.bijectors import affine_scalar
+from tensorflow.contrib.distributions.python.ops.bijectors import chain
+from tensorflow.contrib.distributions.python.ops.bijectors import fill_triangular
+from tensorflow.contrib.distributions.python.ops.bijectors import softplus
+from tensorflow.contrib.distributions.python.ops.bijectors import transform_diagonal
+
+__all__ = [
+    "ScaleTriL",
+]
+
+
+class ScaleTriL(chain.Chain):
+  """Transforms unconstrained vectors to TriL matrices with positive diagonal.
+
+  This is implemented as a simple `tfb.Chain` of `tfb.FillTriangular`
+  followed by `tfb.TransformDiagonal`, and provided mostly as a
+  convenience. The default setup is somewhat opinionated, using a
+  Softplus transformation followed by a small shift (`1e-5`) which
+  attempts to avoid numerical issues from zeros on the diagonal.
+
+  #### Examples
+
+  ```python
+  tfb = tf.contrib.distributions.bijectors
+  b = tfb.ScaleTriL(
+       diag_bijector=tfb.Exp(),
+       diag_shift=None)
+  b.forward(x=[0., 0., 0.])
+  # Result: [[1., 0.],
+  #          [0., 1.]]
+  b.inverse(y=[[1., 0],
+               [.5, 2]])
+  # Result: [log(2), .5, log(1)]
+
+  # Define a distribution over PSD matrices of shape `[3, 3]`,
+  # with `1 + 2 + 3 = 6` degrees of freedom.
+  dist = tfd.TransformedDistribution(
+          tfd.Normal(tf.zeros(6), tf.ones(6)),
+          tfb.Chain([tfb.CholeskyOuterProduct(), tfb.ScaleTriL()]))
+
+  # Using an identity transformation, ScaleTriL is equivalent to
+  # tfb.FillTriangular.
+  b = tfb.ScaleTriL(
+       diag_bijector=tfb.Identity(),
+       diag_shift=None)
+
+  # For greater control over initialization, one can manually encode
+  # pre- and post- shifts inside of `diag_bijector`.
+  b = tfb.ScaleTriL(
+       diag_bijector=tfb.Chain([
+         tfb.AffineScalar(shift=1e-3),
+         tfb.Softplus(),
+         tfb.AffineScalar(shift=0.5413)]),  # softplus_inverse(1.)
+                                            #  = log(expm1(1.)) = 0.5413
+       diag_shift=None)
+  ```
+  """
+
+  def __init__(self,
+               diag_bijector=None,
+               diag_shift=1e-5,
+               validate_args=False,
+               name="scale_tril"):
+    """Instantiates the `ScaleTriL` bijector.
+
+    Args:
+      diag_bijector: `Bijector` instance, used to transform the output diagonal
+        to be positive.
+        Default value: `None` (i.e., `tfb.Softplus()`).
+      diag_shift: Float value broadcastable and added to all diagonal entries
+        after applying the `diag_bijector`. Setting a positive
+        value forces the output diagonal entries to be positive, but
+        prevents inverting the transformation for matrices with
+        diagonal entries less than this value.
+        Default value: `1e-5`; pass `None` to apply no shift.
+      validate_args: Python `bool` indicating whether arguments should be
+        checked for correctness.
+        Default value: `False` (i.e., arguments are not validated).
+      name: Python `str` name given to ops managed by this object.
+        Default value: `scale_tril`.
+    """
+
+    if diag_bijector is None:
+      diag_bijector = softplus.Softplus(validate_args=validate_args)
+
+    if diag_shift is not None:
+      diag_bijector = chain.Chain([affine_scalar.AffineScalar(shift=diag_shift),
+                                   diag_bijector])
+
+    super(ScaleTriL, self).__init__(
+        [transform_diagonal.TransformDiagonal(diag_bijector=diag_bijector),
+         fill_triangular.FillTriangular()],
+        validate_args=validate_args,
+        name=name)
-- 
GitLab


From 5ad9d9cb933864e5eb938c31551d5ba861ced0f2 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 7 Jun 2018 16:02:37 -0700
Subject: [PATCH 458/610] Split out HloFftInstruction and
 HloSendRecvInstruction as subclasses from HloInstruction.

PiperOrigin-RevId: 199712253
---
 .../compiler/xla/service/hlo_instruction.cc   | 154 ++++++++----------
 .../compiler/xla/service/hlo_instruction.h    |  62 +++----
 .../compiler/xla/service/hlo_instructions.cc  | 150 ++++++++++++++++-
 .../compiler/xla/service/hlo_instructions.h   | 145 ++++++++++++++---
 4 files changed, 358 insertions(+), 153 deletions(-)

diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc
index 8d7604fae1..cf1530abe1 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction.cc
@@ -86,6 +86,31 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
                                         operands(2), operands(3), operands(4),
                                         proto.epsilon(), proto.feature_index());
       break;
+    case HloOpcode::kFft: {
+      CHECK_EQ(proto.operand_ids_size(), 1);
+      std::vector<int64> fft_length(proto.fft_length().begin(),
+                                    proto.fft_length().end());
+      instruction = CreateFft(proto.shape(), operands(0), proto.fft_type(),
+                              tensorflow::gtl::ArraySlice<int64>(fft_length));
+      break;
+    }
+    case HloOpcode::kSend:
+      CHECK_EQ(proto.operand_ids_size(), 1);
+      instruction = CreateSend(operands(0), proto.channel_id());
+      break;
+    case HloOpcode::kSendDone:
+      CHECK_EQ(proto.operand_ids_size(), 1);
+      instruction = CreateSendDone(operands(0));
+      break;
+    case HloOpcode::kRecv:
+      CHECK_EQ(proto.operand_ids_size(), 0);
+      instruction =
+          CreateRecv(proto.shape().tuple_shapes(0), proto.channel_id());
+      break;
+    case HloOpcode::kRecvDone:
+      CHECK_EQ(proto.operand_ids_size(), 1);
+      instruction = CreateRecvDone(operands(0));
+      break;
     default: {
       instruction = WrapUnique(new HloInstruction(opcode, proto.shape()));
       for (const int64 operand_id : proto.operand_ids()) {
@@ -181,14 +206,9 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
   }
   instruction->outfeed_config_ = proto.outfeed_config();
   instruction->distribution_ = proto.distribution();
-  instruction->channel_id_ = proto.channel_id();
   instruction->infeed_config_ = proto.infeed_config();
   instruction->custom_call_target_ = proto.custom_call_target();
   instruction->outfeed_shape_ = proto.outfeed_shape();
-  instruction->fft_type_ = proto.fft_type();
-  for (int64 fft_len : proto.fft_length()) {
-    instruction->fft_length_.push_back(fft_len);
-  }
 
   if (proto.has_sharding()) {
     TF_ASSIGN_OR_RETURN(const auto& sharding,
@@ -404,11 +424,7 @@ HloInstruction::CreateGetTupleElement(const Shape& shape,
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateFft(
     const Shape& shape, HloInstruction* operand, FftType fft_type,
     tensorflow::gtl::ArraySlice<int64> fft_length) {
-  auto instruction = WrapUnique(new HloInstruction(HloOpcode::kFft, shape));
-  instruction->AppendOperand(operand);
-  instruction->fft_type_ = fft_type;
-  instruction->fft_length_.assign(fft_length.begin(), fft_length.end());
-  return instruction;
+  return MakeUnique<HloFftInstruction>(shape, operand, fft_type, fft_length);
 }
 
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateDot(
@@ -490,48 +506,28 @@ HloInstruction::CreateCrossReplicaSum(
 
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateSend(
     HloInstruction* operand, int64 channel_id) {
-  // Send instruction produces a tuple of {aliased operand, U32 context}.
-  Shape output_shape = ShapeUtil::MakeTupleShape(
-      {operand->shape(), ShapeUtil::MakeShape(U32, {})});
-  auto instruction =
-      WrapUnique(new HloInstruction(HloOpcode::kSend, output_shape));
-  instruction->AppendOperand(operand);
-  instruction->channel_id_ = channel_id;
-  return instruction;
+  return MakeUnique<HloSendInstruction>(operand, channel_id);
 }
 
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateSendDone(
     HloInstruction* operand) {
-  CHECK(operand->opcode() == HloOpcode::kSend)
+  auto send_operand = DynCast<HloSendInstruction>(operand);
+  CHECK(send_operand != nullptr)
       << "SendDone must take the context operand from Send";
-  auto instruction = WrapUnique(
-      new HloInstruction(HloOpcode::kSendDone, ShapeUtil::MakeNil()));
-  instruction->AppendOperand(operand);
-  instruction->channel_id_ = operand->channel_id();
-  return instruction;
+  return MakeUnique<HloSendDoneInstruction>(send_operand);
 }
 
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateRecv(
     const Shape& shape, int64 channel_id) {
-  // Recv instruction produces a tuple of {receive buffer, U32 context}.
-  Shape output_shape =
-      ShapeUtil::MakeTupleShape({shape, ShapeUtil::MakeShape(U32, {})});
-  auto instruction =
-      WrapUnique(new HloInstruction(HloOpcode::kRecv, output_shape));
-  instruction->channel_id_ = channel_id;
-  return instruction;
+  return MakeUnique<HloRecvInstruction>(shape, channel_id);
 }
 
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateRecvDone(
     HloInstruction* operand) {
-  CHECK(operand->opcode() == HloOpcode::kRecv)
+  auto recv_operand = DynCast<HloRecvInstruction>(operand);
+  CHECK(recv_operand != nullptr)
       << "RecvDone must take the context operand from Recv";
-  Shape output_shape = ShapeUtil::GetTupleElementShape(operand->shape(), 0);
-  auto instruction =
-      WrapUnique(new HloInstruction(HloOpcode::kRecvDone, output_shape));
-  instruction->AppendOperand(operand);
-  instruction->channel_id_ = operand->channel_id();
-  return instruction;
+  return MakeUnique<HloRecvDoneInstruction>(recv_operand);
 }
 
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateReverse(
@@ -674,8 +670,8 @@ HloInstruction::CreateBatchNormTraining(const Shape& shape,
                                         HloInstruction* scale,
                                         HloInstruction* offset, float epsilon,
                                         int64 feature_index) {
-  return WrapUnique<HloInstruction>(new HloBatchNormTrainingInstruction(
-      shape, operand, scale, offset, epsilon, feature_index));
+  return MakeUnique<HloBatchNormTrainingInstruction>(
+      shape, operand, scale, offset, epsilon, feature_index);
 }
 
 /* static */ std::unique_ptr<HloInstruction>
@@ -683,8 +679,8 @@ HloInstruction::CreateBatchNormInference(
     const Shape& shape, HloInstruction* operand, HloInstruction* scale,
     HloInstruction* offset, HloInstruction* mean, HloInstruction* variance,
     float epsilon, int64 feature_index) {
-  return WrapUnique<HloInstruction>(new HloBatchNormInferenceInstruction(
-      shape, operand, scale, offset, mean, variance, epsilon, feature_index));
+  return MakeUnique<HloBatchNormInferenceInstruction>(
+      shape, operand, scale, offset, mean, variance, epsilon, feature_index);
 }
 
 /* static */ std::unique_ptr<HloInstruction>
@@ -693,9 +689,9 @@ HloInstruction::CreateBatchNormGrad(const Shape& shape, HloInstruction* operand,
                                     HloInstruction* variance,
                                     HloInstruction* grad_output, float epsilon,
                                     int64 feature_index) {
-  return WrapUnique<HloInstruction>(
-      new HloBatchNormGradInstruction(shape, operand, scale, mean, variance,
-                                      grad_output, epsilon, feature_index));
+  return MakeUnique<HloBatchNormGradInstruction>(shape, operand, scale, mean,
+                                                 variance, grad_output, epsilon,
+                                                 feature_index);
 }
 
 /* static */ std::unique_ptr<HloInstruction>
@@ -1287,6 +1283,11 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
     case HloOpcode::kBatchNormTraining:
     case HloOpcode::kBatchNormInference:
     case HloOpcode::kBatchNormGrad:
+    case HloOpcode::kFft:
+    case HloOpcode::kSend:
+    case HloOpcode::kSendDone:
+    case HloOpcode::kRecv:
+    case HloOpcode::kRecvDone:
       clone = CloneWithNewOperandsImpl(shape, new_operands, context);
       break;
     // Unary ops.
@@ -1395,10 +1396,6 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
       clone = CreateDot(shape, new_operands[0], new_operands[1],
                         *dot_dimension_numbers_);
       break;
-    case HloOpcode::kFft:
-      CHECK_EQ(new_operands.size(), 1);
-      clone = CreateFft(shape, new_operands[0], fft_type_, fft_length_);
-      break;
     case HloOpcode::kCrossReplicaSum:
       clone = CreateCrossReplicaSum(shape, new_operands, to_apply());
       break;
@@ -1504,24 +1501,6 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
                                 true_computation(), new_operands[2],
                                 false_computation());
       break;
-    case HloOpcode::kSend:
-      CHECK_EQ(new_operands.size(), 1);
-      clone = CreateSend(new_operands[0], channel_id());
-      break;
-    case HloOpcode::kSendDone:
-      CHECK_EQ(new_operands.size(), 1);
-      clone = CreateSendDone(new_operands[0]);
-      break;
-    case HloOpcode::kRecv:
-      CHECK_EQ(new_operands.size(), 0);
-      // The shape is a tuple, but CreateRecv() wants the raw data shape.
-      clone =
-          CreateRecv(ShapeUtil::GetTupleElementShape(shape, 0), channel_id());
-      break;
-    case HloOpcode::kRecvDone:
-      CHECK_EQ(new_operands.size(), 1);
-      clone = CreateRecvDone(new_operands[0]);
-      break;
     case HloOpcode::kGather:
       CHECK_EQ(new_operands.size(), 2);
       clone = CreateGather(shape, new_operands[0], new_operands[1],
@@ -1855,11 +1834,6 @@ bool HloInstruction::IdenticalSlowPath(
                                            other.gather_dimension_numbers()) &&
              gather_window_bounds() == other.gather_window_bounds();
 
-    // FFT has various types & lengths.
-    case HloOpcode::kFft:
-      return fft_type() == other.fft_type() &&
-             fft_length() == other.fft_length();
-
     // Reduction results are determined by the reduction dimension and the
     // reduction computation.
     case HloOpcode::kReduce:
@@ -1915,10 +1889,6 @@ bool HloInstruction::IdenticalSlowPath(
     case HloOpcode::kInfeed:
     case HloOpcode::kOutfeed:
     case HloOpcode::kSort:
-    case HloOpcode::kRecv:
-    case HloOpcode::kRecvDone:
-    case HloOpcode::kSend:
-    case HloOpcode::kSendDone:
     case HloOpcode::kHostCompute:
       return false;
 
@@ -1927,6 +1897,11 @@ bool HloInstruction::IdenticalSlowPath(
     case HloOpcode::kBatchNormTraining:
     case HloOpcode::kBatchNormInference:
     case HloOpcode::kBatchNormGrad:
+    case HloOpcode::kFft:
+    case HloOpcode::kSend:
+    case HloOpcode::kSendDone:
+    case HloOpcode::kRecv:
+    case HloOpcode::kRecvDone:
       LOG(FATAL) << "Base class impl called for opcode with subclass: "
                  << opcode();
   }
@@ -2292,7 +2267,8 @@ string HloInstruction::OperandsToStringWithCanonicalNameMap(
 
 std::vector<string> HloInstruction::ExtraAttributesToString(
     const HloPrintOptions& options) const {
-  std::vector<string> extra;
+  std::vector<string> extra = ExtraAttributesToStringImpl(options);
+
   if (opcode() == HloOpcode::kFusion) {
     extra.push_back(StrCat("kind=", xla::ToString(fusion_kind())));
   }
@@ -2337,10 +2313,6 @@ std::vector<string> HloInstruction::ExtraAttributesToString(
     extra.push_back(
         StrCat("window_bounds={", Join(gather_window_bounds(), ","), "}"));
   }
-  if (opcode() == HloOpcode::kFft) {
-    extra.push_back(StrCat("fft_type=", FftType_Name(fft_type())));
-    extra.push_back(StrCat("fft_length={", Join(fft_length(), ","), "}"));
-  }
 
   if (options.print_subcomputation_mode() ==
       HloPrintOptions::PrintSubcomputationMode::kNameOnly) {
@@ -2411,10 +2383,6 @@ std::vector<string> HloInstruction::ExtraAttributesToString(
         break;
     }
   }
-  if (opcode() == HloOpcode::kSend || opcode() == HloOpcode::kRecv ||
-      opcode() == HloOpcode::kSendDone || opcode() == HloOpcode::kRecvDone) {
-    extra.push_back(StrCat("channel_id=", channel_id_));
-  }
 
   if (opcode() == HloOpcode::kGetTupleElement) {
     extra.push_back(StrCat("index=", tuple_index()));
@@ -2543,14 +2511,9 @@ HloInstructionProto HloInstruction::ToProto() const {
   if (opcode() == HloOpcode::kRng) {
     proto.set_distribution(distribution_);
   }
-  proto.set_channel_id(channel_id_);
   proto.set_infeed_config(infeed_config_);
   proto.set_custom_call_target(custom_call_target_);
   *proto.mutable_outfeed_shape() = outfeed_shape_;
-  proto.set_fft_type(fft_type_);
-  for (int64 fft_len : fft_length_) {
-    proto.add_fft_length(fft_len);
-  }
 
   if (has_sharding()) {
     *proto.mutable_sharding() = sharding().ToProto();
@@ -3617,4 +3580,15 @@ float HloInstruction::epsilon() const {
   return Cast<HloBatchNormInstruction>(this)->epsilon();
 }
 
+FftType HloInstruction::fft_type() const {
+  return Cast<HloFftInstruction>(this)->fft_type();
+}
+
+const std::vector<int64>& HloInstruction::fft_length() const {
+  return Cast<HloFftInstruction>(this)->fft_length();
+}
+
+int64 HloInstruction::channel_id() const {
+  return Cast<HloSendRecvInstruction>(this)->channel_id();
+}
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h
index b16837eaec..6232d55e1b 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.h
+++ b/tensorflow/compiler/xla/service/hlo_instruction.h
@@ -992,7 +992,7 @@ class HloInstruction {
   string OperandsToString(const HloPrintOptions& options) const;
 
   // Returns string representation of op-specific attributes.
-  virtual std::vector<string> ExtraAttributesToString(
+  std::vector<string> ExtraAttributesToString(
       const HloPrintOptions& options) const;
 
   // As ToString, but returns a shorter string.
@@ -1011,27 +1011,12 @@ class HloInstruction {
   HloInstruction* tracing() const;
   void set_tracing(HloInstruction* trace_instruction);
 
-  // Returns the channel id associated with the instruction. The id is
-  // shared between each Send/Recv pair and is globally unique to identify each
-  // channel.
-  //
-  // Precondition: opcode() == HloOpcode::kSend or HloOpcode::kRecv
-  int64 channel_id() const { return channel_id_; }
-
   // Returns the channel name associated with the instruction. The name is
   // used to identify host Send/Recv operations.
   //
   // Precondition: opcode() == HloOpcode::kHostCompute
   string channel_name() const { return channel_name_; }
 
-  // Delegates to HloBatchNormInstruction::feature_index.
-  // TODO(b/80131774): Remove this code.
-  int64 feature_index() const;
-
-  // Delegates to HloBatchNormInstruction::epsilon.
-  // TODO(b/80131774): Remove this code.
-  float epsilon() const;
-
   // Returns the infeed configuration string. The infeed configuration includes
   // any metadata needed for the backend compiler (e.g., infeed buffer address)
   // and is target-dependent.
@@ -1318,16 +1303,6 @@ class HloInstruction {
         MakeUnique<ConvolutionDimensionNumbers>(dnums);
   }
 
-  FftType fft_type() const {
-    CHECK_EQ(HloOpcode::kFft, opcode_);
-    return fft_type_;
-  }
-
-  const std::vector<int64>& fft_length() const {
-    CHECK_EQ(HloOpcode::kFft, opcode_);
-    return fft_length_;
-  }
-
   // Returns data on the dimension numbers used for a dot operation.
   const DotDimensionNumbers& dot_dimension_numbers() const {
     CHECK(dot_dimension_numbers_ != nullptr);
@@ -1526,6 +1501,25 @@ class HloInstruction {
   void RelayoutConstant(const Layout& new_layout,
                         const ShapeIndex& shape_index = {});
 
+  // Old methods kept for smooth subclassing transition BEGIN.
+  // TODO(b/80131774): Remove this code.
+
+  // Delegates to HloBatchNormInstruction::feature_index.
+  int64 feature_index() const;
+
+  // Delegates to HloBatchNormInstruction::epsilon.
+  float epsilon() const;
+
+  // Delegates to HloFftInstruction::fft_type.
+  FftType fft_type() const;
+
+  // Delegates to HloFftInstruction::fft_length.
+  const std::vector<int64>& fft_length() const;
+
+  // Delegates to HloSendRecvInstruction::channel_id.
+  int64 channel_id() const;
+  // Old methods kept for smooth subclassing transition END.
+
  protected:
   // Internal constructor for a given opcode/shape, other fields must be filled
   // by factory methods.
@@ -1544,6 +1538,12 @@ class HloInstruction {
     // TODO(b/80131774): This should be pure virtual.
     LOG(FATAL) << "Unimplemented method.";
   }
+
+  // Implementation for non-common logic of ExtraAttributesToString.
+  virtual std::vector<string> ExtraAttributesToStringImpl(
+      const HloPrintOptions& options) const {
+    return {};
+  }
   // Prints an instruction to a string.
   //
   // The canonical string representation needs to name operands and instruction
@@ -1675,12 +1675,6 @@ class HloInstruction {
   std::unique_ptr<GatherDimensionNumbers> gather_dimension_numbers_;
   std::vector<int64> gather_window_bounds_;
 
-  // Describes FFT type for an FFT instruction.
-  FftType fft_type_ = FftType::FFT;
-
-  // Indicates the FFT length for an FFT instruction.
-  std::vector<int64> fft_length_;
-
   // Describes the [begin, end) index range for a slice.
   std::vector<int64> slice_starts_;
   std::vector<int64> slice_limits_;
@@ -1755,10 +1749,6 @@ class HloInstruction {
   // Only present for kRng.
   RandomDistribution distribution_;
 
-  // Represents a unique identifier for each Send/Recv instruction pair.
-  // Only present for kSend or kRecv.
-  int64 channel_id_ = -1;
-
   // The string representation of the infeed configuration.
   string infeed_config_;
 
diff --git a/tensorflow/compiler/xla/service/hlo_instructions.cc b/tensorflow/compiler/xla/service/hlo_instructions.cc
index adbebb135b..109bf1a9bd 100644
--- a/tensorflow/compiler/xla/service/hlo_instructions.cc
+++ b/tensorflow/compiler/xla/service/hlo_instructions.cc
@@ -15,8 +15,11 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/hlo_instructions.h"
 
+#include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
+
 namespace xla {
 
+using ::tensorflow::str_util::Join;
 using ::tensorflow::strings::StrCat;
 
 HloBatchNormInstruction::HloBatchNormInstruction(
@@ -38,13 +41,6 @@ bool HloBatchNormInstruction::IdenticalSlowPath(
          epsilon() == casted_other.epsilon();
 }
 
-std::vector<string> HloBatchNormInstruction::ExtraAttributesToString(
-    const HloPrintOptions& options) const {
-  std::vector<string> extra = {StrCat("epsilon=", epsilon()),
-                               StrCat("feature_index=", feature_index())};
-  return extra;
-}
-
 HloInstructionProto HloBatchNormInstruction::ToProto() const {
   HloInstructionProto proto = HloInstruction::ToProto();
   proto.set_epsilon(epsilon_);
@@ -52,6 +48,12 @@ HloInstructionProto HloBatchNormInstruction::ToProto() const {
   return proto;
 }
 
+std::vector<string> HloBatchNormInstruction::ExtraAttributesToStringImpl(
+    const HloPrintOptions& options) const {
+  return {StrCat("epsilon=", epsilon()),
+          StrCat("feature_index=", feature_index())};
+}
+
 HloBatchNormTrainingInstruction::HloBatchNormTrainingInstruction(
     const Shape& shape, HloInstruction* operand, HloInstruction* scale,
     HloInstruction* offset, float epsilon, int64 feature_index)
@@ -115,4 +117,138 @@ HloBatchNormGradInstruction::CloneWithNewOperandsImpl(
       new_operands[4], epsilon(), feature_index());
 }
 
+HloFftInstruction::HloFftInstruction(
+    const Shape& shape, HloInstruction* operand, FftType fft_type,
+    tensorflow::gtl::ArraySlice<int64> fft_length)
+    : HloInstruction(HloOpcode::kFft, shape), fft_type_(fft_type) {
+  fft_length_.assign(fft_length.begin(), fft_length.end());
+  AppendOperand(operand);
+}
+
+HloInstructionProto HloFftInstruction::ToProto() const {
+  HloInstructionProto proto = HloInstruction::ToProto();
+  proto.set_fft_type(fft_type_);
+  for (int64 fft_len : fft_length_) {
+    proto.add_fft_length(fft_len);
+  }
+  return proto;
+}
+
+std::vector<string> HloFftInstruction::ExtraAttributesToStringImpl(
+    const HloPrintOptions& options) const {
+  return {StrCat("fft_type=", FftType_Name(fft_type())),
+          StrCat("fft_length={", Join(fft_length(), ","), "}")};
+}
+
+bool HloFftInstruction::IdenticalSlowPath(
+    const HloInstruction& other,
+    const std::function<bool(const HloComputation*, const HloComputation*)>&
+        eq_computations) const {
+  const auto& casted_other = static_cast<const HloFftInstruction&>(other);
+  return fft_type() == casted_other.fft_type() &&
+         fft_length() == casted_other.fft_length();
+}
+
+std::unique_ptr<HloInstruction> HloFftInstruction::CloneWithNewOperandsImpl(
+    const Shape& shape,
+    tensorflow::gtl::ArraySlice<HloInstruction*> new_operands,
+    HloCloneContext* context) const {
+  CHECK_EQ(new_operands.size(), 1);
+  return MakeUnique<HloFftInstruction>(shape, new_operands[0], fft_type_,
+                                       fft_length_);
+}
+
+HloSendRecvInstruction::HloSendRecvInstruction(HloOpcode opcode,
+                                               const Shape& shape,
+                                               int64 channel_id)
+    : HloInstruction(opcode, shape), channel_id_(channel_id) {}
+
+HloInstructionProto HloSendRecvInstruction::ToProto() const {
+  HloInstructionProto proto = HloInstruction::ToProto();
+  proto.set_channel_id(channel_id_);
+  return proto;
+}
+
+std::vector<string> HloSendRecvInstruction::ExtraAttributesToStringImpl(
+    const HloPrintOptions& options) const {
+  return {StrCat("channel_id=", channel_id_)};
+}
+
+bool HloSendRecvInstruction::IdenticalSlowPath(
+    const HloInstruction& other,
+    const std::function<bool(const HloComputation*, const HloComputation*)>&
+        eq_computations) const {
+  // Comparison is not yet supported: Send/Recv ops never compare identical.
+  return false;
+}
+
+// Send instruction produces a tuple of {aliased operand, U32 context}.
+HloSendInstruction::HloSendInstruction(HloInstruction* operand,
+                                       int64 channel_id)
+    : HloSendRecvInstruction(
+          HloOpcode::kSend,
+          ShapeUtil::MakeTupleShape(
+              {CHECK_NOTNULL(operand)->shape(), ShapeUtil::MakeShape(U32, {})}),
+          channel_id) {
+  AppendOperand(operand);
+}
+
+std::unique_ptr<HloInstruction> HloSendInstruction::CloneWithNewOperandsImpl(
+    const Shape& shape,
+    tensorflow::gtl::ArraySlice<HloInstruction*> new_operands,
+    HloCloneContext* context) const {
+  CHECK_EQ(new_operands.size(), 1);
+  return MakeUnique<HloSendInstruction>(new_operands[0], channel_id());
+}
+
+HloSendDoneInstruction::HloSendDoneInstruction(HloSendInstruction* operand)
+    : HloSendRecvInstruction(HloOpcode::kSendDone, ShapeUtil::MakeNil(),
+                             CHECK_NOTNULL(operand)->channel_id()) {
+  AppendOperand(operand);
+}
+
+std::unique_ptr<HloInstruction>
+HloSendDoneInstruction::CloneWithNewOperandsImpl(
+    const Shape& shape,
+    tensorflow::gtl::ArraySlice<HloInstruction*> new_operands,
+    HloCloneContext* context) const {
+  CHECK_EQ(new_operands.size(), 1);
+  return MakeUnique<HloSendDoneInstruction>(
+      Cast<HloSendInstruction>(new_operands[0]));
+}
+
+// Recv instruction produces a tuple of {receive buffer, U32 context}.
+HloRecvInstruction::HloRecvInstruction(const Shape& shape, int64 channel_id)
+    : HloSendRecvInstruction(
+          HloOpcode::kRecv,
+          ShapeUtil::MakeTupleShape({shape, ShapeUtil::MakeShape(U32, {})}),
+          channel_id) {}
+
+std::unique_ptr<HloInstruction> HloRecvInstruction::CloneWithNewOperandsImpl(
+    const Shape& shape,
+    tensorflow::gtl::ArraySlice<HloInstruction*> new_operands,
+    HloCloneContext* context) const {
+  CHECK_EQ(new_operands.size(), 0);
+  return MakeUnique<HloRecvInstruction>(
+      ShapeUtil::GetTupleElementShape(shape, 0), channel_id());
+}
+
+HloRecvDoneInstruction::HloRecvDoneInstruction(HloRecvInstruction* operand)
+    : HloSendRecvInstruction(
+          HloOpcode::kRecvDone,  // Arg order is unspecified: null-check both.
+          ShapeUtil::GetTupleElementShape(CHECK_NOTNULL(operand)->shape(), 0),
+          CHECK_NOTNULL(operand)->channel_id()) {
+  AppendOperand(operand);
+}
+
+std::unique_ptr<HloInstruction>
+HloRecvDoneInstruction::CloneWithNewOperandsImpl(
+    const Shape& shape,
+    tensorflow::gtl::ArraySlice<HloInstruction*> new_operands,
+    HloCloneContext* context) const {
+  CHECK_EQ(new_operands.size(), 1);
+  return MakeUnique<HloRecvDoneInstruction>(
+      Cast<HloRecvInstruction>(new_operands[0]));
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_instructions.h b/tensorflow/compiler/xla/service/hlo_instructions.h
index 6fcd96a8c6..22d2fe6b27 100644
--- a/tensorflow/compiler/xla/service/hlo_instructions.h
+++ b/tensorflow/compiler/xla/service/hlo_instructions.h
@@ -32,19 +32,18 @@ class HloBatchNormInstruction : public HloInstruction {
   // number added to the variance to avoid divide-by-zero error.
   float epsilon() const { return epsilon_; }
 
-  // Returns string representation of op-specific attributes.
-  std::vector<string> ExtraAttributesToString(
-      const HloPrintOptions& options) const override;
-
   // Returns a serialized representation of this instruction.
   HloInstructionProto ToProto() const override;
 
  protected:
-  HloBatchNormInstruction(HloOpcode opcode, const Shape& shape,
-                          HloInstruction* operand, HloInstruction* scale,
-                          float epsilon, int64 feature_index);
+  explicit HloBatchNormInstruction(HloOpcode opcode, const Shape& shape,
+                                   HloInstruction* operand,
+                                   HloInstruction* scale, float epsilon,
+                                   int64 feature_index);
 
  private:
+  std::vector<string> ExtraAttributesToStringImpl(
+      const HloPrintOptions& options) const override;
   bool IdenticalSlowPath(
       const HloInstruction& other,
       const std::function<bool(const HloComputation*, const HloComputation*)>&
@@ -58,9 +57,11 @@ class HloBatchNormInstruction : public HloInstruction {
 
 class HloBatchNormTrainingInstruction : public HloBatchNormInstruction {
  public:
-  HloBatchNormTrainingInstruction(const Shape& shape, HloInstruction* operand,
-                                  HloInstruction* scale, HloInstruction* offset,
-                                  float epsilon, int64 feature_index);
+  explicit HloBatchNormTrainingInstruction(const Shape& shape,
+                                           HloInstruction* operand,
+                                           HloInstruction* scale,
+                                           HloInstruction* offset,
+                                           float epsilon, int64 feature_index);
 
  private:
   // Implementation for non-common logic of CloneWithNewOperands.
@@ -72,11 +73,10 @@ class HloBatchNormTrainingInstruction : public HloBatchNormInstruction {
 
 class HloBatchNormInferenceInstruction : public HloBatchNormInstruction {
  public:
-  HloBatchNormInferenceInstruction(const Shape& shape, HloInstruction* operand,
-                                   HloInstruction* scale,
-                                   HloInstruction* offset, HloInstruction* mean,
-                                   HloInstruction* variance, float epsilon,
-                                   int64 feature_index);
+  explicit HloBatchNormInferenceInstruction(
+      const Shape& shape, HloInstruction* operand, HloInstruction* scale,
+      HloInstruction* offset, HloInstruction* mean, HloInstruction* variance,
+      float epsilon, int64 feature_index);
 
  private:
   // Implementation for non-common logic of CloneWithNewOperands.
@@ -88,11 +88,116 @@ class HloBatchNormInferenceInstruction : public HloBatchNormInstruction {
 
 class HloBatchNormGradInstruction : public HloBatchNormInstruction {
  public:
-  HloBatchNormGradInstruction(const Shape& shape, HloInstruction* operand,
-                              HloInstruction* scale, HloInstruction* mean,
-                              HloInstruction* variance,
-                              HloInstruction* grad_output, float epsilon,
-                              int64 feature_index);
+  explicit HloBatchNormGradInstruction(
+      const Shape& shape, HloInstruction* operand, HloInstruction* scale,
+      HloInstruction* mean, HloInstruction* variance,
+      HloInstruction* grad_output, float epsilon, int64 feature_index);
+
+ private:
+  // Implementation for non-common logic of CloneWithNewOperands.
+  std::unique_ptr<HloInstruction> CloneWithNewOperandsImpl(
+      const Shape& shape,
+      tensorflow::gtl::ArraySlice<HloInstruction*> new_operands,
+      HloCloneContext* context) const override;
+};
+
+class HloFftInstruction : public HloInstruction {
+ public:
+  explicit HloFftInstruction(const Shape& shape, HloInstruction* operand,
+                             FftType fft_type,
+                             tensorflow::gtl::ArraySlice<int64> fft_length);
+  FftType fft_type() const { return fft_type_; }
+
+  const std::vector<int64>& fft_length() const { return fft_length_; }
+
+  // Returns a serialized representation of this instruction.
+  HloInstructionProto ToProto() const override;
+
+ private:
+  std::vector<string> ExtraAttributesToStringImpl(
+      const HloPrintOptions& options) const override;
+  bool IdenticalSlowPath(
+      const HloInstruction& other,
+      const std::function<bool(const HloComputation*, const HloComputation*)>&
+          eq_computations) const override;
+
+  // Implementation for non-common logic of CloneWithNewOperands.
+  std::unique_ptr<HloInstruction> CloneWithNewOperandsImpl(
+      const Shape& shape,
+      tensorflow::gtl::ArraySlice<HloInstruction*> new_operands,
+      HloCloneContext* context) const override;
+
+  // Describes FFT type for an FFT instruction.
+  FftType fft_type_ = FftType::FFT;
+
+  // Indicates the FFT length for an FFT instruction.
+  std::vector<int64> fft_length_;
+};
+
+class HloSendRecvInstruction : public HloInstruction {
+ public:
+  // Returns the channel id associated with the instruction. The id is
+  // shared between each Send/Recv pair and is globally unique to identify each
+  // channel.
+  int64 channel_id() const { return channel_id_; }
+
+  // Returns a serialized representation of this instruction.
+  HloInstructionProto ToProto() const override;
+
+ protected:
+  explicit HloSendRecvInstruction(HloOpcode opcode, const Shape& shape,
+                                  int64 channel_id);
+
+ private:
+  std::vector<string> ExtraAttributesToStringImpl(
+      const HloPrintOptions& options) const override;
+  bool IdenticalSlowPath(
+      const HloInstruction& other,
+      const std::function<bool(const HloComputation*, const HloComputation*)>&
+          eq_computations) const override;
+  // Represents a unique identifier for each Send/Recv instruction pair.
+  int64 channel_id_;
+};
+
+class HloSendInstruction : public HloSendRecvInstruction {
+ public:
+  explicit HloSendInstruction(HloInstruction* operand, int64 channel_id);
+
+ private:
+  // Implementation for non-common logic of CloneWithNewOperands.
+  std::unique_ptr<HloInstruction> CloneWithNewOperandsImpl(
+      const Shape& shape,
+      tensorflow::gtl::ArraySlice<HloInstruction*> new_operands,
+      HloCloneContext* context) const override;
+};
+
+class HloSendDoneInstruction : public HloSendRecvInstruction {
+ public:
+  explicit HloSendDoneInstruction(HloSendInstruction* operand);
+
+ private:
+  // Implementation for non-common logic of CloneWithNewOperands.
+  std::unique_ptr<HloInstruction> CloneWithNewOperandsImpl(
+      const Shape& shape,
+      tensorflow::gtl::ArraySlice<HloInstruction*> new_operands,
+      HloCloneContext* context) const override;
+};
+
+class HloRecvInstruction : public HloSendRecvInstruction {
+ public:
+  explicit HloRecvInstruction(const Shape& shape, int64 channel_id);
+
+ private:
+  // Implementation for non-common logic of CloneWithNewOperands.
+  std::unique_ptr<HloInstruction> CloneWithNewOperandsImpl(
+      const Shape& shape,
+      tensorflow::gtl::ArraySlice<HloInstruction*> new_operands,
+      HloCloneContext* context) const override;
+};
+
+class HloRecvDoneInstruction : public HloSendRecvInstruction {
+ public:
+  explicit HloRecvDoneInstruction(HloRecvInstruction* operand);
 
  private:
   // Implementation for non-common logic of CloneWithNewOperands.
-- 
GitLab


From 80eb65f367c8a5b8a80e752984e001f2479761d6 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 7 Jun 2018 16:17:00 -0700
Subject: [PATCH 459/610] TOCO: return Status instead of crashing while
 converting "Conv".

PiperOrigin-RevId: 199714511
---
 .../contrib/lite/toco/import_tensorflow.cc    | 87 +++++++++++++------
 1 file changed, 62 insertions(+), 25 deletions(-)

diff --git a/tensorflow/contrib/lite/toco/import_tensorflow.cc b/tensorflow/contrib/lite/toco/import_tensorflow.cc
index b13a88a9eb..5cc999314c 100644
--- a/tensorflow/contrib/lite/toco/import_tensorflow.cc
+++ b/tensorflow/contrib/lite/toco/import_tensorflow.cc
@@ -48,6 +48,12 @@ limitations under the License.
 #include "tensorflow/core/public/session_options.h"
 #include "tensorflow/core/public/version.h"
 
+#define TOCO_RETURN_IF_ERROR(...)                       \
+  do {                                                  \
+    const ::toco::port::Status _status = (__VA_ARGS__); \
+    if (!_status.ok()) return _status;                  \
+  } while (0)
+
 using tensorflow::AttrValue;
 using tensorflow::DT_BOOL;
 using tensorflow::DT_FLOAT;
@@ -130,6 +136,37 @@ const AttrValue::ListValue& GetListAttr(const NodeDef& node,
   return attr.list();
 }
 
+// Returns OK if `attr_name` is absent from `node` or equals `expected_value`;
+// otherwise returns an error reporting both the expected and the actual value.
+Status CheckOptionalAttr(const NodeDef& node, const string& attr_name,
+                         const string& expected_value) {
+  if (!HasAttr(node, attr_name)) return Status::OK();
+  const string& value = GetStringAttr(node, attr_name);
+  if (value == expected_value) return Status::OK();
+  return Status(false, absl::StrCat("Unexpected value for attribute '",
+                                    attr_name, "'. Expected '", expected_value,
+                                    "', got '", value, "'"));
+}
+
+// DataType overload: returns OK if `attr_name` is absent or matches.
+Status CheckOptionalAttr(const NodeDef& node, const string& attr_name,
+                         const tensorflow::DataType& expected_value) {
+  if (!HasAttr(node, attr_name)) return Status::OK();
+  const tensorflow::DataType& value = GetDataTypeAttr(node, attr_name);
+  if (value == expected_value) return Status::OK();
+  return Status(false, absl::StrCat(
+      "Unexpected value for attribute '", attr_name, "'. Expected '",
+      tensorflow::DataType_Name(expected_value), "', got '",
+      tensorflow::DataType_Name(value), "'"));
+}
+
+template <typename T1, typename T2>
+Status ExpectValue(const T1& v1, const T2& v2, const string& description) {
+  if (v1 == v2) return Status::OK();
+  return Status(false, absl::StrCat("Unexpected ", description, ": got ", v1,
+                                    ", expected ", v2));
+}
+
 ArrayDataType ConvertDataType(tensorflow::DataType dtype) {
   if (dtype == DT_UINT8)
     return ArrayDataType::kUint8;
@@ -466,18 +503,16 @@ Status ConvertConstOperator(const NodeDef& node,
   return status;
 }
 
-void ConvertConvOperator(const NodeDef& node,
-                         const TensorFlowImportFlags& tf_import_flags,
-                         Model* model) {
+Status ConvertConvOperator(const NodeDef& node,
+                           const TensorFlowImportFlags& tf_import_flags,
+                           Model* model) {
   CHECK_EQ(node.op(), "Conv2D");
   CheckInputsCount(node, tf_import_flags, 2);
 
   // We only support NHWC, which is the default data_format.
   // So if data_format is not defined, we're all good.
-  if (HasAttr(node, "data_format")) {
-    CHECK_EQ(GetStringAttr(node, "data_format"), "NHWC");
-  }
-  CHECK_EQ(GetDataTypeAttr(node, "T"), DT_FLOAT);
+  TOCO_RETURN_IF_ERROR(CheckOptionalAttr(node, "data_format", "NHWC"));
+  TOCO_RETURN_IF_ERROR(CheckOptionalAttr(node, "T", DT_FLOAT));
 
   const auto& input_name = node.input(0);
   const auto& weights_name = node.input(1);
@@ -502,27 +537,27 @@ void ConvertConvOperator(const NodeDef& node,
   auto* conv = new ConvOperator;
   conv->inputs = {input_name, reordered_weights_name};
   conv->outputs = {node.name()};
+  TOCO_RETURN_IF_ERROR(
+      Status(HasAttr(node, "strides"), "Missing attribute 'strides'"));
   const auto& strides = GetListAttr(node, "strides");
-  CHECK_EQ(strides.i_size(), 4);
-  CHECK_EQ(strides.i(0), 1);
-  CHECK_EQ(strides.i(3), 1);
+  TOCO_RETURN_IF_ERROR(ExpectValue(strides.i_size(), 4, "number of strides"));
+  TOCO_RETURN_IF_ERROR(ExpectValue(strides.i(0), 1, "strides(0)"));
+  TOCO_RETURN_IF_ERROR(ExpectValue(strides.i(3), 1, "strides(3)"));
   conv->stride_height = strides.i(1);
   conv->stride_width = strides.i(2);
   if (HasAttr(node, "dilations")) {
     const auto& dilations = GetListAttr(node, "dilations");
-    CHECK_EQ(dilations.i_size(), 4);
-    CHECK_EQ(dilations.i(0), 1)
-        << "Can only import Conv ops with dilation along the height (1st) or "
-           "width (2nd) axis. TensorFlow op \""
-        << node.name() << "\" had dilations:[ " << dilations.i(0) << ", "
-        << dilations.i(1) << ", " << dilations.i(2) << ", " << dilations.i(3)
-        << "].";
-    CHECK_EQ(dilations.i(3), 1)
-        << "Can only import Conv ops with dilation along the height (1st) or "
-           "width (2nd) axis. TensorFlow op \""
-        << node.name() << "\" had dilations:[ " << dilations.i(0) << ", "
-        << dilations.i(1) << ", " << dilations.i(2) << ", " << dilations.i(3)
-        << "].";
+    TOCO_RETURN_IF_ERROR(
+        ExpectValue(dilations.i_size(), 4, "number of dilations"));
+    if (dilations.i(0) != 1 || dilations.i(3) != 1) {
+      return Status(
+          false, absl::StrCat(
+                     "Can only import Conv ops with dilation along the height "
+                     "(1st) or width (2nd) axis. TensorFlow op \"",
+                     node.name(), "\" had dilations:[ ", dilations.i(0), ", ",
+                     dilations.i(1), ", ", dilations.i(2), ", ", dilations.i(3),
+                     "]."));
+    }
     conv->dilation_height_factor = dilations.i(1);
     conv->dilation_width_factor = dilations.i(2);
   } else {
@@ -535,9 +570,11 @@ void ConvertConvOperator(const NodeDef& node,
   } else if (padding == "VALID") {
     conv->padding.type = PaddingType::kValid;
   } else {
-    LOG(FATAL) << "Bad padding (only SAME and VALID are supported)";
+    return Status(false, "Bad padding (only SAME and VALID are supported)");
   }
   model->operators.emplace_back(conv);
+
+  return Status::OK();
 }
 
 void ConvertDepthwiseConvOperator(const NodeDef& node,
@@ -1722,7 +1759,7 @@ Status ImportTensorFlowNode(const tensorflow::NodeDef& node,
   if (node.op() == "Const") {
     return ConvertConstOperator(node, tf_import_flags, model);
   } else if (node.op() == "Conv2D") {
-    ConvertConvOperator(node, tf_import_flags, model);
+    return ConvertConvOperator(node, tf_import_flags, model);
   } else if (node.op() == "Conv2DBackpropInput") {
     ConvertTransposeConvOperator(node, tf_import_flags, model);
   } else if (node.op() == "DepthwiseConv2dNative") {
-- 
GitLab


From 82f152ee75261afa3ae59ae7c9e18493d7e8b55e Mon Sep 17 00:00:00 2001
From: Shivani Agrawal <shivaniagrawal@google.com>
Date: Thu, 7 Jun 2018 16:44:51 -0700
Subject: [PATCH 460/610] [data-stats] Adds support to collect `features` and
 `feature-values` statistics from `Example` record of dataset.

This change-list also applies the `feature_stats()` transformation by default to the dataset built by `make_batched_features_dataset()`, so that feature statistics are collected in the associated stats_aggregator (if any).

PiperOrigin-RevId: 199718439
---
 .../contrib/data/python/kernel_tests/BUILD    |  28 ++-
 .../kernel_tests/reader_dataset_ops_test.py   | 207 +++--------------
 .../reader_dataset_ops_test_base.py           | 218 ++++++++++++++++++
 .../kernel_tests/stats_dataset_ops_test.py    |  45 +++-
 tensorflow/contrib/data/python/ops/BUILD      |   4 +-
 tensorflow/contrib/data/python/ops/readers.py |   3 +
 .../contrib/data/python/ops/stats_ops.py      |  21 ++
 .../api_def_FeatureStatsDataset.pbtxt         |   3 +
 .../api_def_FeatureStatsDataset.pbtxt         |   4 +
 tensorflow/core/kernels/data/BUILD            |   1 +
 .../core/kernels/data/stats_dataset_ops.cc    | 185 +++++++++++++++
 tensorflow/core/ops/dataset_ops.cc            |  12 +
 12 files changed, 547 insertions(+), 184 deletions(-)
 create mode 100644 tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test_base.py
 create mode 100644 tensorflow/core/api_def/base_api/api_def_FeatureStatsDataset.pbtxt
 create mode 100644 tensorflow/core/api_def/python_api/api_def_FeatureStatsDataset.pbtxt

diff --git a/tensorflow/contrib/data/python/kernel_tests/BUILD b/tensorflow/contrib/data/python/kernel_tests/BUILD
index ba707d8d6e..fd15103870 100644
--- a/tensorflow/contrib/data/python/kernel_tests/BUILD
+++ b/tensorflow/contrib/data/python/kernel_tests/BUILD
@@ -330,6 +330,26 @@ py_test(
     ],
 )
 
+py_library(
+    name = "reader_dataset_ops_test_base",
+    testonly = 1,
+    srcs = [
+        "reader_dataset_ops_test_base.py",
+    ],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:private"],
+    deps = [
+        "//tensorflow/contrib/data/python/ops:readers",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:lib",
+        "//tensorflow/python:parsing_ops",
+        "//tensorflow/python:util",
+        "//tensorflow/python/data/ops:readers",
+    ],
+)
+
 py_test(
     name = "reader_dataset_ops_test",
     size = "medium",
@@ -339,8 +359,8 @@ py_test(
     tags = ["no_pip"],
     deps = [
         ":dataset_serialization_test",
+        ":reader_dataset_ops_test_base",
         "//tensorflow/contrib/data/python/ops:readers",
-        "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:constant_op",
@@ -352,6 +372,7 @@ py_test(
         "//tensorflow/python:string_ops",
         "//tensorflow/python:util",
         "//tensorflow/python/data/ops:iterator_ops",
+        "//tensorflow/python/data/ops:readers",
         "//third_party/py/numpy",
     ],
 )
@@ -478,10 +499,15 @@ py_test(
     tags = ["no_pip"],
     deps = [
         ":dataset_serialization_test",
+        ":reader_dataset_ops_test_base",
         "//tensorflow/contrib/data/python/ops:stats_ops",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:errors",
+        "//tensorflow/python:framework_ops",
         "//tensorflow/python/data/ops:dataset_ops",
+        "//third_party/py/numpy",
     ],
 )
 
diff --git a/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test.py b/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test.py
index e0237198b7..3b07ef290b 100644
--- a/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test.py
@@ -24,9 +24,8 @@ import zlib
 import numpy as np
 
 from tensorflow.contrib.data.python.kernel_tests import dataset_serialization_test_base
+from tensorflow.contrib.data.python.kernel_tests import reader_dataset_ops_test_base
 from tensorflow.contrib.data.python.ops import readers
-from tensorflow.core.example import example_pb2
-from tensorflow.core.example import feature_pb2
 from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.data.ops import readers as core_readers
 from tensorflow.python.framework import constant_op
@@ -280,163 +279,8 @@ def _interleave(iterators, cycle_length):
           num_open -= 1
 
 
-class ReadBatchFeaturesTest(test.TestCase):
-
-  def setUp(self):
-    super(ReadBatchFeaturesTest, self).setUp()
-    self._num_files = 2
-    self._num_records = 7
-    self.test_filenames = self._createFiles()
-
-  def _read_batch_features(self,
-                           filenames,
-                           num_epochs,
-                           batch_size,
-                           reader_num_threads=1,
-                           parser_num_threads=1,
-                           shuffle=False,
-                           shuffle_seed=None,
-                           drop_final_batch=False):
-    self.filenames = filenames
-    self.num_epochs = num_epochs
-    self.batch_size = batch_size
-
-    return readers.make_batched_features_dataset(
-        file_pattern=self.filenames,
-        batch_size=self.batch_size,
-        features={
-            "file": parsing_ops.FixedLenFeature([], dtypes.int64),
-            "record": parsing_ops.FixedLenFeature([], dtypes.int64),
-            "keywords": parsing_ops.VarLenFeature(dtypes.string)
-        },
-        reader=core_readers.TFRecordDataset,
-        num_epochs=self.num_epochs,
-        shuffle=shuffle,
-        shuffle_seed=shuffle_seed,
-        reader_num_threads=reader_num_threads,
-        parser_num_threads=parser_num_threads,
-        drop_final_batch=drop_final_batch).make_one_shot_iterator(
-        ).get_next()
-
-  def _record(self, f, r):
-    example = example_pb2.Example(
-        features=feature_pb2.Features(
-            feature={
-                "file":
-                    feature_pb2.Feature(
-                        int64_list=feature_pb2.Int64List(value=[f])),
-                "record":
-                    feature_pb2.Feature(
-                        int64_list=feature_pb2.Int64List(value=[r])),
-                "keywords":
-                    feature_pb2.Feature(
-                        bytes_list=feature_pb2.BytesList(
-                            value=self._get_keywords(f, r)))
-            }))
-    return example.SerializeToString()
-
-  def _get_keywords(self, f, r):
-    num_keywords = 1 + (f + r) % 2
-    keywords = []
-    for index in range(num_keywords):
-      keywords.append(compat.as_bytes("keyword%d" % index))
-    return keywords
-
-  def _createFiles(self):
-    filenames = []
-    for i in range(self._num_files):
-      fn = os.path.join(self.get_temp_dir(), "tf_record.%d.txt" % i)
-      filenames.append(fn)
-      writer = python_io.TFRecordWriter(fn)
-      for j in range(self._num_records):
-        writer.write(self._record(i, j))
-      writer.close()
-    return filenames
-
-  def _run_actual_batch(self, outputs, sess):
-    file_op = outputs["file"]
-    keywords_indices_op = outputs["keywords"].indices
-    keywords_values_op = outputs["keywords"].values
-    keywords_dense_shape_op = outputs["keywords"].dense_shape
-    record_op = outputs["record"]
-    return sess.run([
-        file_op, keywords_indices_op, keywords_values_op,
-        keywords_dense_shape_op, record_op
-    ])
-
-  def _next_actual_batch(self, sess):
-    return self._run_actual_batch(self.outputs, sess)
-
-  def _next_expected_batch(self,
-                           file_indices,
-                           batch_size,
-                           num_epochs,
-                           cycle_length=1):
-
-    def _next_record(file_indices):
-      for j in file_indices:
-        for i in range(self._num_records):
-          yield j, i
-
-    def _next_record_interleaved(file_indices, cycle_length):
-      return _interleave([_next_record([i]) for i in file_indices],
-                         cycle_length)
-
-    file_batch = []
-    keywords_batch_indices = []
-    keywords_batch_values = []
-    keywords_batch_max_len = 0
-    record_batch = []
-    batch_index = 0
-    for _ in range(num_epochs):
-      if cycle_length == 1:
-        next_records = _next_record(file_indices)
-      else:
-        next_records = _next_record_interleaved(file_indices, cycle_length)
-      for record in next_records:
-        f = record[0]
-        r = record[1]
-        file_batch.append(f)
-        record_batch.append(r)
-        keywords = self._get_keywords(f, r)
-        keywords_batch_values.extend(keywords)
-        keywords_batch_indices.extend(
-            [[batch_index, i] for i in range(len(keywords))])
-        batch_index += 1
-        keywords_batch_max_len = max(keywords_batch_max_len, len(keywords))
-        if len(file_batch) == batch_size:
-          yield [
-              file_batch, keywords_batch_indices, keywords_batch_values,
-              [batch_size, keywords_batch_max_len], record_batch
-          ]
-          file_batch = []
-          keywords_batch_indices = []
-          keywords_batch_values = []
-          keywords_batch_max_len = 0
-          record_batch = []
-          batch_index = 0
-    if file_batch:
-      yield [
-          file_batch, keywords_batch_indices, keywords_batch_values,
-          [len(file_batch), keywords_batch_max_len], record_batch
-      ]
-
-  def _verify_records(self,
-                      sess,
-                      batch_size,
-                      file_index=None,
-                      num_epochs=1,
-                      interleave_cycle_length=1):
-    if file_index is not None:
-      file_indices = [file_index]
-    else:
-      file_indices = range(self._num_files)
-
-    for expected_batch in self._next_expected_batch(
-        file_indices, batch_size, num_epochs, interleave_cycle_length):
-      actual_batch = self._next_actual_batch(sess)
-      for i in range(len(expected_batch)):
-        self.assertAllEqual(expected_batch[i], actual_batch[i])
+class ReadBatchFeaturesTest(
+    reader_dataset_ops_test_base.ReadBatchFeaturesTestBase):
 
   def testRead(self):
     for batch_size in [1, 2]:
@@ -444,33 +288,33 @@ class ReadBatchFeaturesTest(test.TestCase):
         with ops.Graph().as_default() as g:
           with self.test_session(graph=g) as sess:
             # Basic test: read from file 0.
-            self.outputs = self._read_batch_features(
+            self.outputs = self.make_batch_feature(
                 filenames=self.test_filenames[0],
                 num_epochs=num_epochs,
-                batch_size=batch_size)
-            self._verify_records(sess, batch_size, 0, num_epochs=num_epochs)
+                batch_size=batch_size).make_one_shot_iterator().get_next()
+            self.verify_records(sess, batch_size, 0, num_epochs=num_epochs)
             with self.assertRaises(errors.OutOfRangeError):
               self._next_actual_batch(sess)
 
         with ops.Graph().as_default() as g:
           with self.test_session(graph=g) as sess:
             # Basic test: read from file 1.
-            self.outputs = self._read_batch_features(
+            self.outputs = self.make_batch_feature(
                 filenames=self.test_filenames[1],
                 num_epochs=num_epochs,
-                batch_size=batch_size)
-            self._verify_records(sess, batch_size, 1, num_epochs=num_epochs)
+                batch_size=batch_size).make_one_shot_iterator().get_next()
+            self.verify_records(sess, batch_size, 1, num_epochs=num_epochs)
             with self.assertRaises(errors.OutOfRangeError):
               self._next_actual_batch(sess)
 
         with ops.Graph().as_default() as g:
           with self.test_session(graph=g) as sess:
             # Basic test: read from both files.
-            self.outputs = self._read_batch_features(
+            self.outputs = self.make_batch_feature(
                 filenames=self.test_filenames,
                 num_epochs=num_epochs,
-                batch_size=batch_size)
-            self._verify_records(sess, batch_size, num_epochs=num_epochs)
+                batch_size=batch_size).make_one_shot_iterator().get_next()
+            self.verify_records(sess, batch_size, num_epochs=num_epochs)
             with self.assertRaises(errors.OutOfRangeError):
               self._next_actual_batch(sess)
 
@@ -504,18 +348,18 @@ class ReadBatchFeaturesTest(test.TestCase):
       # Test that shuffling with same seed produces the same result.
       with ops.Graph().as_default() as g:
         with self.test_session(graph=g) as sess:
-          outputs1 = self._read_batch_features(
+          outputs1 = self.make_batch_feature(
               filenames=self.test_filenames[0],
               num_epochs=num_epochs,
               batch_size=batch_size,
               shuffle=True,
-              shuffle_seed=5)
-          outputs2 = self._read_batch_features(
+              shuffle_seed=5).make_one_shot_iterator().get_next()
+          outputs2 = self.make_batch_feature(
               filenames=self.test_filenames[0],
               num_epochs=num_epochs,
               batch_size=batch_size,
               shuffle=True,
-              shuffle_seed=5)
+              shuffle_seed=5).make_one_shot_iterator().get_next()
           for _ in range(total_records // batch_size):
             batch1 = self._run_actual_batch(outputs1, sess)
             batch2 = self._run_actual_batch(outputs2, sess)
@@ -525,18 +369,18 @@ class ReadBatchFeaturesTest(test.TestCase):
       # Test that shuffling with different seeds produces a different order.
       with ops.Graph().as_default() as g:
         with self.test_session(graph=g) as sess:
-          outputs1 = self._read_batch_features(
+          outputs1 = self.make_batch_feature(
               filenames=self.test_filenames[0],
               num_epochs=num_epochs,
               batch_size=batch_size,
               shuffle=True,
-              shuffle_seed=5)
-          outputs2 = self._read_batch_features(
+              shuffle_seed=5).make_one_shot_iterator().get_next()
+          outputs2 = self.make_batch_feature(
               filenames=self.test_filenames[0],
               num_epochs=num_epochs,
               batch_size=batch_size,
               shuffle=True,
-              shuffle_seed=15)
+              shuffle_seed=15).make_one_shot_iterator().get_next()
           all_equal = True
           for _ in range(total_records // batch_size):
             batch1 = self._run_actual_batch(outputs1, sess)
@@ -552,13 +396,14 @@ class ReadBatchFeaturesTest(test.TestCase):
         for parser_num_threads in [2, 4]:
           with ops.Graph().as_default() as g:
             with self.test_session(graph=g) as sess:
-              self.outputs = self._read_batch_features(
+              self.outputs = self.make_batch_feature(
                   filenames=self.test_filenames,
                   num_epochs=num_epochs,
                   batch_size=batch_size,
                   reader_num_threads=reader_num_threads,
-                  parser_num_threads=parser_num_threads)
-              self._verify_records(
+                  parser_num_threads=parser_num_threads).make_one_shot_iterator(
+                  ).get_next()
+              self.verify_records(
                   sess,
                   batch_size,
                   num_epochs=num_epochs,
@@ -571,11 +416,11 @@ class ReadBatchFeaturesTest(test.TestCase):
       for num_epochs in [1, 10]:
         with ops.Graph().as_default():
           # Basic test: read from file 0.
-          self.outputs = self._read_batch_features(
+          self.outputs = self.make_batch_feature(
               filenames=self.test_filenames[0],
               num_epochs=num_epochs,
               batch_size=batch_size,
-              drop_final_batch=True)
+              drop_final_batch=True).make_one_shot_iterator().get_next()
           for _, tensor in self.outputs.items():
             if isinstance(tensor, ops.Tensor):  # Guard against SparseTensor.
               self.assertEqual(tensor.shape[0], batch_size)
diff --git a/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test_base.py b/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test_base.py
new file mode 100644
index 0000000000..805a7c7b73
--- /dev/null
+++ b/tensorflow/contrib/data/python/kernel_tests/reader_dataset_ops_test_base.py
@@ -0,0 +1,218 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Base class for testing reader datasets."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+from tensorflow.contrib.data.python.ops import readers
+from tensorflow.core.example import example_pb2
+from tensorflow.core.example import feature_pb2
+from tensorflow.python.data.ops import readers as core_readers
+from tensorflow.python.framework import dtypes
+from tensorflow.python.lib.io import python_io
+from tensorflow.python.ops import parsing_ops
+from tensorflow.python.platform import test
+from tensorflow.python.util import compat
+
+
+class ReadBatchFeaturesTestBase(test.TestCase):
+  """Base class for setting up and testing `make_batched_features_dataset`."""
+
+  def setUp(self):
+    super(ReadBatchFeaturesTestBase, self).setUp()
+    self._num_files = 2
+    self._num_records = 7
+    self.test_filenames = self._createFiles()
+
+  def make_batch_feature(self,
+                         filenames,
+                         num_epochs,
+                         batch_size,
+                         reader_num_threads=1,
+                         parser_num_threads=1,
+                         shuffle=False,
+                         shuffle_seed=None,
+                         drop_final_batch=False):
+    self.filenames = filenames
+    self.num_epochs = num_epochs
+    self.batch_size = batch_size
+
+    return readers.make_batched_features_dataset(
+        file_pattern=self.filenames,
+        batch_size=self.batch_size,
+        features={
+            "file": parsing_ops.FixedLenFeature([], dtypes.int64),
+            "record": parsing_ops.FixedLenFeature([], dtypes.int64),
+            "keywords": parsing_ops.VarLenFeature(dtypes.string)
+        },
+        reader=core_readers.TFRecordDataset,
+        num_epochs=self.num_epochs,
+        shuffle=shuffle,
+        shuffle_seed=shuffle_seed,
+        reader_num_threads=reader_num_threads,
+        parser_num_threads=parser_num_threads,
+        drop_final_batch=drop_final_batch)
+
+  def _record(self, f, r):
+    example = example_pb2.Example(
+        features=feature_pb2.Features(
+            feature={
+                "file":
+                    feature_pb2.Feature(
+                        int64_list=feature_pb2.Int64List(value=[f])),
+                "record":
+                    feature_pb2.Feature(
+                        int64_list=feature_pb2.Int64List(value=[r])),
+                "keywords":
+                    feature_pb2.Feature(
+                        bytes_list=feature_pb2.BytesList(
+                            value=self._get_keywords(f, r)))
+            }))
+    return example.SerializeToString()
+
+  def _get_keywords(self, f, r):
+    """Returns the keyword bytes for record `r` of file `f` (one or two)."""
+    count = 1 + (f + r) % 2
+    return [
+        compat.as_bytes("keyword%d" % position) for position in range(count)
+    ]
+
+  def _sum_keywords(self, num_files):
+    """Returns the total keyword count over the first `num_files` files."""
+    return sum(
+        1 + (file_index + record_index) % 2
+        for file_index in range(num_files)
+        for record_index in range(self._num_records))
+
+  def _createFiles(self):
+    filenames = []
+    for i in range(self._num_files):
+      fn = os.path.join(self.get_temp_dir(), "tf_record.%d.txt" % i)
+      filenames.append(fn)
+      writer = python_io.TFRecordWriter(fn)
+      for j in range(self._num_records):
+        writer.write(self._record(i, j))
+      writer.close()
+    return filenames
+
+  def _run_actual_batch(self, outputs, sess):
+    file_op = outputs["file"]
+    keywords_indices_op = outputs["keywords"].indices
+    keywords_values_op = outputs["keywords"].values
+    keywords_dense_shape_op = outputs["keywords"].dense_shape
+    record_op = outputs["record"]
+    return sess.run([
+        file_op, keywords_indices_op, keywords_values_op,
+        keywords_dense_shape_op, record_op
+    ])
+
+  def _next_actual_batch(self, sess):
+    return self._run_actual_batch(self.outputs, sess)
+
+  def _interleave(self, iterators, cycle_length):
+    """Yields items round-robin from at most `cycle_length` open iterators."""
+    pending_iterators = list(iterators)  # Do not mutate the caller's list.
+    open_iterators = []
+    for _ in range(cycle_length):
+      if pending_iterators:
+        open_iterators.append(pending_iterators.pop(0))
+    num_open = len(open_iterators)
+
+    while num_open:
+      for i in range(min(cycle_length, len(open_iterators))):
+        if open_iterators[i] is None:
+          continue
+        try:
+          yield next(open_iterators[i])
+        except StopIteration:
+          if pending_iterators:
+            open_iterators[i] = pending_iterators.pop(0)
+          else:
+            open_iterators[i] = None
+            num_open -= 1
+
+  def _next_expected_batch(self,
+                           file_indices,
+                           batch_size,
+                           num_epochs,
+                           cycle_length=1):
+
+    def _next_record(file_indices):
+      for j in file_indices:
+        for i in range(self._num_records):
+          yield j, i
+
+    def _next_record_interleaved(file_indices, cycle_length):
+      return self._interleave([_next_record([i]) for i in file_indices],
+                              cycle_length)
+
+    file_batch = []
+    keywords_batch_indices = []
+    keywords_batch_values = []
+    keywords_batch_max_len = 0
+    record_batch = []
+    batch_index = 0
+    for _ in range(num_epochs):
+      if cycle_length == 1:
+        next_records = _next_record(file_indices)
+      else:
+        next_records = _next_record_interleaved(file_indices, cycle_length)
+      for record in next_records:
+        f = record[0]
+        r = record[1]
+        file_batch.append(f)
+        record_batch.append(r)
+        keywords = self._get_keywords(f, r)
+        keywords_batch_values.extend(keywords)
+        keywords_batch_indices.extend(
+            [[batch_index, i] for i in range(len(keywords))])
+        batch_index += 1
+        keywords_batch_max_len = max(keywords_batch_max_len, len(keywords))
+        if len(file_batch) == batch_size:
+          yield [
+              file_batch, keywords_batch_indices, keywords_batch_values,
+              [batch_size, keywords_batch_max_len], record_batch
+          ]
+          file_batch = []
+          keywords_batch_indices = []
+          keywords_batch_values = []
+          keywords_batch_max_len = 0
+          record_batch = []
+          batch_index = 0
+    if file_batch:
+      yield [
+          file_batch, keywords_batch_indices, keywords_batch_values,
+          [len(file_batch), keywords_batch_max_len], record_batch
+      ]
+
+  def verify_records(self,
+                     sess,
+                     batch_size,
+                     file_index=None,
+                     num_epochs=1,
+                     interleave_cycle_length=1):
+    """Asserts that `self.outputs` yields the expected batches, in order."""
+    if file_index is None:
+      file_indices = range(self._num_files)
+    else:
+      file_indices = [file_index]
+    for expected_batch in self._next_expected_batch(
+        file_indices, batch_size, num_epochs, interleave_cycle_length):
+      actual_batch = self._next_actual_batch(sess)
+      for i, expected in enumerate(expected_batch):
+        self.assertAllEqual(expected, actual_batch[i])
diff --git a/tensorflow/contrib/data/python/kernel_tests/stats_dataset_ops_test.py b/tensorflow/contrib/data/python/kernel_tests/stats_dataset_ops_test.py
index 5c74ed6ae7..17b6644759 100644
--- a/tensorflow/contrib/data/python/kernel_tests/stats_dataset_ops_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/stats_dataset_ops_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.contrib.data.python.kernel_tests import dataset_serialization_test_base
+from tensorflow.contrib.data.python.kernel_tests import reader_dataset_ops_test_base
 from tensorflow.contrib.data.python.ops import stats_ops
 from tensorflow.core.framework import summary_pb2
 from tensorflow.python.data.ops import dataset_ops
@@ -29,7 +30,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
 
-class StatsDatasetTest(test.TestCase):
+class StatsDatasetTestBase(test.TestCase):
 
   def _assertSummaryHasCount(self, summary_str, tag, expected_value):
     summary_proto = summary_pb2.Summary()
@@ -49,6 +50,9 @@ class StatsDatasetTest(test.TestCase):
         return
     self.fail("Expected tag %r not found in summary %r" % (tag, summary_proto))
 
+
+class StatsDatasetTest(StatsDatasetTestBase):
+
   def testBytesProduced(self):
     stats_aggregator = stats_ops.StatsAggregator()
     dataset = dataset_ops.Dataset.range(100).map(
@@ -193,6 +197,45 @@ class StatsDatasetTest(test.TestCase):
       self._assertSummaryHasCount(sess.run(summary_t), "record_latency", 200.0)
 
 
+class FeatureStatsDatasetTest(
+    StatsDatasetTestBase,
+    reader_dataset_ops_test_base.ReadBatchFeaturesTestBase):
+
+  def testFeaturesStats(self):
+    num_epochs = 5
+    total_records = num_epochs * self._num_records
+    batch_size = 2
+    stats_aggregator = stats_ops.StatsAggregator()
+    dataset = self.make_batch_feature(
+        filenames=self.test_filenames[0],
+        num_epochs=num_epochs,
+        batch_size=batch_size,
+        shuffle=True,
+        shuffle_seed=5,
+        drop_final_batch=True).apply(
+            stats_ops.set_stats_aggregator(stats_aggregator))
+    iterator = dataset.make_initializable_iterator()
+    next_element = iterator.get_next()
+    summary_t = stats_aggregator.get_summary()
+
+    with self.test_session() as sess:
+      sess.run(iterator.initializer)
+      for _ in range(total_records // batch_size):
+        sess.run(next_element)
+
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+      self._assertSummaryHasCount(
+          sess.run(summary_t), "record_stats:features", total_records)
+      self._assertSummaryHasCount(
+          sess.run(summary_t), "record_stats:feature-values", total_records)
+      self._assertSummaryHasSum(
+          sess.run(summary_t), "record_stats:features", total_records * 3)
+      self._assertSummaryHasSum(
+          sess.run(summary_t), "record_stats:feature-values",
+          self._sum_keywords(1) * num_epochs + 2 * total_records)
+
+
 class StatsDatasetSerializationTest(
     dataset_serialization_test_base.DatasetSerializationTestBase):
 
diff --git a/tensorflow/contrib/data/python/ops/BUILD b/tensorflow/contrib/data/python/ops/BUILD
index 086661adb7..fc8ec5961c 100644
--- a/tensorflow/contrib/data/python/ops/BUILD
+++ b/tensorflow/contrib/data/python/ops/BUILD
@@ -96,8 +96,10 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":batching",
+        ":gen_dataset_ops",
         ":interleave_ops",
         ":shuffle_ops",
+        ":stats_ops",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:dataset_ops_gen",
         "//tensorflow/python:dtypes",
@@ -106,12 +108,12 @@ py_library(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:parsing_ops",
         "//tensorflow/python:platform",
-        "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:string_ops",
         "//tensorflow/python:tensor_shape",
         "//tensorflow/python:util",
         "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python/data/ops:readers",
+        "//tensorflow/python/data/util:convert",
         "//tensorflow/python/data/util:nest",
         "//third_party/py/numpy",
     ],
diff --git a/tensorflow/contrib/data/python/ops/readers.py b/tensorflow/contrib/data/python/ops/readers.py
index f938153f5f..83095c7ba1 100644
--- a/tensorflow/contrib/data/python/ops/readers.py
+++ b/tensorflow/contrib/data/python/ops/readers.py
@@ -26,6 +26,7 @@ from tensorflow.contrib.data.python.ops import batching
 from tensorflow.contrib.data.python.ops import gen_dataset_ops as contrib_gen_dataset_ops
 from tensorflow.contrib.data.python.ops import interleave_ops
 from tensorflow.contrib.data.python.ops import shuffle_ops
+from tensorflow.contrib.data.python.ops import stats_ops
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import readers as core_readers
 from tensorflow.python.data.util import convert
@@ -754,6 +755,8 @@ def make_batched_features_dataset(file_pattern,
   dataset = _maybe_shuffle_and_repeat(
       dataset, num_epochs, shuffle, shuffle_buffer_size, shuffle_seed)
 
+  dataset = dataset.apply(stats_ops.feature_stats("record_stats"))
+
   if drop_final_batch:
     dataset = dataset.apply(batching.batch_and_drop_remainder(batch_size))
   else:
diff --git a/tensorflow/contrib/data/python/ops/stats_ops.py b/tensorflow/contrib/data/python/ops/stats_ops.py
index 3cbaab5aff..8c30202ba7 100644
--- a/tensorflow/contrib/data/python/ops/stats_ops.py
+++ b/tensorflow/contrib/data/python/ops/stats_ops.py
@@ -176,6 +176,27 @@ def latency_stats(tag):
   return _apply_fn
 
 
+def feature_stats(tag):
+  """Records the features stats from `Example` records of the input dataset.
+
+  To consume the statistics, associate a `StatsAggregator` with the output
+  dataset.
+
+  Args:
+    tag: String. All statistics recorded by the returned transformation will be
+      associated with the given `tag`.
+
+  Returns:
+    A `Dataset` transformation function, which can be passed to
+    @{tf.data.Dataset.apply}.
+  """
+
+  def _apply_fn(dataset):
+    return _StatsDataset(dataset, gen_dataset_ops.feature_stats_dataset, tag)
+
+  return _apply_fn
+
+
 class _StatsDataset(dataset_ops.Dataset):
   """A `Dataset` that acts as an identity, and also records statistics."""
 
diff --git a/tensorflow/core/api_def/base_api/api_def_FeatureStatsDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_FeatureStatsDataset.pbtxt
new file mode 100644
index 0000000000..ffd01ba5cc
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_FeatureStatsDataset.pbtxt
@@ -0,0 +1,3 @@
+op {
+  graph_op_name: "FeatureStatsDataset"
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_FeatureStatsDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_FeatureStatsDataset.pbtxt
new file mode 100644
index 0000000000..7f721f4fb7
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_FeatureStatsDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "FeatureStatsDataset"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/kernels/data/BUILD b/tensorflow/core/kernels/data/BUILD
index da330e742e..6d2a04aa25 100644
--- a/tensorflow/core/kernels/data/BUILD
+++ b/tensorflow/core/kernels/data/BUILD
@@ -358,6 +358,7 @@ tf_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "//tensorflow/core:protos_all_cc",
     ],
 )
 
diff --git a/tensorflow/core/kernels/data/stats_dataset_ops.cc b/tensorflow/core/kernels/data/stats_dataset_ops.cc
index 7370a24b38..3e0a6ae049 100644
--- a/tensorflow/core/kernels/data/stats_dataset_ops.cc
+++ b/tensorflow/core/kernels/data/stats_dataset_ops.cc
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include "tensorflow/core/example/example.pb.h"
+#include "tensorflow/core/example/feature.pb.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/stats_aggregator.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -234,6 +236,189 @@ class BytesProducedStatsDatasetOp : public UnaryDatasetOpKernel {
   };
 };
 
+class FeatureStatsDatasetOp : public UnaryDatasetOpKernel {
+ public:
+  explicit FeatureStatsDatasetOp(OpKernelConstruction* ctx)
+      : UnaryDatasetOpKernel(ctx) {}
+
+  void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
+                   DatasetBase** output) override {
+    string tag;
+    OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "tag", &tag));
+    OP_REQUIRES(ctx, input->output_dtypes()[0] == DT_STRING,
+                errors::InvalidArgument("FeatureStatsDataset only supports "
+                                        "input with a single `tf.string` "
+                                        "component."));
+    *output = new Dataset(ctx, input, std::move(tag));
+  }
+
+ private:
+  class Dataset : public GraphDatasetBase {
+   public:
+    explicit Dataset(OpKernelContext* ctx, const DatasetBase* input, string tag)
+        : GraphDatasetBase(ctx), input_(input), tag_(std::move(tag)) {
+      input_->Ref();
+    }
+
+    ~Dataset() override { input_->Unref(); }
+
+    std::unique_ptr<IteratorBase> MakeIteratorInternal(
+        const string& prefix) const override {
+      return std::unique_ptr<IteratorBase>(new Iterator(
+          {this, strings::StrCat(prefix, "::FeatureStatsDataset")}));
+    }
+
+    const DataTypeVector& output_dtypes() const override {
+      return input_->output_dtypes();
+    }
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      return input_->output_shapes();
+    }
+
+    string DebugString() const override {
+      return "FeatureStatsDatasetOp::Dataset";
+    }
+
+   protected:
+    Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      Node* input_node;
+      TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_node));
+      Node* tag_node;
+      TF_RETURN_IF_ERROR(b->AddScalar(tag_, &tag_node));
+      TF_RETURN_IF_ERROR(b->AddDataset(this, {input_node, tag_node}, output));
+      return Status::OK();
+    }
+
+   private:
+    class Iterator : public DatasetIterator<Dataset> {
+     public:
+      explicit Iterator(const Params& params)
+          : DatasetIterator<Dataset>(params) {}
+
+      Status Initialize(IteratorContext* ctx) override {
+        return dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_);
+      }
+
+      Status GetNextInternal(IteratorContext* ctx,
+                             std::vector<Tensor>* out_tensors,
+                             bool* end_of_sequence) override {
+        tf_shared_lock l(mu_);
+        Status s = input_impl_->GetNext(ctx, out_tensors, end_of_sequence);
+        auto stats_aggregator = ctx->stats_aggregator();
+        if (stats_aggregator && s.ok() && !*end_of_sequence) {
+          for (const Tensor& t : *out_tensors) {
+            auto record_t = t.flat<string>();
+            Example example;
+            // TODO(shivaniagrawal): parsing here is redundant. Potential ways
+            // to improve performance are to a) add a ParseExampleDataset and
+            // collect stats from there, or b) change parse_example() so that
+            // it returns stats as well.
+            for (int i = 0; i < record_t.size(); ++i) {
+              if (example.ParseFromString(record_t(i))) {
+                AddStatsFeatures(example, stats_aggregator);
+              } else {
+                SequenceExample sequence_example;
+                if (sequence_example.ParseFromString(record_t(i))) {
+                  AddStatsFeatures(sequence_example, stats_aggregator);
+                }
+              }
+            }
+          }
+        }
+        return s;
+      }
+
+      // TODO(shivaniagrawal): Add features/feature-values to streamz metrics.
+      int AddStatsFeatureValues(const Feature& feature) {
+        int feature_values_list_size = 0;
+        switch (feature.kind_case()) {
+          case Feature::kBytesList: {
+            feature_values_list_size = feature.bytes_list().value().size();
+            break;
+          }
+          case Feature::kFloatList: {
+            feature_values_list_size = feature.float_list().value().size();
+            break;
+          }
+          case Feature::kInt64List: {
+            feature_values_list_size = feature.int64_list().value().size();
+            break;
+          }
+          case Feature::KIND_NOT_SET:
+            break;
+        }
+        return feature_values_list_size;
+      }
+
+      void AddStatsFeatures(
+          const Example& example,
+          const std::shared_ptr<StatsAggregator>& stats_aggregator) {
+        stats_aggregator->AddToHistogram(
+            strings::StrCat(dataset()->tag_, ":features"),
+            {static_cast<double>(example.features().feature().size())});
+
+        int feature_values_list_size_sum = 0;
+        for (const auto& feature : example.features().feature()) {
+          feature_values_list_size_sum += AddStatsFeatureValues(feature.second);
+        }
+        stats_aggregator->AddToHistogram(
+            strings::StrCat(dataset()->tag_, ":feature-values"),
+            {static_cast<double>(feature_values_list_size_sum)});
+      }
+
+      void AddStatsFeatures(
+          const SequenceExample& example,
+          const std::shared_ptr<StatsAggregator>& stats_aggregator) {
+        stats_aggregator->AddToHistogram(
+            strings::StrCat(dataset()->tag_, ":features"),
+            {static_cast<double>(
+                example.context().feature().size() +
+                example.feature_lists().feature_list().size())});
+
+        int feature_values_list_size_sum = 0;
+        for (const auto& feature : example.context().feature()) {
+          feature_values_list_size_sum += AddStatsFeatureValues(feature.second);
+        }
+
+        for (const auto& feature_list :
+             example.feature_lists().feature_list()) {
+          for (const auto& feature : feature_list.second.feature()) {
+            feature_values_list_size_sum += AddStatsFeatureValues(feature);
+          }
+        }
+
+        stats_aggregator->AddToHistogram(
+            strings::StrCat(dataset()->tag_, ":feature-values"),
+            {static_cast<double>(feature_values_list_size_sum)});
+      }
+
+     protected:
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        mutex_lock l(mu_);
+        TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
+        return Status::OK();
+      }
+
+      Status RestoreInternal(IteratorContext* ctx,
+                             IteratorStateReader* reader) override {
+        mutex_lock l(mu_);
+        TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
+        return Status::OK();
+      }
+
+     private:
+      mutex mu_;
+      std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
+    };
+
+    const DatasetBase* const input_;
+    const string tag_;
+  };
+};
+
+REGISTER_KERNEL_BUILDER(Name("FeatureStatsDataset").Device(DEVICE_CPU),
+                        FeatureStatsDatasetOp);
 REGISTER_KERNEL_BUILDER(Name("LatencyStatsDataset").Device(DEVICE_CPU),
                         LatencyStatsDatasetOp);
 REGISTER_KERNEL_BUILDER(Name("BytesProducedStatsDataset").Device(DEVICE_CPU),
diff --git a/tensorflow/core/ops/dataset_ops.cc b/tensorflow/core/ops/dataset_ops.cc
index 9bc6c9a30d..0e13d41977 100644
--- a/tensorflow/core/ops/dataset_ops.cc
+++ b/tensorflow/core/ops/dataset_ops.cc
@@ -166,6 +166,18 @@ REGISTER_OP("LatencyStatsDataset")
       return shape_inference::ScalarShape(c);
     });
 
+REGISTER_OP("FeatureStatsDataset")
+    .Input("input_dataset: variant")
+    .Input("tag: string")
+    .Output("handle: variant")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle tag_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &tag_shape));
+      return shape_inference::ScalarShape(c);
+    });
+
 REGISTER_OP("SetStatsAggregatorDataset")
     .Input("input_dataset: variant")
     .Input("stats_aggregator: resource")
-- 
GitLab


From 2bf2799ee80791107d4fe587ff9b6c7cf6c8b418 Mon Sep 17 00:00:00 2001
From: Asim Shankar <ashankar@google.com>
Date: Thu, 7 Jun 2018 16:49:27 -0700
Subject: [PATCH 461/610] C API: Fail gracefully if the serialized graph would
 be too large.

See #19657 for some motivation.
Without this explicit check, a large graph would trigger an assertion failure
in the protobuf codebase
(https://github.com/google/protobuf/blob/0456e269ee6505766474aa8d7b8bba7ac047f457/src/google/protobuf/message_lite.cc#L68)

Pull Request for google/protobuf: https://github.com/google/protobuf/pull/4739

PiperOrigin-RevId: 199719082
---
 tensorflow/c/c_api.cc | 17 ++++++++++++++++-
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/tensorflow/c/c_api.cc b/tensorflow/c/c_api.cc
index b86b277ac3..cb0b093ad2 100644
--- a/tensorflow/c/c_api.cc
+++ b/tensorflow/c/c_api.cc
@@ -631,7 +631,22 @@ Status MessageToBuffer(const tensorflow::protobuf::Message& in,
         "Failed to allocate memory to serialize message of type '",
         in.GetTypeName(), "' and size ", proto_size);
   }
-  in.SerializeToArray(buf, proto_size);
+  // SerializeToArray takes size as an int.
+  // This next 'if' is a workaround till we update to depend on a version
+  // of protocol buffers that includes
+  // https://github.com/google/protobuf/pull/4739
+  if (proto_size > std::numeric_limits<int>::max()) {
+    return InvalidArgument("Cannot serialize protocol buffer of type ",
+                           in.GetTypeName(), " as the serialized size (",
+                           proto_size,
+                           " bytes) would be larger than the limit (",
+                           std::numeric_limits<int>::max(), " bytes)");
+  }
+  if (!in.SerializeToArray(buf, proto_size)) {
+    return InvalidArgument("Unable to serialize ", in.GetTypeName(),
+                           " protocol buffer, perhaps the serialized size (",
+                           proto_size, " bytes) is too large?");
+  }
   out->data = buf;
   out->length = proto_size;
   out->data_deallocator = [](void* data, size_t length) {
-- 
GitLab


From 3bb7a913be6ba47df6fb1796dd8ce639cdbf1608 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 7 Jun 2018 17:18:10 -0700
Subject: [PATCH 462/610] Update ops-related pbtxt files.

PiperOrigin-RevId: 199722844
---
 .../core/ops/compat/ops_history.v1.pbtxt      | 27 +++++++++++++++++++
 tensorflow/core/ops/ops.pbtxt                 | 27 +++++++++++++++++++
 2 files changed, 54 insertions(+)

diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index 1b4bec7bc8..71f34b3abe 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -22112,6 +22112,33 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "FeatureStatsDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "tag"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "Fill"
   input_arg {
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index 1dfaeeabad..718c1510ed 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -10269,6 +10269,33 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "FeatureStatsDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "tag"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "Fill"
   input_arg {
-- 
GitLab


From 138e790ab9cb778430168d2b5f6abac1501aa2d8 Mon Sep 17 00:00:00 2001
From: David Majnemer <majnemer@google.com>
Date: Thu, 7 Jun 2018 17:19:25 -0700
Subject: [PATCH 463/610] [XLA] Handle kSlice correctly in HloCostAnalysis

Slice doesn't read the entire input. It only reads enough to make the output.

PiperOrigin-RevId: 199722987
---
 .../compiler/xla/service/hlo_cost_analysis.cc     |  3 ++-
 .../xla/service/hlo_cost_analysis_test.cc         | 15 +++++++++++++++
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
index 94c9c7eabc..b9d30ee802 100644
--- a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
+++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
@@ -172,7 +172,8 @@ Status HloCostAnalysis::HandleReverse(const HloInstruction*) {
   return Status::OK();
 }
 
-Status HloCostAnalysis::HandleSlice(const HloInstruction*) {
+Status HloCostAnalysis::HandleSlice(const HloInstruction* slice) {
+  current_properties_[kBytesAccessedKey] = shape_size_(slice->shape()) * 2;
   return Status::OK();
 }
 
diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc
index 16fdda8a8b..72adf09c83 100644
--- a/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc
@@ -460,5 +460,20 @@ TEST_F(HloCostAnalysisTest, BaseDilatedConvolution) {
   EXPECT_EQ(analysis.flop_count(), 1472);
 }
 
+TEST_F(HloCostAnalysisTest, Slice) {
+  // Test the analysis on a slice.
+  XlaBuilder builder("slice");
+  auto x = builder.Parameter(0, ShapeUtil::MakeShape(F32, {2}), "x");
+  auto slice = builder.Slice(x, {0}, {1}, {1});
+  auto hlo_module = BuildHloGraph(&builder);
+
+  // Run HLO cost analysis.
+  HloCostAnalysis analysis(ShapeSize);
+  ASSERT_IS_OK(
+      hlo_module->entry_computation()->root_instruction()->Accept(&analysis));
+
+  EXPECT_EQ(analysis.bytes_accessed(), 8);
+}
+
 }  // namespace
 }  // namespace xla
-- 
GitLab


From fba60ec27f4d415dafdf2ee916e2aa2004fa9635 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 7 Jun 2018 17:50:34 -0700
Subject: [PATCH 464/610] Go: Update generated wrapper functions for TensorFlow
 ops.

PiperOrigin-RevId: 199726426
---
 tensorflow/go/op/wrappers.go | 196 +++++++++++++++++------------------
 1 file changed, 98 insertions(+), 98 deletions(-)

diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go
index 6fc7087cb1..cdfd4b30e6 100644
--- a/tensorflow/go/op/wrappers.go
+++ b/tensorflow/go/op/wrappers.go
@@ -7579,6 +7579,69 @@ func DepthwiseConv2dNativeBackpropFilter(scope *Scope, input tf.Output, filter_s
 	return op.Output(0)
 }
 
+// Returns immutable tensor from memory region.
+//
+// The current implementation memmaps the tensor from a file.
+//
+// Arguments:
+//	dtype: Type of the returned tensor.
+//	shape: Shape of the returned tensor.
+//	memory_region_name: Name of readonly memory region used by the tensor, see
+// NewReadOnlyMemoryRegionFromFile in tensorflow::Env.
+func ImmutableConst(scope *Scope, dtype tf.DataType, shape tf.Shape, memory_region_name string) (tensor tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtype": dtype, "shape": shape, "memory_region_name": memory_region_name}
+	opspec := tf.OpSpec{
+		Type: "ImmutableConst",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// StringJoinAttr is an optional argument to StringJoin.
+type StringJoinAttr func(optionalAttr)
+
+// StringJoinSeparator sets the optional separator attribute to value.
+//
+// value: string, an optional join separator.
+// If not specified, defaults to ""
+func StringJoinSeparator(value string) StringJoinAttr {
+	return func(m optionalAttr) {
+		m["separator"] = value
+	}
+}
+
+// Joins the strings in the given list of string tensors into one tensor;
+//
+// with the given separator (default is an empty separator).
+//
+// Arguments:
+//	inputs: A list of string tensors.  The tensors must all have the same shape,
+// or be scalars.  Scalars may be mixed in; these will be broadcast to the shape
+// of non-scalar inputs.
+func StringJoin(scope *Scope, inputs []tf.Output, optional ...StringJoinAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "StringJoin",
+		Input: []tf.Input{
+			tf.OutputList(inputs),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // LRNGradAttr is an optional argument to LRNGrad.
 type LRNGradAttr func(optionalAttr)
 
@@ -17648,69 +17711,6 @@ func DeserializeManySparse(scope *Scope, serialized_sparse tf.Output, dtype tf.D
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// StringJoinAttr is an optional argument to StringJoin.
-type StringJoinAttr func(optionalAttr)
-
-// StringJoinSeparator sets the optional separator attribute to value.
-//
-// value: string, an optional join separator.
-// If not specified, defaults to ""
-func StringJoinSeparator(value string) StringJoinAttr {
-	return func(m optionalAttr) {
-		m["separator"] = value
-	}
-}
-
-// Joins the strings in the given list of string tensors into one tensor;
-//
-// with the given separator (default is an empty separator).
-//
-// Arguments:
-//	inputs: A list of string tensors.  The tensors must all have the same shape,
-// or be scalars.  Scalars may be mixed in; these will be broadcast to the shape
-// of non-scalar inputs.
-func StringJoin(scope *Scope, inputs []tf.Output, optional ...StringJoinAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "StringJoin",
-		Input: []tf.Input{
-			tf.OutputList(inputs),
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Returns immutable tensor from memory region.
-//
-// The current implementation memmaps the tensor from a file.
-//
-// Arguments:
-//	dtype: Type of the returned tensor.
-//	shape: Shape of the returned tensor.
-//	memory_region_name: Name of readonly memory region used by the tensor, see
-// NewReadOnlyMemoryRegionFromFile in tensorflow::Env.
-func ImmutableConst(scope *Scope, dtype tf.DataType, shape tf.Shape, memory_region_name string) (tensor tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtype": dtype, "shape": shape, "memory_region_name": memory_region_name}
-	opspec := tf.OpSpec{
-		Type: "ImmutableConst",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Inverse real-valued fast Fourier transform.
 //
 // Computes the inverse 1-dimensional discrete Fourier transform of a real-valued
@@ -25053,6 +25053,41 @@ func LatencyStatsDataset(scope *Scope, input_dataset tf.Output, tag tf.Output, o
 	return op.Output(0)
 }
 
+// Runs multiple additive regression ensemble predictors on input instances and
+//
+// computes the update to cached logits. It is designed to be used during training.
+// It traverses the trees starting from cached tree id and cached node id and
+// calculates the updates to be pushed to the cache.
+//
+// Arguments:
+//
+//	cached_tree_ids: Rank 1 Tensor containing cached tree ids which is the starting
+// tree of prediction.
+//	cached_node_ids: Rank 1 Tensor containing cached node id which is the starting
+// node of prediction.
+//	bucketized_features: A list of rank 1 Tensors containing bucket id for each
+// feature.
+//	logits_dimension: scalar, dimension of the logits, to be used for partial logits
+// shape.
+//
+// Returns Rank 2 Tensor containing logits update (with respect to cached
+// values stored) for each example.Rank 1 Tensor containing new tree ids for each example.Rank 1 Tensor containing new node ids in the new tree_ids.
+func BoostedTreesTrainingPredict(scope *Scope, tree_ensemble_handle tf.Output, cached_tree_ids tf.Output, cached_node_ids tf.Output, bucketized_features []tf.Output, logits_dimension int64) (partial_logits tf.Output, tree_ids tf.Output, node_ids tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"logits_dimension": logits_dimension}
+	opspec := tf.OpSpec{
+		Type: "BoostedTreesTrainingPredict",
+		Input: []tf.Input{
+			tree_ensemble_handle, cached_tree_ids, cached_node_ids, tf.OutputList(bucketized_features),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
 // MapSizeAttr is an optional argument to MapSize.
 type MapSizeAttr func(optionalAttr)
 
@@ -29812,41 +29847,6 @@ func BoostedTreesDeserializeEnsemble(scope *Scope, tree_ensemble_handle tf.Outpu
 	return scope.AddOperation(opspec)
 }
 
-// Runs multiple additive regression ensemble predictors on input instances and
-//
-// computes the update to cached logits. It is designed to be used during training.
-// It traverses the trees starting from cached tree id and cached node id and
-// calculates the updates to be pushed to the cache.
-//
-// Arguments:
-//
-//	cached_tree_ids: Rank 1 Tensor containing cached tree ids which is the starting
-// tree of prediction.
-//	cached_node_ids: Rank 1 Tensor containing cached node id which is the starting
-// node of prediction.
-//	bucketized_features: A list of rank 1 Tensors containing bucket id for each
-// feature.
-//	logits_dimension: scalar, dimension of the logits, to be used for partial logits
-// shape.
-//
-// Returns Rank 2 Tensor containing logits update (with respect to cached
-// values stored) for each example.Rank 1 Tensor containing new tree ids for each example.Rank 1 Tensor containing new node ids in the new tree_ids.
-func BoostedTreesTrainingPredict(scope *Scope, tree_ensemble_handle tf.Output, cached_tree_ids tf.Output, cached_node_ids tf.Output, bucketized_features []tf.Output, logits_dimension int64) (partial_logits tf.Output, tree_ids tf.Output, node_ids tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"logits_dimension": logits_dimension}
-	opspec := tf.OpSpec{
-		Type: "BoostedTreesTrainingPredict",
-		Input: []tf.Input{
-			tree_ensemble_handle, cached_tree_ids, cached_node_ids, tf.OutputList(bucketized_features),
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
 // Elementwise computes the bitwise AND of `x` and `y`.
 //
 // The result will have those bits set, that are set in both `x` and `y`. The
-- 
GitLab


From b941a031e8a2eb67e0083d8aa6ffe5a3ffe96f7b Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 7 Jun 2018 18:07:36 -0700
Subject: [PATCH 465/610] Pass checkpoint_path to predicate functions for
 experiment.continuous_eval even in the case of falsy eval_results

PiperOrigin-RevId: 199728382
---
 tensorflow/contrib/learn/python/learn/experiment.py      | 2 +-
 tensorflow/contrib/learn/python/learn/experiment_test.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/learn/python/learn/experiment.py b/tensorflow/contrib/learn/python/learn/experiment.py
index 541da90617..f8a3709ee5 100644
--- a/tensorflow/contrib/learn/python/learn/experiment.py
+++ b/tensorflow/contrib/learn/python/learn/experiment.py
@@ -505,7 +505,7 @@ class Experiment(object):
     eval_result = None
     last_warning_time = 0
     while (not predicate_fn or predicate_fn(
-        eval_result, checkpoint_path=previous_path if eval_result else None)):
+        eval_result, checkpoint_path=previous_path)):
       # Exit if we have already reached number of steps to train.
       if self._has_training_stopped(eval_result):
         logging.info("Exiting continuous eval, global_step=%s >= "
diff --git a/tensorflow/contrib/learn/python/learn/experiment_test.py b/tensorflow/contrib/learn/python/learn/experiment_test.py
index d10927a0cd..fb16c94c29 100644
--- a/tensorflow/contrib/learn/python/learn/experiment_test.py
+++ b/tensorflow/contrib/learn/python/learn/experiment_test.py
@@ -500,7 +500,7 @@ class ExperimentTest(test.TestCase):
       noop_hook = _NoopHook()
 
       def _predicate_fn(eval_result, checkpoint_path):
-        self.assertEqual(not eval_result,
+        self.assertEqual(eval_result is None,
                          checkpoint_path is None)
         return est.eval_count < 3  # pylint: disable=cell-var-from-loop
 
-- 
GitLab


From 7b9c723c8f5f732f014ba181daf0b96747f291a9 Mon Sep 17 00:00:00 2001
From: Asim Shankar <ashankar@google.com>
Date: Thu, 7 Jun 2018 18:19:32 -0700
Subject: [PATCH 466/610] Java: Release 1.9.0-rc0 (and update protobuf
 dependency)

PiperOrigin-RevId: 199729533
---
 tensorflow/java/maven/libtensorflow/pom.xml         | 2 +-
 tensorflow/java/maven/libtensorflow_jni/pom.xml     | 2 +-
 tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml | 2 +-
 tensorflow/java/maven/pom.xml                       | 2 +-
 tensorflow/java/maven/proto/pom.xml                 | 4 ++--
 tensorflow/java/maven/run_inside_container.sh       | 2 +-
 tensorflow/java/maven/tensorflow/pom.xml            | 2 +-
 7 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/tensorflow/java/maven/libtensorflow/pom.xml b/tensorflow/java/maven/libtensorflow/pom.xml
index 08cc860f57..38e87b1639 100644
--- a/tensorflow/java/maven/libtensorflow/pom.xml
+++ b/tensorflow/java/maven/libtensorflow/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.8.0</version>
+    <version>1.9.0-rc0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>libtensorflow</artifactId>
diff --git a/tensorflow/java/maven/libtensorflow_jni/pom.xml b/tensorflow/java/maven/libtensorflow_jni/pom.xml
index fcc7eacc33..36c984e280 100644
--- a/tensorflow/java/maven/libtensorflow_jni/pom.xml
+++ b/tensorflow/java/maven/libtensorflow_jni/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.8.0</version>
+    <version>1.9.0-rc0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>libtensorflow_jni</artifactId>
diff --git a/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml b/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml
index 3d22d86a49..4c846de05a 100644
--- a/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml
+++ b/tensorflow/java/maven/libtensorflow_jni_gpu/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.8.0</version>
+    <version>1.9.0-rc0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>libtensorflow_jni_gpu</artifactId>
diff --git a/tensorflow/java/maven/pom.xml b/tensorflow/java/maven/pom.xml
index 0a09a5ea7c..f2a0a97eae 100644
--- a/tensorflow/java/maven/pom.xml
+++ b/tensorflow/java/maven/pom.xml
@@ -6,7 +6,7 @@
   <modelVersion>4.0.0</modelVersion>
   <groupId>org.tensorflow</groupId>
   <artifactId>parentpom</artifactId>
-  <version>1.8.0</version>
+  <version>1.9.0-rc0</version>
   <packaging>pom</packaging>
 
   <url>https://www.tensorflow.org</url>
diff --git a/tensorflow/java/maven/proto/pom.xml b/tensorflow/java/maven/proto/pom.xml
index 77ec6a0ddb..eb0a952c7d 100644
--- a/tensorflow/java/maven/proto/pom.xml
+++ b/tensorflow/java/maven/proto/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.8.0</version>
+    <version>1.9.0-rc0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>proto</artifactId>
@@ -16,7 +16,7 @@
     <dependency>
       <groupId>com.google.protobuf</groupId>
       <artifactId>protobuf-java</artifactId>
-      <version>3.3.1</version>
+      <version>3.5.1</version>
     </dependency>
   </dependencies>
 
diff --git a/tensorflow/java/maven/run_inside_container.sh b/tensorflow/java/maven/run_inside_container.sh
index 6136ccfdfb..bf19c09b1d 100644
--- a/tensorflow/java/maven/run_inside_container.sh
+++ b/tensorflow/java/maven/run_inside_container.sh
@@ -31,7 +31,7 @@ if [[ "${TF_VERSION}" == *"-SNAPSHOT" ]]; then
   # Bintray does not allow snapshots.
   DEPLOY_BINTRAY="false"
 fi
-PROTOC_RELEASE_URL="https://github.com/google/protobuf/releases/download/v3.3.0/protoc-3.3.0-linux-x86_64.zip"
+PROTOC_RELEASE_URL="https://github.com/google/protobuf/releases/download/v3.5.1/protoc-3.5.1-linux-x86_64.zip"
 if [[ "${DEPLOY_BINTRAY}" != "true" && "${DEPLOY_OSSRH}" != "true" ]]; then
   echo "Must deploy to at least one of Bintray or OSSRH" >&2
   exit 2
diff --git a/tensorflow/java/maven/tensorflow/pom.xml b/tensorflow/java/maven/tensorflow/pom.xml
index 0df1f28149..48668a47f2 100644
--- a/tensorflow/java/maven/tensorflow/pom.xml
+++ b/tensorflow/java/maven/tensorflow/pom.xml
@@ -6,7 +6,7 @@
   <parent>
     <groupId>org.tensorflow</groupId>
     <artifactId>parentpom</artifactId>
-    <version>1.8.0</version>
+    <version>1.9.0-rc0</version>
     <relativePath>../</relativePath>
   </parent>
   <artifactId>tensorflow</artifactId>
-- 
GitLab


From 2f41346cbc0c8ecb915983a1f8711fd0d0ccc50e Mon Sep 17 00:00:00 2001
From: Vinu Rajashekhar <vinuraja@google.com>
Date: Thu, 7 Jun 2018 18:21:25 -0700
Subject: [PATCH 467/610] Changes the batch_function decorator implementation
 to use the newly added BatchFunction op.

o Renames the previous version to batch_function_v1.

PiperOrigin-RevId: 199729701
---
 tensorflow/contrib/batching/__init__.py       |  1 +
 .../contrib/batching/python/ops/batch_ops.py  | 69 +++++++++++++++++++
 .../batching/python/ops/batch_ops_test.py     | 50 ++++++++++++++
 3 files changed, 120 insertions(+)

diff --git a/tensorflow/contrib/batching/__init__.py b/tensorflow/contrib/batching/__init__.py
index 44fa5f42a7..1e503a097a 100644
--- a/tensorflow/contrib/batching/__init__.py
+++ b/tensorflow/contrib/batching/__init__.py
@@ -14,6 +14,7 @@
 # ==============================================================================
 """Ops and modules related to batch.
 
+@@batch_function_v1
 @@batch_function
 """
 from __future__ import absolute_import
diff --git a/tensorflow/contrib/batching/python/ops/batch_ops.py b/tensorflow/contrib/batching/python/ops/batch_ops.py
index 921d6917a4..012a51f711 100644
--- a/tensorflow/contrib/batching/python/ops/batch_ops.py
+++ b/tensorflow/contrib/batching/python/ops/batch_ops.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import gen_batch_ops
 # go/tf-wildcard-import
@@ -83,6 +84,74 @@ def batch_function(num_batch_threads,
   SparseTensor is not supported. The return value of the decorated function
   must be a Tensor or a list/tuple of Tensors.
 
+  Args:
+    num_batch_threads: Number of scheduling threads for processing batches
+     of work. Determines the number of batches processed in parallel.
+    max_batch_size: Batch sizes will never be bigger than this.
+    batch_timeout_micros: Maximum number of microseconds to wait before
+     outputting an incomplete batch.
+    allowed_batch_sizes: Optional list of allowed batch sizes. If left empty,
+     does nothing. Otherwise, supplies a list of batch sizes, causing the op
+     to pad batches up to one of those sizes. The entries must increase
+     monotonically, and the final entry must equal max_batch_size.
+    grad_timeout_micros: The timeout to use for the gradient. See the
+     documentation of the unbatch op for more details. Defaults to 60s.
+    unbatch_timeout_micros: The timeout to use for unbatching. See the
+     documentation of the unbatch op for more details. Defaults to 60s.
+    max_enqueued_batches: The maximum depth of the batch queue. Defaults to 10.
+
+  Returns:
+    The decorated function will return the unbatched computation output Tensors.
+  """
+
+  def decorator(fn):  # pylint: disable=missing-docstring
+
+    def decorated(*args):  # pylint: disable=missing-docstring
+      types = [arg.dtype for arg in args]
+
+      @function.Defun(*types)
+      def computation(*computation_args):
+        return fn(*computation_args)
+
+      with ops.name_scope("batch") as name:
+        for a in args:
+          if not isinstance(a, ops.Tensor):
+            raise ValueError("All arguments to functions decorated with "
+                             "`batch_function`  are supposed to be Tensors; "
+                             "found %s" % repr(a))
+        for inp in computation.captured_inputs:
+          print("inp: %s" % inp)
+          for op in inp.consumers():
+            print("op: %s" % op)
+        return gen_batch_ops.batch_function(
+            num_batch_threads=num_batch_threads,
+            max_batch_size=max_batch_size,
+            batch_timeout_micros=batch_timeout_micros,
+            allowed_batch_sizes=allowed_batch_sizes,
+            max_enqueued_batches=max_enqueued_batches,
+            shared_name=name,
+            f=computation,
+            in_tensors=list(args),
+            captured_tensors=computation.captured_inputs,
+            Tout=[o.type for o in computation.definition.signature.output_arg])
+
+    return decorated
+
+  return decorator
+
+
+def batch_function_v1(num_batch_threads,
+                      max_batch_size,
+                      batch_timeout_micros,
+                      allowed_batch_sizes=None,
+                      grad_timeout_micros=60 * 1000 * 1000,
+                      unbatch_timeout_micros=60 * 1000 * 1000,
+                      max_enqueued_batches=10):
+  """Batches the computation done by the decorated function.
+
+  This is the older version of batch_function(). Please use the former instead
+  of this.
+
   Args:
     num_batch_threads: Number of scheduling threads for processing batches
      of work. Determines the number of batches processed in parallel.
diff --git a/tensorflow/contrib/batching/python/ops/batch_ops_test.py b/tensorflow/contrib/batching/python/ops/batch_ops_test.py
index ea8339334f..7846814546 100644
--- a/tensorflow/contrib/batching/python/ops/batch_ops_test.py
+++ b/tensorflow/contrib/batching/python/ops/batch_ops_test.py
@@ -188,12 +188,62 @@ class BatchOpsTest(test.TestCase):
       self.assertEqual(thread_results[0], [2])
       self.assertEqual(main_results[0], [3])
 
+  def testBasicUnbatchV1Decorated(self):
+    """Tests that the batch_function_v1 decorator works."""
+    with self.test_session() as sess:
+      @batch_ops.batch_function_v1(1, 10, 100000)
+      def computation(in_t):
+        return in_t + 1
+
+      inp = array_ops.placeholder(dtype=dtypes.int32, shape=[1])
+      result = computation(inp)
+      thread_results = []
+
+      def worker():
+        thread_results.extend(sess.run([result], feed_dict={inp: [1]}))
+
+      worker_thread = threading.Thread(target=worker)
+      worker_thread.start()
+      main_results = sess.run([result], feed_dict={inp: [2]})
+      worker_thread.join()
+      self.assertEqual(thread_results[0], [2])
+      self.assertEqual(main_results[0], [3])
+
   def testBasicUnbatchDecorated(self):
     """Tests that the batch_function decorator works."""
     with self.test_session() as sess:
+      # TODO(apassos): Removing this line causes test flakiness! Ideally should
+      # be investigated.
+      default_inp = array_ops.placeholder_with_default(2, shape=[])  # pylint: disable=unused-variable
+
       @batch_ops.batch_function(1, 10, 100000)
       def computation(in_t):
         return in_t + 1
+
+      inp = array_ops.placeholder(dtype=dtypes.int32, shape=[1])
+      result = computation(inp)
+      thread_results = []
+
+      def worker():
+        thread_results.extend(sess.run([result], feed_dict={inp: [1]}))
+
+      worker_thread = threading.Thread(target=worker)
+      worker_thread.start()
+      main_results = sess.run([result], feed_dict={inp: [2]})
+      worker_thread.join()
+      self.assertEqual(thread_results[0], [2])
+      self.assertEqual(main_results[0], [3])
+
+  def testBatchDecoratedWithCapturedInput(self):
+    """Tests that the batch_function decorator works."""
+    with self.test_session() as sess:
+      captured_inp0 = array_ops.placeholder_with_default(2, shape=[])
+      captured_inp1 = array_ops.placeholder_with_default(1, shape=[])
+
+      @batch_ops.batch_function(1, 10, 100000)
+      def computation(in_t):
+        return in_t + captured_inp0 - captured_inp1
+
       inp = array_ops.placeholder(dtype=dtypes.int32, shape=[1])
       result = computation(inp)
       thread_results = []
-- 
GitLab


From a9ddfe50eee83b2f18293241ab96f0a1e2b4b05b Mon Sep 17 00:00:00 2001
From: Yunxing Dai <yunxing@google.com>
Date: Thu, 7 Jun 2018 18:42:30 -0700
Subject: [PATCH 468/610] [DataFlowAnalysis] Be less conservative on loop
 fusion nodes when reusing buffer.

- Previously, we said we could not reuse the operand buffer for a loop fusion node if any of the fusion's inputs was a broadcast or reshape. That is too conservative, since in theory we can still reuse the operand's buffer if all the users of that particular operand are elementwise. This CL implements that.

- Also fixed a bug in the previous code where a dynamic update fusion node that ends with a convert (added for bf16) was not caught correctly by the if condition.

PiperOrigin-RevId: 199731488
---
 .../xla/service/hlo_dataflow_analysis.cc      |  31 +++--
 .../xla/service/hlo_dataflow_analysis_test.cc | 123 ++++++++++++++++++
 .../compiler/xla/service/hlo_instruction.cc   |  19 ++-
 .../xla/service/hlo_instruction_test.cc       |  17 +++
 tensorflow/compiler/xla/service/hlo_parser.cc |   3 +
 .../compiler/xla/service/hlo_parser_test.cc   |   2 +-
 6 files changed, 181 insertions(+), 14 deletions(-)

diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc
index cc130a4900..d020005868 100644
--- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc
+++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc
@@ -931,16 +931,17 @@ bool HloDataflowAnalysis::CanShareOperandBufferWithUser(
     }
     const HloUse& use = value.uses()[0];
 
-    if (user->fusion_kind() == HloInstruction::FusionKind::kLoop &&
-        user->fused_expression_root()->opcode() ==
-            HloOpcode::kDynamicUpdateSlice) {
-      // Loop fusion with kDynamicUpdateSlice fused root.
-      //
-      // Returns true iff there is exactly one use of 'operand' at shape index
-      // 'operand_index', and this singleton use is the fused root at operand
-      // index 0.
-      return use.instruction == user->fused_expression_root() &&
-             use.operand_number == 0;
+    if (user->fusion_kind() == HloInstruction::FusionKind::kLoop) {
+      if (user->fused_expression_root()->opcode() ==
+          HloOpcode::kDynamicUpdateSlice) {
+        // Loop fusion with kDynamicUpdateSlice fused root.
+        //
+        // Returns true iff there is exactly one use of 'operand' at shape index
+        // 'operand_index', and this singleton use is the fused root at operand
+        // index 0.
+        return use.instruction == user->fused_expression_root() &&
+               use.operand_number == 0;
+      }
     } else if (user->fusion_kind() == HloInstruction::FusionKind::kOutput &&
                user->fused_expression_root()->opcode() == HloOpcode::kAdd) {
       // Output fusion with kAdd fused root.
@@ -967,6 +968,7 @@ bool HloDataflowAnalysis::CanShareOperandBufferWithUser(
              use.operand_number == other_add_operand_index;
     }
   }
+
   if (user->opcode() == HloOpcode::kDynamicUpdateSlice ||
       user->opcode() == HloOpcode::kWhile) {
     // We eliminated other users in BufferLiveness::live_range_strictly_before,
@@ -998,8 +1000,13 @@ bool HloDataflowAnalysis::CanShareOperandBufferWithUser(
             }) != uses.end();
     return uses.size() == 2 && found_caller_use && found_elementwise_callee_use;
   }
-  // Check if 'user' is element-wise.
-  return user->IsElementwise();
+
+  // Loop fusions that contain transposing copies won't reach here as they have
+  // different layouts, which fails the check in the beginning of this function.
+  //
+  // Multi-output fusion will fail the check here as tuples are not considered
+  // an elementwise operation.
+  return user->IsElementwiseOnOperand(user->operand_index(operand));
 }
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc
index 5798326dcb..db1822ec47 100644
--- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc
@@ -1974,6 +1974,89 @@ TEST_F(CanShareOperandBufferWithUserTest, ElementWiseSameShape) {
       dataflow_analysis_->CanShareOperandBufferWithUser(exp, {}, log, {}));
 }
 
+TEST_F(CanShareOperandBufferWithUserTest,
+       NonElementwiseLoopFusionCantAliasOperandBuffer) {
+  auto builder = HloComputation::Builder(TestName());
+  Shape data_shape = ShapeUtil::MakeShape(F32, {2, 2});
+
+  auto param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, data_shape, "param0"));
+
+  auto neg = builder.AddInstruction(
+      HloInstruction::CreateUnary(data_shape, HloOpcode::kNegate, param0));
+
+  auto reverse = builder.AddInstruction(
+      HloInstruction::CreateReverse(data_shape, neg, {0, 1}));
+
+  BuildModule(builder.Build());
+  auto fusion = computation_->CreateFusionInstruction(
+      {reverse, neg}, HloInstruction::FusionKind::kLoop);
+  RunAnalysis();
+
+  EXPECT_FALSE(dataflow_analysis_->CanShareOperandBufferWithUser(param0, {},
+                                                                 fusion, {}));
+}
+
+TEST_F(CanShareOperandBufferWithUserTest,
+       MultiOutputFusionCantAliasOperandBuffer) {
+  auto builder = HloComputation::Builder(TestName());
+  Shape data_shape = ShapeUtil::MakeShape(F32, {2, 2});
+
+  Shape in_shape = ShapeUtil::MakeShape(F32, {8});
+  Shape out_shape = ShapeUtil::MakeShape(PRED, {8});
+  auto param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, in_shape, "param0"));
+  auto param1 = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, in_shape, "param1"));
+
+  auto copy0 = builder.AddInstruction(
+      HloInstruction::CreateUnary(in_shape, HloOpcode::kCopy, param0));
+  auto copy1 = builder.AddInstruction(
+      HloInstruction::CreateUnary(in_shape, HloOpcode::kCopy, param1));
+
+  auto tuple =
+      builder.AddInstruction(HloInstruction::CreateTuple({copy1, copy0}));
+
+  BuildModule(builder.Build());
+  auto fusion = computation_->CreateFusionInstruction(
+      {tuple, copy1, copy0}, HloInstruction::FusionKind::kLoop);
+  RunAnalysis();
+
+  EXPECT_FALSE(dataflow_analysis_->CanShareOperandBufferWithUser(param0, {},
+                                                                 fusion, {0}));
+  EXPECT_FALSE(dataflow_analysis_->CanShareOperandBufferWithUser(param0, {},
+                                                                 fusion, {1}));
+  EXPECT_FALSE(dataflow_analysis_->CanShareOperandBufferWithUser(param1, {},
+                                                                 fusion, {0}));
+  EXPECT_FALSE(dataflow_analysis_->CanShareOperandBufferWithUser(param1, {},
+                                                                 fusion, {1}));
+}
+
+TEST_F(CanShareOperandBufferWithUserTest,
+       ElementwiseLoopFusionCantAliasOperandBuffer) {
+  auto builder = HloComputation::Builder(TestName());
+  Shape data_shape = ShapeUtil::MakeShape(F32, {2, 2});
+
+  auto one = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(1.0)));
+  auto operand = builder.AddInstruction(
+      HloInstruction::CreateBroadcast(data_shape, one, {1}));
+
+  auto neg = builder.AddInstruction(
+      HloInstruction::CreateUnary(data_shape, HloOpcode::kNegate, operand));
+
+  auto exp = builder.AddInstruction(
+      HloInstruction::CreateUnary(data_shape, HloOpcode::kExp, neg));
+
+  BuildModule(builder.Build());
+  auto fusion = computation_->CreateFusionInstruction(
+      {exp, neg}, HloInstruction::FusionKind::kLoop);
+  RunAnalysis();
+
+  EXPECT_TRUE(dataflow_analysis_->CanShareOperandBufferWithUser(operand, {},
+                                                                fusion, {}));
+}
+
 TEST_F(CanShareOperandBufferWithUserTest, ElementWiseDifferentShape) {
   auto builder = HloComputation::Builder(TestName());
 
@@ -2048,6 +2131,46 @@ TEST_F(CanShareOperandBufferWithUserTest, FusedDynamicUpdateSlice) {
                                                                 fusion, {}));
 }
 
+TEST_F(CanShareOperandBufferWithUserTest,
+       FusedDynamicUpdateSliceWithConvertCantShare) {
+  auto builder = HloComputation::Builder(TestName());
+
+  Shape data_shape = ShapeUtil::MakeShape(F32, {8});
+  Shape data_shape_bf16 = ShapeUtil::MakeShape(BF16, {8});
+  auto tuple = builder.AddInstruction(HloInstruction::CreateParameter(
+      0, ShapeUtil::MakeTupleShape({data_shape, data_shape}), "tuple"));
+  auto gte0 = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(data_shape, tuple, 0));
+  auto gte1 = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(data_shape, tuple, 1));
+
+  auto convert1 = builder.AddInstruction(
+      HloInstruction::CreateConvert(data_shape_bf16, gte1));
+
+  // Create a DynamicUpdateSlice instruction of tuple element 1.
+  auto starts = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR1<int32>({2})));
+  auto update = builder.AddInstruction(HloInstruction::CreateConstant(
+      Literal::CreateR1<float>({2.f, 2.f, 2.f})));
+  auto dynamic_update_slice =
+      builder.AddInstruction(HloInstruction::CreateDynamicUpdateSlice(
+          data_shape_bf16, convert1, update, starts));
+
+  auto convert2 = builder.AddInstruction(
+      HloInstruction::CreateConvert(data_shape, dynamic_update_slice));
+  builder.AddInstruction(HloInstruction::CreateTuple({gte0, convert2}));
+
+  BuildModule(builder.Build());
+  auto fusion = computation_->CreateFusionInstruction(
+      {convert2, dynamic_update_slice, starts, update, convert1},
+      HloInstruction::FusionKind::kLoop);
+  RunAnalysis();
+
+  // The fusion instruction can't share with tuple element 1.
+  EXPECT_FALSE(
+      dataflow_analysis_->CanShareOperandBufferWithUser(gte1, {}, fusion, {}));
+}
+
 TEST_F(CanShareOperandBufferWithUserTest, DynamicUpdateSliceCanShare) {
   auto builder = HloComputation::Builder(TestName());
 
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc
index cf1530abe1..570ad5459a 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction.cc
@@ -398,6 +398,11 @@ HloInstruction::CreateGetTupleElement(const Shape& shape,
     instruction->AppendOperand(operand);
   }
   instruction->called_computations_.push_back(map_computation);
+  // TODO(b/65689298) Remove code below once Map is generalized to accept
+  // arbitrary map dimensions.
+  instruction->dimensions_.resize(ShapeUtil::Rank(shape));
+  std::iota(instruction->dimensions_.begin(), instruction->dimensions_.end(),
+            0);
   return instruction;
 }
 
@@ -1603,7 +1608,7 @@ bool HloInstruction::HasLiteral() const { return literal_ != nullptr; }
 
 bool HloInstruction::CanHaveDimensionsField() const {
   return (opcode() == HloOpcode::kReverse ||
-          opcode() == HloOpcode::kConcatenate ||
+          opcode() == HloOpcode::kConcatenate || opcode() == HloOpcode::kMap ||
           opcode() == HloOpcode::kReduce || opcode() == HloOpcode::kBroadcast ||
           opcode() == HloOpcode::kTranspose);
 }
@@ -3151,7 +3156,19 @@ bool HloInstruction::IsElementwise() const {
 
     // Other operations.
     case HloOpcode::kRng:
+      return true;
     case HloOpcode::kMap:
+      if (!dimensions().empty()) {
+        // Check that the map is executed in elementwise compatible dimensions.
+        if (dimensions().size() != operand(0)->shape().dimensions_size()) {
+          return false;
+        }
+        for (int i = 0; i < dimensions().size(); ++i) {
+          if (dimensions()[i] != i) {
+            return false;
+          }
+        }
+      }
       return true;
     case HloOpcode::kFusion:
       if (fusion_kind() != FusionKind::kLoop) {
diff --git a/tensorflow/compiler/xla/service/hlo_instruction_test.cc b/tensorflow/compiler/xla/service/hlo_instruction_test.cc
index 313033ddad..76349c4099 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction_test.cc
@@ -980,6 +980,23 @@ TEST_F(HloInstructionTest, FullyElementwise) {
   }
 }
 
+TEST_F(HloInstructionTest, MapIsElementwise) {
+  auto module = CreateNewModule();
+  const Shape r2f32 = ShapeUtil::MakeShapeWithLayout(F32, {10, 10}, {1, 0});
+  HloComputation::Builder builder(TestName());
+  HloComputation::Builder map_builder("id");
+  map_builder.AddInstruction(
+      HloInstruction::CreateParameter(0, ShapeUtil::MakeShape(F32, {}), "p0"));
+  auto map_computation = module->AddEmbeddedComputation(map_builder.Build());
+  auto x =
+      builder.AddInstruction(HloInstruction::CreateParameter(0, r2f32, "x"));
+  auto map = builder.AddInstruction(
+      HloInstruction::CreateMap(r2f32, {x}, map_computation));
+  module->AddEntryComputation(builder.Build());
+
+  EXPECT_TRUE(map->IsElementwise());
+}
+
 TEST_F(HloInstructionTest, PartiallyElementwise) {
   const Shape r1f32 = ShapeUtil::MakeShape(F32, {5});
   const Shape r2f32 = ShapeUtil::MakeShape(F32, {3, 5});
diff --git a/tensorflow/compiler/xla/service/hlo_parser.cc b/tensorflow/compiler/xla/service/hlo_parser.cc
index 3eadedfe1f..a1bc269400 100644
--- a/tensorflow/compiler/xla/service/hlo_parser.cc
+++ b/tensorflow/compiler/xla/service/hlo_parser.cc
@@ -777,6 +777,9 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
       optional<HloComputation*> to_apply;
       attrs["to_apply"] = {/*required=*/true, AttrTy::kHloComputation,
                            &to_apply};
+      optional<std::vector<tensorflow::int64>> dimensions;
+      attrs["dimensions"] = {/*required=*/false, AttrTy::kBracedInt64List,
+                             &dimensions};
       if (!ParseOperands(&operands) || !ParseAttributes(attrs)) {
         return false;
       }
diff --git a/tensorflow/compiler/xla/service/hlo_parser_test.cc b/tensorflow/compiler/xla/service/hlo_parser_test.cc
index 08068dc504..1c5a47c875 100644
--- a/tensorflow/compiler/xla/service/hlo_parser_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_parser_test.cc
@@ -765,7 +765,7 @@ add_F32.v3 {
 ENTRY MapBinaryAdder.v3 {
   param0 = f32[4]{0} parameter(0)
   param1 = f32[4]{0} parameter(1)
-  ROOT map = f32[4]{0} map(param0, param1), to_apply=add_F32.v3
+  ROOT map = f32[4]{0} map(param0, param1), dimensions={0}, to_apply=add_F32.v3
 }
 
 )"
-- 
GitLab


From 99e6a86480bfb518dea59b4b25f7c9549b227587 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 7 Jun 2018 19:31:38 -0700
Subject: [PATCH 469/610] Implement Log operator.

PiperOrigin-RevId: 199735191
---
 tensorflow/contrib/lite/build_def.bzl         |  1 +
 tensorflow/contrib/lite/builtin_ops.h         |  1 +
 .../lite/g3doc/tf_ops_compatibility.md        | 11 ++++
 .../contrib/lite/kernels/elementwise.cc       | 23 ++++++--
 .../contrib/lite/kernels/elementwise_test.cc  | 18 +++++--
 tensorflow/contrib/lite/kernels/register.cc   |  2 +
 tensorflow/contrib/lite/model.cc              |  1 +
 tensorflow/contrib/lite/nnapi_delegate.cc     |  1 +
 tensorflow/contrib/lite/schema/schema.fbs     |  1 +
 .../contrib/lite/schema/schema_generated.h    |  9 ++--
 .../contrib/lite/testing/generate_examples.py | 54 ++++++++++++-------
 .../contrib/lite/toco/import_tensorflow.cc    |  2 +
 .../contrib/lite/toco/tflite/operator.cc      | 10 ++--
 .../contrib/lite/toco/tflite/operator_test.cc |  1 +
 14 files changed, 100 insertions(+), 35 deletions(-)

diff --git a/tensorflow/contrib/lite/build_def.bzl b/tensorflow/contrib/lite/build_def.bzl
index 13d9a463fb..30bb604d17 100644
--- a/tensorflow/contrib/lite/build_def.bzl
+++ b/tensorflow/contrib/lite/build_def.bzl
@@ -220,6 +220,7 @@ def generated_test_models():
         "less_equal",
         "local_response_norm",
         "log_softmax",
+        "log",
         "lstm",
         "max_pool",
         "maximum",
diff --git a/tensorflow/contrib/lite/builtin_ops.h b/tensorflow/contrib/lite/builtin_ops.h
index 7b10b69f43..f3b2ac77fb 100644
--- a/tensorflow/contrib/lite/builtin_ops.h
+++ b/tensorflow/contrib/lite/builtin_ops.h
@@ -98,6 +98,7 @@ typedef enum {
   kTfLiteBuiltinExpandDims = 70,
   kTfLiteBuiltinEqual = 71,
   kTfLiteBuiltinNotEqual = 72,
+  kTfLiteBuiltinLog = 73,
 } TfLiteBuiltinOperator;
 
 #ifdef __cplusplus
diff --git a/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md b/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md
index 19145281fa..bb2e615eac 100644
--- a/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md
+++ b/tensorflow/contrib/lite/g3doc/tf_ops_compatibility.md
@@ -417,6 +417,17 @@ Outputs {
 }
 ```
 
+**LOG**
+
+```
+Inputs {
+  0: a tensor
+}
+Outputs {
+  0: a tensor equivalent to log(input)
+}
+```
+
 **LOG_SOFTMAX**
 
 ```
diff --git a/tensorflow/contrib/lite/kernels/elementwise.cc b/tensorflow/contrib/lite/kernels/elementwise.cc
index 0bd5046950..98c21ce9d3 100644
--- a/tensorflow/contrib/lite/kernels/elementwise.cc
+++ b/tensorflow/contrib/lite/kernels/elementwise.cc
@@ -23,7 +23,7 @@ namespace ops {
 namespace builtin {
 namespace elementwise {
 
-TfLiteStatus SinPrepare(TfLiteContext* context, TfLiteNode* node) {
+TfLiteStatus GenericPrepare(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
   const TfLiteTensor* input = GetInput(context, node, 0);
@@ -35,7 +35,8 @@ TfLiteStatus SinPrepare(TfLiteContext* context, TfLiteNode* node) {
                                TfLiteIntArrayCopy(input->dims));
 }
 
-TfLiteStatus SinEval(TfLiteContext* context, TfLiteNode* node) {
+inline TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node,
+                         float float_func(float)) {
   const TfLiteTensor* input = GetInput(context, node, 0);
   TfLiteTensor* output = GetOutput(context, node, 0);
   switch (input->type) {
@@ -44,7 +45,7 @@ TfLiteStatus SinEval(TfLiteContext* context, TfLiteNode* node) {
       const float* in = GetTensorData<float>(input);
       const float* in_end = in + elements;
       float* out = output->data.f;
-      for (; in < in_end; in++, out++) *out = std::sin(*in);
+      for (; in < in_end; in++, out++) *out = float_func(*in);
       return kTfLiteOk;
     }
     default: {
@@ -55,14 +56,28 @@ TfLiteStatus SinEval(TfLiteContext* context, TfLiteNode* node) {
   }
 }
 
+TfLiteStatus SinEval(TfLiteContext* context, TfLiteNode* node) {
+  return Eval(context, node, std::sin);
+}
+
+TfLiteStatus LogEval(TfLiteContext* context, TfLiteNode* node) {
+  return Eval(context, node, std::log);
+}
+
 }  // namespace elementwise
 
 TfLiteRegistration* Register_SIN() {
-  static TfLiteRegistration r = {nullptr, nullptr, elementwise::SinPrepare,
+  static TfLiteRegistration r = {nullptr, nullptr, elementwise::GenericPrepare,
                                  elementwise::SinEval};
   return &r;
 }
 
+TfLiteRegistration* Register_LOG() {
+  static TfLiteRegistration r = {nullptr, nullptr, elementwise::GenericPrepare,
+                                 elementwise::LogEval};
+  return &r;
+}
+
 }  // namespace builtin
 }  // namespace ops
 }  // namespace tflite
diff --git a/tensorflow/contrib/lite/kernels/elementwise_test.cc b/tensorflow/contrib/lite/kernels/elementwise_test.cc
index 412ffb04b9..10e88d5a31 100644
--- a/tensorflow/contrib/lite/kernels/elementwise_test.cc
+++ b/tensorflow/contrib/lite/kernels/elementwise_test.cc
@@ -24,12 +24,13 @@ namespace {
 
 using ::testing::ElementsAreArray;
 
-class SinOpModel : public SingleOpModel {
+class ElementWiseOpModel : public SingleOpModel {
  public:
-  SinOpModel(std::initializer_list<int> input_shape) {
+  ElementWiseOpModel(BuiltinOperator op,
+                     std::initializer_list<int> input_shape) {
     input_ = AddInput(TensorType_FLOAT32);
     output_ = AddOutput(TensorType_FLOAT32);
-    SetBuiltinOp(BuiltinOperator_SIN, BuiltinOptions_NONE, 0);
+    SetBuiltinOp(op, BuiltinOptions_NONE, 0);
     BuildInterpreter({input_shape});
   }
 
@@ -42,7 +43,7 @@ class SinOpModel : public SingleOpModel {
 };
 
 TEST(ElementWise, Sin) {
-  SinOpModel m({1, 1, 4, 1});
+  ElementWiseOpModel m(BuiltinOperator_SIN, {1, 1, 4, 1});
   m.PopulateTensor<float>(m.input(), {0, 3.1415926, -3.1415926, 1});
   m.Invoke();
   EXPECT_THAT(m.ExtractVector<float>(m.output()),
@@ -50,6 +51,15 @@ TEST(ElementWise, Sin) {
   EXPECT_THAT(m.GetTensorShape(m.output()), ElementsAreArray({1, 1, 4, 1}));
 }
 
+TEST(ElementWise, Log) {
+  ElementWiseOpModel m(BuiltinOperator_LOG, {1, 1, 4, 1});
+  m.PopulateTensor<float>(m.input(), {1, 3.1415926, 1, 1});
+  m.Invoke();
+  EXPECT_THAT(m.ExtractVector<float>(m.output()),
+              ElementsAreArray(ArrayFloatNear({0, 1.14473, 0, 0})));
+  EXPECT_THAT(m.GetTensorShape(m.output()), ElementsAreArray({1, 1, 4, 1}));
+}
+
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/contrib/lite/kernels/register.cc b/tensorflow/contrib/lite/kernels/register.cc
index 6c68bb2f31..7bb28d4de7 100644
--- a/tensorflow/contrib/lite/kernels/register.cc
+++ b/tensorflow/contrib/lite/kernels/register.cc
@@ -73,6 +73,7 @@ TfLiteRegistration* Register_SQUEEZE();
 TfLiteRegistration* Register_STRIDED_SLICE();
 TfLiteRegistration* Register_EXP();
 TfLiteRegistration* Register_TOPK_V2();
+TfLiteRegistration* Register_LOG();
 TfLiteRegistration* Register_LOG_SOFTMAX();
 TfLiteRegistration* Register_CAST();
 TfLiteRegistration* Register_DEQUANTIZE();
@@ -150,6 +151,7 @@ BuiltinOpResolver::BuiltinOpResolver() {
   AddBuiltin(BuiltinOperator_STRIDED_SLICE, Register_STRIDED_SLICE());
   AddBuiltin(BuiltinOperator_EXP, Register_EXP());
   AddBuiltin(BuiltinOperator_TOPK_V2, Register_TOPK_V2());
+  AddBuiltin(BuiltinOperator_LOG, Register_LOG());
   AddBuiltin(BuiltinOperator_LOG_SOFTMAX, Register_LOG_SOFTMAX());
   AddBuiltin(BuiltinOperator_CAST, Register_CAST());
   AddBuiltin(BuiltinOperator_DEQUANTIZE, Register_DEQUANTIZE());
diff --git a/tensorflow/contrib/lite/model.cc b/tensorflow/contrib/lite/model.cc
index d78b6eae90..4fb1ada9fd 100644
--- a/tensorflow/contrib/lite/model.cc
+++ b/tensorflow/contrib/lite/model.cc
@@ -357,6 +357,7 @@ TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type,
     case BuiltinOperator_FLOOR:
     case BuiltinOperator_NEG:
     case BuiltinOperator_SIN:
+    case BuiltinOperator_LOG:
       break;
     case BuiltinOperator_CAST: {
       TfLiteCastParams* params = MallocPOD<TfLiteCastParams>();
diff --git a/tensorflow/contrib/lite/nnapi_delegate.cc b/tensorflow/contrib/lite/nnapi_delegate.cc
index 605ce7d6fc..99cb40e967 100644
--- a/tensorflow/contrib/lite/nnapi_delegate.cc
+++ b/tensorflow/contrib/lite/nnapi_delegate.cc
@@ -490,6 +490,7 @@ void AddOpsAndParams(tflite::Interpreter* interpreter,
       case tflite::BuiltinOperator_SELECT:
       case tflite::BuiltinOperator_SLICE:
       case tflite::BuiltinOperator_SIN:
+      case tflite::BuiltinOperator_LOG:
       case tflite::BuiltinOperator_TRANSPOSE_CONV:
       case tflite::BuiltinOperator_TILE:
       case tflite::BuiltinOperator_EXPAND_DIMS:
diff --git a/tensorflow/contrib/lite/schema/schema.fbs b/tensorflow/contrib/lite/schema/schema.fbs
index d12a96df1c..ee5208df14 100644
--- a/tensorflow/contrib/lite/schema/schema.fbs
+++ b/tensorflow/contrib/lite/schema/schema.fbs
@@ -150,6 +150,7 @@ enum BuiltinOperator : byte {
   EXPAND_DIMS = 70,
   EQUAL = 71,
   NOT_EQUAL = 72,
+  LOG = 73,
 }
 
 // Options for the builtin operators.
diff --git a/tensorflow/contrib/lite/schema/schema_generated.h b/tensorflow/contrib/lite/schema/schema_generated.h
index 8ddd2f1438..887e47ed1e 100755
--- a/tensorflow/contrib/lite/schema/schema_generated.h
+++ b/tensorflow/contrib/lite/schema/schema_generated.h
@@ -325,11 +325,12 @@ enum BuiltinOperator {
   BuiltinOperator_EXPAND_DIMS = 70,
   BuiltinOperator_EQUAL = 71,
   BuiltinOperator_NOT_EQUAL = 72,
+  BuiltinOperator_LOG = 73,
   BuiltinOperator_MIN = BuiltinOperator_ADD,
-  BuiltinOperator_MAX = BuiltinOperator_NOT_EQUAL
+  BuiltinOperator_MAX = BuiltinOperator_LOG
 };
 
-inline BuiltinOperator (&EnumValuesBuiltinOperator())[72] {
+inline BuiltinOperator (&EnumValuesBuiltinOperator())[73] {
   static BuiltinOperator values[] = {
     BuiltinOperator_ADD,
     BuiltinOperator_AVERAGE_POOL_2D,
@@ -402,7 +403,8 @@ inline BuiltinOperator (&EnumValuesBuiltinOperator())[72] {
     BuiltinOperator_TILE,
     BuiltinOperator_EXPAND_DIMS,
     BuiltinOperator_EQUAL,
-    BuiltinOperator_NOT_EQUAL
+    BuiltinOperator_NOT_EQUAL,
+    BuiltinOperator_LOG
   };
   return values;
 }
@@ -482,6 +484,7 @@ inline const char **EnumNamesBuiltinOperator() {
     "EXPAND_DIMS",
     "EQUAL",
     "NOT_EQUAL",
+    "LOG",
     nullptr
   };
   return names;
diff --git a/tensorflow/contrib/lite/testing/generate_examples.py b/tensorflow/contrib/lite/testing/generate_examples.py
index 723b6ae057..f5e25784fa 100644
--- a/tensorflow/contrib/lite/testing/generate_examples.py
+++ b/tensorflow/contrib/lite/testing/generate_examples.py
@@ -2420,30 +2420,44 @@ def make_neg_tests(zip_path):
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
 
-def make_sin_tests(zip_path):
-  """Make a set of tests to do sin."""
+def _make_elementwise_tests(op):
+  """Make a set of tests to do element-wise operations."""
 
-  test_parameters = [{
-      "input_dtype": [tf.float32],
-      "input_shape": [[1], [1, 2], [5, 6, 7, 8], [3, 4, 5, 6]],
-  }]
+  def f(zip_path):
+    """Actual function that generates examples."""
+    test_parameters = [{
+        "input_dtype": [tf.float32],
+        "input_shape": [[1], [1, 2], [5, 6, 7, 8], [3, 4, 5, 6]],
+    }]
 
-  def build_graph(parameters):
-    """Build the sin op testing graph."""
-    input_value = tf.placeholder(
-        dtype=parameters["input_dtype"],
-        name="input1",
-        shape=parameters["input_shape"])
-    out = tf.sin(input_value)
-    return [input_value], [out]
+    def build_graph(parameters):
+      """Build the element-wise op testing graph."""
+      input_value = tf.placeholder(
+          dtype=parameters["input_dtype"],
+          name="input1",
+          shape=parameters["input_shape"])
+      out = op(input_value)
+      return [input_value], [out]
 
-  def build_inputs(parameters, sess, inputs, outputs):
-    input_value = create_tensor_data(parameters["input_dtype"],
-                                     parameters["input_shape"])
-    return [input_value], sess.run(
-        outputs, feed_dict={inputs[0]: input_value})
+    def build_inputs(parameters, sess, inputs, outputs):
+      input_value = create_tensor_data(parameters["input_dtype"],
+                                       parameters["input_shape"])
+      return [input_value], sess.run(
+          outputs, feed_dict={inputs[0]: input_value})
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+    make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+  return f
+
+
+def make_sin_tests(zip_path):
+  """Make a set of tests to do sin."""
+  return _make_elementwise_tests(tf.sin)(zip_path)
+
+
+def make_log_tests(zip_path):
+  """Make a set of tests to do log."""
+  return _make_elementwise_tests(tf.log)(zip_path)
 
 
 def make_where_tests(zip_path):
diff --git a/tensorflow/contrib/lite/toco/import_tensorflow.cc b/tensorflow/contrib/lite/toco/import_tensorflow.cc
index 5cc999314c..8dd43dda3e 100644
--- a/tensorflow/contrib/lite/toco/import_tensorflow.cc
+++ b/tensorflow/contrib/lite/toco/import_tensorflow.cc
@@ -1941,6 +1941,8 @@ Status ImportTensorFlowNode(const tensorflow::NodeDef& node,
     ConvertRandomUniform(node, tf_import_flags, model);
   } else if (node.op() == "Sin") {
     ConvertSimpleOperator<SinOperator, 1>(node, tf_import_flags, model);
+  } else if (node.op() == "Log") {
+    ConvertSimpleOperator<LogOperator, 1>(node, tf_import_flags, model);
   } else if (node.op() == "Select") {
     ConvertSimpleOperator<SelectOperator, 3>(node, tf_import_flags, model);
   } else if (node.op() == "SparseToDense") {
diff --git a/tensorflow/contrib/lite/toco/tflite/operator.cc b/tensorflow/contrib/lite/toco/tflite/operator.cc
index 8bfd76db6e..7490ab960b 100644
--- a/tensorflow/contrib/lite/toco/tflite/operator.cc
+++ b/tensorflow/contrib/lite/toco/tflite/operator.cc
@@ -1112,16 +1112,18 @@ std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList() {
       "LESS", OperatorType::kTensorFlowLess));
   ops.emplace_back(new SimpleOperator<TensorFlowLessEqualOperator>(
       "LESS_EQUAL", OperatorType::kTensorFlowLessEqual));
+  ops.emplace_back(new SimpleOperator<TensorFlowEqualOperator>(
+      "EQUAL", OperatorType::kTensorFlowEqual));
+  ops.emplace_back(new SimpleOperator<TensorFlowNotEqualOperator>(
+      "NOT_EQUAL", OperatorType::kTensorFlowNotEqual));
   ops.emplace_back(new SimpleOperator<NegOperator>("NEG", OperatorType::kNeg));
   ops.emplace_back(
       new SimpleOperator<SelectOperator>("SELECT", OperatorType::kSelect));
   ops.emplace_back(
       new SimpleOperator<SliceOperator>("SLICE", OperatorType::kSlice));
+  // Element-wise operator
   ops.emplace_back(new SimpleOperator<SinOperator>("SIN", OperatorType::kSin));
-  ops.emplace_back(new SimpleOperator<TensorFlowEqualOperator>(
-      "EQUAL", OperatorType::kTensorFlowEqual));
-  ops.emplace_back(new SimpleOperator<TensorFlowNotEqualOperator>(
-      "NOT_EQUAL", OperatorType::kTensorFlowNotEqual));
+  ops.emplace_back(new SimpleOperator<LogOperator>("LOG", OperatorType::kLog));
 
   return ops;
 }
diff --git a/tensorflow/contrib/lite/toco/tflite/operator_test.cc b/tensorflow/contrib/lite/toco/tflite/operator_test.cc
index 06bbe53516..e3144ad63e 100644
--- a/tensorflow/contrib/lite/toco/tflite/operator_test.cc
+++ b/tensorflow/contrib/lite/toco/tflite/operator_test.cc
@@ -123,6 +123,7 @@ TEST_F(OperatorTest, SimpleOperators) {
                                                OperatorType::kTensorFlowEqual);
   CheckSimpleOperator<TensorFlowNotEqualOperator>(
       "NOT_EQUAL", OperatorType::kTensorFlowNotEqual);
+  CheckSimpleOperator<LogOperator>("LOG", OperatorType::kLog);
 }
 
 TEST_F(OperatorTest, BuiltinAdd) {
-- 
GitLab


From a58cdd23d5bd5909b14bddade7ddbf9b6573fc69 Mon Sep 17 00:00:00 2001
From: James Qin <jamesqin@google.com>
Date: Thu, 7 Jun 2018 19:55:07 -0700
Subject: [PATCH 470/610] Replace add_variable() with add_weight() in official
 keras layers.

Make it easier for analysis and code search.

PiperOrigin-RevId: 199736646
---
 .../python/keras/layers/convolutional.py      | 83 ++++++++++---------
 tensorflow/python/keras/layers/core.py        | 30 +++----
 .../python/keras/layers/normalization.py      |  6 +-
 3 files changed, 63 insertions(+), 56 deletions(-)

diff --git a/tensorflow/python/keras/layers/convolutional.py b/tensorflow/python/keras/layers/convolutional.py
index ce1c84e98d..9ea341139e 100644
--- a/tensorflow/python/keras/layers/convolutional.py
+++ b/tensorflow/python/keras/layers/convolutional.py
@@ -151,21 +151,23 @@ class Conv(Layer):
     input_dim = int(input_shape[channel_axis])
     kernel_shape = self.kernel_size + (input_dim, self.filters)
 
-    self.kernel = self.add_variable(name='kernel',
-                                    shape=kernel_shape,
-                                    initializer=self.kernel_initializer,
-                                    regularizer=self.kernel_regularizer,
-                                    constraint=self.kernel_constraint,
-                                    trainable=True,
-                                    dtype=self.dtype)
+    self.kernel = self.add_weight(
+        name='kernel',
+        shape=kernel_shape,
+        initializer=self.kernel_initializer,
+        regularizer=self.kernel_regularizer,
+        constraint=self.kernel_constraint,
+        trainable=True,
+        dtype=self.dtype)
     if self.use_bias:
-      self.bias = self.add_variable(name='bias',
-                                    shape=(self.filters,),
-                                    initializer=self.bias_initializer,
-                                    regularizer=self.bias_regularizer,
-                                    constraint=self.bias_constraint,
-                                    trainable=True,
-                                    dtype=self.dtype)
+      self.bias = self.add_weight(
+          name='bias',
+          shape=(self.filters,),
+          initializer=self.bias_initializer,
+          regularizer=self.bias_regularizer,
+          constraint=self.bias_constraint,
+          trainable=True,
+          dtype=self.dtype)
     else:
       self.bias = None
     self.input_spec = InputSpec(ndim=self.rank + 2,
@@ -720,21 +722,23 @@ class Conv2DTranspose(Conv2D):
     self.input_spec = InputSpec(ndim=4, axes={channel_axis: input_dim})
     kernel_shape = self.kernel_size + (self.filters, input_dim)
 
-    self.kernel = self.add_variable(name='kernel',
-                                    shape=kernel_shape,
-                                    initializer=self.kernel_initializer,
-                                    regularizer=self.kernel_regularizer,
-                                    constraint=self.kernel_constraint,
-                                    trainable=True,
-                                    dtype=self.dtype)
+    self.kernel = self.add_weight(
+        name='kernel',
+        shape=kernel_shape,
+        initializer=self.kernel_initializer,
+        regularizer=self.kernel_regularizer,
+        constraint=self.kernel_constraint,
+        trainable=True,
+        dtype=self.dtype)
     if self.use_bias:
-      self.bias = self.add_variable(name='bias',
-                                    shape=(self.filters,),
-                                    initializer=self.bias_initializer,
-                                    regularizer=self.bias_regularizer,
-                                    constraint=self.bias_constraint,
-                                    trainable=True,
-                                    dtype=self.dtype)
+      self.bias = self.add_weight(
+          name='bias',
+          shape=(self.filters,),
+          initializer=self.bias_initializer,
+          regularizer=self.bias_regularizer,
+          constraint=self.bias_constraint,
+          trainable=True,
+          dtype=self.dtype)
     else:
       self.bias = None
     self.built = True
@@ -961,7 +965,7 @@ class Conv3DTranspose(Conv3D):
     kernel_shape = self.kernel_size + (self.filters, input_dim)
     self.input_spec = InputSpec(ndim=5, axes={channel_axis: input_dim})
 
-    self.kernel = self.add_variable(
+    self.kernel = self.add_weight(
         'kernel',
         shape=kernel_shape,
         initializer=self.kernel_initializer,
@@ -970,7 +974,7 @@ class Conv3DTranspose(Conv3D):
         trainable=True,
         dtype=self.dtype)
     if self.use_bias:
-      self.bias = self.add_variable(
+      self.bias = self.add_weight(
           'bias',
           shape=(self.filters,),
           initializer=self.bias_initializer,
@@ -1222,7 +1226,7 @@ class SeparableConv(Conv):
     pointwise_kernel_shape = (
         1,) * self.rank + (self.depth_multiplier * input_dim, self.filters)
 
-    self.depthwise_kernel = self.add_variable(
+    self.depthwise_kernel = self.add_weight(
         name='depthwise_kernel',
         shape=depthwise_kernel_shape,
         initializer=self.depthwise_initializer,
@@ -1230,7 +1234,7 @@ class SeparableConv(Conv):
         constraint=self.depthwise_constraint,
         trainable=True,
         dtype=self.dtype)
-    self.pointwise_kernel = self.add_variable(
+    self.pointwise_kernel = self.add_weight(
         name='pointwise_kernel',
         shape=pointwise_kernel_shape,
         initializer=self.pointwise_initializer,
@@ -1239,13 +1243,14 @@ class SeparableConv(Conv):
         trainable=True,
         dtype=self.dtype)
     if self.use_bias:
-      self.bias = self.add_variable(name='bias',
-                                    shape=(self.filters,),
-                                    initializer=self.bias_initializer,
-                                    regularizer=self.bias_regularizer,
-                                    constraint=self.bias_constraint,
-                                    trainable=True,
-                                    dtype=self.dtype)
+      self.bias = self.add_weight(
+          name='bias',
+          shape=(self.filters,),
+          initializer=self.bias_initializer,
+          regularizer=self.bias_regularizer,
+          constraint=self.bias_constraint,
+          trainable=True,
+          dtype=self.dtype)
     else:
       self.bias = None
     self.built = True
diff --git a/tensorflow/python/keras/layers/core.py b/tensorflow/python/keras/layers/core.py
index df4c3915a3..5061825d38 100644
--- a/tensorflow/python/keras/layers/core.py
+++ b/tensorflow/python/keras/layers/core.py
@@ -882,21 +882,23 @@ class Dense(Layer):
                        'should be defined. Found `None`.')
     self.input_spec = InputSpec(min_ndim=2,
                                 axes={-1: input_shape[-1].value})
-    self.kernel = self.add_variable('kernel',
-                                    shape=[input_shape[-1].value, self.units],
-                                    initializer=self.kernel_initializer,
-                                    regularizer=self.kernel_regularizer,
-                                    constraint=self.kernel_constraint,
-                                    dtype=self.dtype,
-                                    trainable=True)
+    self.kernel = self.add_weight(
+        'kernel',
+        shape=[input_shape[-1].value, self.units],
+        initializer=self.kernel_initializer,
+        regularizer=self.kernel_regularizer,
+        constraint=self.kernel_constraint,
+        dtype=self.dtype,
+        trainable=True)
     if self.use_bias:
-      self.bias = self.add_variable('bias',
-                                    shape=[self.units,],
-                                    initializer=self.bias_initializer,
-                                    regularizer=self.bias_regularizer,
-                                    constraint=self.bias_constraint,
-                                    dtype=self.dtype,
-                                    trainable=True)
+      self.bias = self.add_weight(
+          'bias',
+          shape=[self.units,],
+          initializer=self.bias_initializer,
+          regularizer=self.bias_regularizer,
+          constraint=self.bias_constraint,
+          dtype=self.dtype,
+          trainable=True)
     else:
       self.bias = None
     self.built = True
diff --git a/tensorflow/python/keras/layers/normalization.py b/tensorflow/python/keras/layers/normalization.py
index 7743d00c0f..ff51eadee9 100644
--- a/tensorflow/python/keras/layers/normalization.py
+++ b/tensorflow/python/keras/layers/normalization.py
@@ -183,7 +183,7 @@ class BatchNormalization(Layer):
   def _add_tower_local_variable(self, *args, **kwargs):
     tower_context = distribute_lib.get_tower_context()
     with tower_context.tower_local_var_scope('mean'):
-      return self.add_variable(*args, **kwargs)
+      return self.add_weight(*args, **kwargs)
 
   def build(self, input_shape):
     input_shape = tensor_shape.TensorShape(input_shape)
@@ -276,7 +276,7 @@ class BatchNormalization(Layer):
           self.axis[idx] = x + 1      # Account for added dimension
 
     if self.scale:
-      self.gamma = self.add_variable(
+      self.gamma = self.add_weight(
           name='gamma',
           shape=param_shape,
           dtype=param_dtype,
@@ -291,7 +291,7 @@ class BatchNormalization(Layer):
             1.0, dtype=param_dtype, shape=param_shape)
 
     if self.center:
-      self.beta = self.add_variable(
+      self.beta = self.add_weight(
           name='beta',
           shape=param_shape,
           dtype=param_dtype,
-- 
GitLab


From 88d52c145b7fab581bc97a9ce99514e149c558dc Mon Sep 17 00:00:00 2001
From: Bixia Zheng <bixia@google.com>
Date: Thu, 7 Jun 2018 21:22:55 -0700
Subject: [PATCH 471/610] Enhance row reduction implementation.

The current implementation tiles the x-dimension of the tensors to calculate the
partial results of the reduction. This change increases the x-tile size from 8
to 64 when doing so leaves every tile saturated. Otherwise, it adds z-dimension
tiles so that each thread reduces more elements into its partial result, which
reduces the number of dynamic atomic operations and intra-warp reduction
operations needed.

Use a tighter yet safe loop bound for the last unsaturated tile.

Avoid generating the atomic operation when the tile size is not smaller than the
reduction width.

Extend the ForLoop emitter to support requests for full loop unrolling.

Add three tests.

PiperOrigin-RevId: 199744209
---
 .../xla/service/cpu/dot_op_emitter.cc         | 169 +++++-----
 tensorflow/compiler/xla/service/gpu/BUILD     |   1 +
 .../xla/service/gpu/ir_emitter_unnested.cc    | 316 +++++++++++-------
 .../service/llvm_ir/kernel_support_library.cc |  48 +--
 .../service/llvm_ir/kernel_support_library.h  | 175 +++++++---
 .../compiler/xla/service/llvm_ir/llvm_loop.cc |  33 +-
 .../compiler/xla/service/llvm_ir/llvm_loop.h  |  59 ++--
 7 files changed, 499 insertions(+), 302 deletions(-)

diff --git a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
index fe4ba2a070..8eb39d615f 100644
--- a/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/dot_op_emitter.cc
@@ -324,11 +324,11 @@ void ColumnMajorMatrixVectorProductEmitter::Emit() {
   int64 column_remainder = k() % tile_cols();
   int64 column_limit = k() - column_remainder;
 
-  ksl_.For("dot.outer.tiled",
-           /*start=*/0, /*end=*/column_limit, /*step=*/tile_cols(),
-           [&](llvm::Value* column, bool is_first_column) {
-             EmitOuterLoopBody(column, tile_cols(), is_first_column);
-           });
+  ksl_.ForReturnVoid("dot.outer.tiled",
+                     /*start=*/0, /*end=*/column_limit, /*step=*/tile_cols(),
+                     [&](llvm::Value* column, bool is_first_column) {
+                       EmitOuterLoopBody(column, tile_cols(), is_first_column);
+                     });
 
   if (column_remainder != 0) {
     EmitOuterLoopBody(ir_builder_->getInt64(column_limit), column_remainder,
@@ -341,19 +341,20 @@ void ColumnMajorMatrixVectorProductEmitter::EmitInnerLoopTiled(
     int64 columns, bool is_first_column) {
   int64 row_limit = m() - (m() % tile_rows());
 
-  ksl_.For("dot.inner.tiled", /*start=*/0, /*end=*/row_limit,
-           /*step=*/tile_rows(), [&](llvm::Value* row) {
-             std::vector<llvm::Value*> lhs_tile =
-                 lhs_memory_tile->LoadTile(/*minor_dim_offset=*/row);
-             llvm::Value* accumulator =
-                 is_first_column ? (addend_ ? vsl_.LoadVector(addend_, row)
-                                            : vsl_.GetZeroVector())
-                                 : vsl_.LoadVector(result_, row);
-             for (int i = 0; i < columns; i++) {
-               accumulator = vsl_.MulAdd(lhs_tile[i], rhs_tile[i], accumulator);
-             }
-             vsl_.StoreVector(accumulator, result_, row);
-           });
+  ksl_.ForReturnVoid(
+      "dot.inner.tiled", /*start=*/0, /*end=*/row_limit,
+      /*step=*/tile_rows(), [&](llvm::Value* row) {
+        std::vector<llvm::Value*> lhs_tile =
+            lhs_memory_tile->LoadTile(/*minor_dim_offset=*/row);
+        llvm::Value* accumulator =
+            is_first_column ? (addend_ ? vsl_.LoadVector(addend_, row)
+                                       : vsl_.GetZeroVector())
+                            : vsl_.LoadVector(result_, row);
+        for (int i = 0; i < columns; i++) {
+          accumulator = vsl_.MulAdd(lhs_tile[i], rhs_tile[i], accumulator);
+        }
+        vsl_.StoreVector(accumulator, result_, row);
+      });
 }
 
 void ColumnMajorMatrixVectorProductEmitter::EmitInnerLoopEpilogue(
@@ -372,7 +373,7 @@ void ColumnMajorMatrixVectorProductEmitter::EmitInnerLoopEpilogue(
   //     // initialized.
   //   }
 
-  ksl_.For(
+  ksl_.ForReturnVoid(
       "dot.inner.epilg.outer", /*start=*/current_tile_col,
       /*end=*/ir_builder_->CreateAdd(columns_llvm, current_tile_col),
       /*step=*/1, /*peel_first_iteration=*/false,
@@ -382,7 +383,7 @@ void ColumnMajorMatrixVectorProductEmitter::EmitInnerLoopEpilogue(
             ir_builder_->CreateMul(col, ir_builder_->getInt64(m()));
         llvm::Value* lhs_base_pointer =
             vsl_.ComputeOffsetPointer(lhs_, total_offset);
-        ksl_.For(
+        ksl_.ForReturnVoid(
             "dot.inner.epilg.inner", /*start=*/row_start, /*end=*/m(),
             /*step=*/1, [&](llvm::Value* scalar_row) {
               llvm::Value* product = vsl_.Mul(
@@ -390,7 +391,7 @@ void ColumnMajorMatrixVectorProductEmitter::EmitInnerLoopEpilogue(
               llvm::Value* setting_result_first_time = ir_builder_->CreateAnd(
                   is_first_scalar_col,
                   ir_builder_->getInt1(is_first_tiled_column));
-              ksl_.If(
+              ksl_.IfReturnVoid(
                   setting_result_first_time,
                   /*true_block_generator=*/
                   [&]() {
@@ -571,9 +572,10 @@ void RowMajorMatrixVectorProductEmitter::Emit() {
   int64 row_remainder = m() % tile_rows();
   int64 row_limit = m() - row_remainder;
 
-  ksl_.For("dot.outer.tiled",
-           /*start=*/0, /*end=*/row_limit, /*step=*/tile_rows(),
-           [&](llvm::Value* row) { EmitOuterLoopBody(row, tile_rows()); });
+  ksl_.ForReturnVoid(
+      "dot.outer.tiled",
+      /*start=*/0, /*end=*/row_limit, /*step=*/tile_rows(),
+      [&](llvm::Value* row) { EmitOuterLoopBody(row, tile_rows()); });
 
   if (row_remainder != 0) {
     EmitOuterLoopBody(ir_builder_->getInt64(row_limit), row_remainder);
@@ -585,17 +587,17 @@ void RowMajorMatrixVectorProductEmitter::EmitInnerLoopTiled(
     std::vector<VectorVariable>* vector_accumulators) {
   int64 column_limit = k() - (k() % tile_cols());
 
-  ksl_.For("dot.inner.tiled", /*start=*/0, /*end=*/column_limit,
-           /*step=*/tile_cols(), [&](llvm::Value* col) {
-             std::vector<llvm::Value*> lhs_tile =
-                 lhs_memory_tile->LoadTile(/*minor_dim_offset=*/col);
-             llvm::Value* rhs_value = vsl_.LoadVector(rhs_, col);
-             for (int i = 0; i < rows; i++) {
-               llvm::Value* old_sum = (*vector_accumulators)[i].Get();
-               (*vector_accumulators)[i].Set(
-                   vsl_.Add(old_sum, vsl_.Mul(rhs_value, lhs_tile[i])));
-             }
-           });
+  ksl_.ForReturnVoid("dot.inner.tiled", /*start=*/0, /*end=*/column_limit,
+                     /*step=*/tile_cols(), [&](llvm::Value* col) {
+                       std::vector<llvm::Value*> lhs_tile =
+                           lhs_memory_tile->LoadTile(/*minor_dim_offset=*/col);
+                       llvm::Value* rhs_value = vsl_.LoadVector(rhs_, col);
+                       for (int i = 0; i < rows; i++) {
+                         llvm::Value* old_sum = (*vector_accumulators)[i].Get();
+                         (*vector_accumulators)[i].Set(vsl_.Add(
+                             old_sum, vsl_.Mul(rhs_value, lhs_tile[i])));
+                       }
+                     });
 }
 
 void RowMajorMatrixVectorProductEmitter::EmitInnerLoopEpilogue(
@@ -612,14 +614,15 @@ void RowMajorMatrixVectorProductEmitter::EmitInnerLoopEpilogue(
         ir_builder_->getInt64(k()));
     llvm::Value* lhs_base_pointer =
         vsl_.ComputeOffsetPointer(lhs_, total_offset);
-    ksl_.For("dot.inner.epilg.inner", /*start=*/column_start, /*end=*/k(),
-             /*step=*/1, [&](llvm::Value* scalar_col) {
-               llvm::Value* product =
-                   vsl_.Mul(vsl_.LoadScalar(lhs_base_pointer, scalar_col),
-                            vsl_.LoadScalar(rhs_, scalar_col));
-               llvm::Value* old_value = (*scalar_accumulators)[r].Get();
-               (*scalar_accumulators)[r].Set(vsl_.Add(old_value, product));
-             });
+    ksl_.ForReturnVoid(
+        "dot.inner.epilg.inner", /*start=*/column_start, /*end=*/k(),
+        /*step=*/1, [&](llvm::Value* scalar_col) {
+          llvm::Value* product =
+              vsl_.Mul(vsl_.LoadScalar(lhs_base_pointer, scalar_col),
+                       vsl_.LoadScalar(rhs_, scalar_col));
+          llvm::Value* old_value = (*scalar_accumulators)[r].Get();
+          (*scalar_accumulators)[r].Set(vsl_.Add(old_value, product));
+        });
   }
 }
 
@@ -817,7 +820,7 @@ void MatrixMatrixBlockPanelEmitter::HandleResiduesOnN() {
 
   if (n_start != dims().n()) {
     VectorSupportLibrary vsl(scalar_type(), 1, ir_builder_, "gebp");
-    ksl_.For("epi.n", n_start, dims().n(), 1, [&](llvm::Value* n_i) {
+    ksl_.ForReturnVoid("epi.n", n_start, dims().n(), 1, [&](llvm::Value* n_i) {
       llvm::Value* n_i_next =
           ir_builder_->CreateAdd(n_i, ir_builder_->getInt64(1));
       HandleResiduesOnK(&vsl, n_i, n_i_next);
@@ -929,39 +932,44 @@ void MatrixMatrixBlockPanelEmitter::EmitTiledGemm(
     VectorSupportLibrary* vsl, int64 tile_size_k, llvm::Value* k_start,
     llvm::Value* k_end, llvm::Value* n_start, llvm::Value* n_end,
     int64 tile_size_m, llvm::Value* m_start, llvm::Value* m_end) {
-  ksl_.For("dot.m", m_start, m_end, tile_size_m, [&](llvm::Value* m_i) {
-    MemoryTile result_memory_tile(vsl, ir_builder_, /*matrix=*/result_,
-                                  /*matrix_size_along_minor_dim=*/dims().n(),
-                                  /*major_dim_offset=*/m_i,
-                                  /*tile_size_along_major_dim=*/tile_size_m);
-    MemoryTile lhs_memory_tile(vsl, ir_builder_, /*matrix=*/lhs_,
-                               /*matrix_size_along_minor_dim=*/dims().k(),
-                               /*major_dim_offset=*/m_i,
-                               /*tile_size_along_major_dim=*/tile_size_m);
-
-    ksl_.For(
-        "dot.n", n_start, n_end, vsl->vector_size(), [&](llvm::Value* n_i) {
-          TileVariable result_tile_var(vsl, result_memory_tile.LoadTile(n_i));
-          ksl_.For("dot.k", k_start, k_end, tile_size_k, [&](llvm::Value* k_i) {
-            MemoryTile rhs_memory_tile(vsl, ir_builder_, rhs_, dims().n(), k_i,
-                                       tile_size_k);
-            std::vector<std::vector<llvm::Value*>> lhs_tile =
-                lhs_memory_tile.LoadBroadcastTile(k_i, tile_size_k);
-            std::vector<llvm::Value*> rhs_tile = rhs_memory_tile.LoadTile(n_i);
-            std::vector<llvm::Value*> result_tile = result_tile_var.Get();
-            for (int64 r_m_i = 0; r_m_i < tile_size_m; r_m_i++) {
-              for (int64 r_k_i = 0; r_k_i < tile_size_k; r_k_i++) {
-                result_tile[r_m_i] =
-                    vsl->MulAdd(lhs_tile[r_m_i][r_k_i], rhs_tile[r_k_i],
-                                result_tile[r_m_i]);
-              }
-            }
-            result_tile_var.Set(result_tile);
-          });
-
-          result_memory_tile.StoreTile(result_tile_var.Get(), n_i);
-        });
-  });
+  ksl_.ForReturnVoid(
+      "dot.m", m_start, m_end, tile_size_m, [&](llvm::Value* m_i) {
+        MemoryTile result_memory_tile(
+            vsl, ir_builder_, /*matrix=*/result_,
+            /*matrix_size_along_minor_dim=*/dims().n(),
+            /*major_dim_offset=*/m_i,
+            /*tile_size_along_major_dim=*/tile_size_m);
+        MemoryTile lhs_memory_tile(vsl, ir_builder_, /*matrix=*/lhs_,
+                                   /*matrix_size_along_minor_dim=*/dims().k(),
+                                   /*major_dim_offset=*/m_i,
+                                   /*tile_size_along_major_dim=*/tile_size_m);
+        ksl_.ForReturnVoid(
+            "dot.n", n_start, n_end, vsl->vector_size(), [&](llvm::Value* n_i) {
+              TileVariable result_tile_var(vsl,
+                                           result_memory_tile.LoadTile(n_i));
+              ksl_.ForReturnVoid(
+                  "dot.k", k_start, k_end, tile_size_k, [&](llvm::Value* k_i) {
+                    MemoryTile rhs_memory_tile(vsl, ir_builder_, rhs_,
+                                               dims().n(), k_i, tile_size_k);
+                    std::vector<std::vector<llvm::Value*>> lhs_tile =
+                        lhs_memory_tile.LoadBroadcastTile(k_i, tile_size_k);
+                    std::vector<llvm::Value*> rhs_tile =
+                        rhs_memory_tile.LoadTile(n_i);
+                    std::vector<llvm::Value*> result_tile =
+                        result_tile_var.Get();
+                    for (int64 r_m_i = 0; r_m_i < tile_size_m; r_m_i++) {
+                      for (int64 r_k_i = 0; r_k_i < tile_size_k; r_k_i++) {
+                        result_tile[r_m_i] =
+                            vsl->MulAdd(lhs_tile[r_m_i][r_k_i], rhs_tile[r_k_i],
+                                        result_tile[r_m_i]);
+                      }
+                    }
+                    result_tile_var.Set(result_tile);
+                  });
+
+              result_memory_tile.StoreTile(result_tile_var.Get(), n_i);
+            });
+      });
 }
 
 }  // namespace
@@ -1293,8 +1301,11 @@ Status DotOpEmitter::Emit() {
   // from messing up the vectorization.
   std::unique_ptr<llvm_ir::ForLoop> reduction_loop = loop_nest.AddLoop(
       0, lhs_shape.dimensions(lhs_reduction_dimension), "reduction",
-      /*prevent_unrolling=*/lhs_reduction_along_minor_dimension &&
-          rhs_reduction_along_minor_dimension);
+      /*unroll_mode=*/
+      (lhs_reduction_along_minor_dimension &&
+       rhs_reduction_along_minor_dimension)
+          ? xla::llvm_ir::UnrollMode::kNoUnroll
+          : xla::llvm_ir::UnrollMode::kDefaultUnroll);
 
   // The final entry in the rhs and lhs indexes is the indvar of the
   // reduction loop.
diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD
index 6bd9d4c31d..5e5ca7c72c 100644
--- a/tensorflow/compiler/xla/service/gpu/BUILD
+++ b/tensorflow/compiler/xla/service/gpu/BUILD
@@ -164,6 +164,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:name_uniquer",
         "//tensorflow/compiler/xla/service/llvm_ir:fused_ir_emitter",
         "//tensorflow/compiler/xla/service/llvm_ir:ir_array",
+        "//tensorflow/compiler/xla/service/llvm_ir:kernel_support_library",
         "//tensorflow/compiler/xla/service/llvm_ir:llvm_loop",
         "//tensorflow/compiler/xla/service/llvm_ir:llvm_util",
         "//tensorflow/compiler/xla/service/llvm_ir:loop_emitter",
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
index ed005f6afc..a3c1c06cbc 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
@@ -59,6 +59,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/ops.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/tuple_ops.h"
@@ -1391,6 +1392,30 @@ Status IrEmitterUnnested::EmitColumnReduction(
       .EmitLoop(IrName(reduce));
 }
 
+// Returns the {x_tile_size, z_tile_size} tiling used by EmitRowReduction.
+static std::pair<int64, int64> ComputeTilingSchemeForReduction(
+    int64 depth, int64 width, int64 kWarpSize) {
+  constexpr int64 kTargetNumElementsPerThread = 64;
+  int64 x_tile_size = kTargetNumElementsPerThread;
+  int64 z_tile_size = 1;
+
+  // Only use the large x tile (kTargetNumElementsPerThread) when that avoids
+  // the slow loop variant with a bounds check, i.e. when width is a multiple of
+  // (kWarpSize * kTargetNumElementsPerThread). A more sophisticated heuristic
+  // would also use it when width is big enough that only a small fraction of
+  // the threads execute the slow, bounds-checked variant.
+  if (width % (kWarpSize * kTargetNumElementsPerThread) != 0) {
+    // Fall back to x tile 8 and the largest z tile <= 8 that divides depth.
+    x_tile_size = 8;
+    z_tile_size = 8;
+    while (depth % z_tile_size != 0) {
+      z_tile_size -= 1;
+    }
+  }
+
+  return std::pair<int64, int64>(x_tile_size, z_tile_size);
+}
+
 Status IrEmitterUnnested::EmitRowReduction(
     int64 depth, int64 height, int64 width, HloInstruction* reduce,
     const Shape& input_shape,
@@ -1402,7 +1427,7 @@ Status IrEmitterUnnested::EmitRowReduction(
         std::pair<llvm_ir::ElementGenerator, ShapeIndex>>
         extra_output_gens) {
   // A naive algorithm is:
-  // 1. Divide the input tensor into tiles of size 1x1xK.
+  // 1. Divide the x dimension of the input tensor into tiles of size 1x1xX.
   // 2. Partially reduces each tile to a scalar using one thread.
   // 3. Accumulates that scalar to the output vector using atomic operations.
   //
@@ -1413,15 +1438,15 @@ Status IrEmitterUnnested::EmitRowReduction(
   //   int y = linear_index / width_in_tiles % height;
   //   int z = linear_index / (height * width_in_tiles);
   //   float partial_result = 0;
-  //   for (element_id_in_tile : range(kTileSize)) {
-  //     int x = x_in_tiles * kTileSize + element_id_in_tile;
+  //   for (element_id_in_tile : range(x_tile_size)) {
+  //     int x = x_in_tiles * x_tile_size + element_id_in_tile;
   //     if (x < width)
   //       partial_result = reducer(partial_result, input[z][y][z]);
   //   }
   //   AtomicReducer(&output[y], partial_result);
   // }
   //
-  // Three optimizations are performed.
+  // Four optimizations are performed.
   //
   // 1. To coalesce global memory accesses, dilate the tile with a factor of 32
   // (i.e. the warp size). For example, suppose the width is 8x32=256. Instead
@@ -1448,29 +1473,44 @@ Status IrEmitterUnnested::EmitRowReduction(
   // element_id_in_tile, which makes the code more friendly to optimizations
   // such as LICM.
   //
+  // 4. When the width is too small and x_tile_size is less than the target
+  //    number of elements per thread, use a small factor of depth as
+  //    z_tile_size to increase the number of elements accumulated by each
+  //    partial sum. This can reduce the needed number of dynamic shfl_down and
+  //    atomic operations.
+  //
   // for (linear_index = threadIdx.x + blockIdx.x * blockDim.x;
   //      linear_index < depth * height * width_in_tiles;
   //      linear_index += blockDim.x * gridDim.x) {
   //   int x_in_tiles = linear_index % width_in_tiles;
   //   int y = linear_index / width_in_tiles % height;
-  //   int z = linear_index / (height * width_in_tiles);
+  //   int z_in_tiles = linear_index / (height * width_in_tiles);
   //   int warp_id = x_in_tiles / warpSize;
   //   int lane_id = x_in_tiles % warpSize;
   //   float partial_result = 0;
   //   int x = warp_id * kTileSize * warpSize + lane_id;
-  //   if (width % (kTileSize * warpSize) == 0 ||
-  //       x + (kTileSize - 1) * warpSize < width) {
-  //     // The entire tile is in bounds.
-  //     for (int element_id_in_tile = 0; element_id_in_tile < kTileSize;
-  //        ++element_id_in_tile, x += warpSize) {
-  //       partial_result = Reducer(partial_result, input[z][y][x]);
+  //   if (width % (x_tile_size * warpSize) == 0 ||
+  //       x + (x_tile_size - 1) * warpSize < width) {
+  //     // The entire x_tile is in bounds.
+  //     for (int element_id_in_z_tile = 0; element_id_in_z_tile < z_tile_size;
+  //        ++element_id_in_z_tile) {
+  //       z = z_in_tiles * z_tile_size + element_id_in_z_tile;
+  //       for (int element_id_in_x_tile = 0;element_id_in_x_tile < x_tile_size;
+  //        ++element_id_in_x_tile, x += warpSize) {
+  //         partial_result = Reducer(partial_result, input[z][y][x]);
+  //       }
   //     }
   //   } else {
   //     // The tile is partially in bounds.
-  //     for (int element_id_in_tile = 0; element_id_in_tile < kTileSize;
+  //     for (int element_id_in_z_tile = 0; element_id_in_z_tile < z_tile_size;
+  //        ++element_id_in_z_tile) {
+  //       z = z_in_tiles * z_tile_size + element_id_in_z_tile;
+  //       for (int element_id_in_x_tile = 0;
+  //          element_id_in_x_tile < x_tile_size;
   //          ++element_id_in_tile, x += warpSize) {
-  //       if (x < width)
-  //         partial_result = Reducer(partial_result, input[z][y][x]);
+  //         if (x < width)
+  //           partial_result = Reducer(partial_result, input[z][y][x]);
+  //       }
   //     }
   //   }
   //   for (shuffle_distance = 16; shuffle_distance > 0; shuffle_distance /= 2)
@@ -1481,17 +1521,20 @@ Status IrEmitterUnnested::EmitRowReduction(
   //     AtomicReducer(&output[y], partial_result);
   // }
   //
-  // Choose 8 as the tile size, which matches Eigen's RowReduceKernel.
-  constexpr int64 kTileSize = 8;
+
+  int64 x_tile_size;
+  int64 z_tile_size;
+  std::tie(x_tile_size, z_tile_size) =
+      ComputeTilingSchemeForReduction(depth, width, kWarpSize);
+
   // Round the width in tiles up to the nearest multiple of kWarpSize, so that
   // the use of shfl_down is valid.
   const int64 width_in_tiles =
-      RoundUpToNearest(CeilOfRatio(width, kTileSize), kWarpSize);
+      RoundUpToNearest(CeilOfRatio(width, x_tile_size), kWarpSize);
 
-  auto loop_body_emitter =
-      [=](const llvm_ir::IrArray::Index& tile_index) -> Status {
+  auto loop_body_emitter = [=](const llvm_ir::IrArray::Index& tile_index) {
+    // Emit the loop body that reduces one z-x-tile.
     const int num_reduces = reducers.size();
-    // Emit the loop body that reduces one tile.
     llvm::Type* element_ir_type = llvm_ir::PrimitiveTypeToIrType(
         input_shape.element_type(), ir_emitter_context_->llvm_module());
     std::vector<llvm::Value*> partial_reduction_result_addresses;
@@ -1506,9 +1549,7 @@ Status IrEmitterUnnested::EmitRowReduction(
           partial_reduction_result_address);
     }
 
-    // Emit an inner for-loop that partially reduces the elements in the given
-    // tile.
-    llvm::Value* z = tile_index[0];
+    llvm::Value* z_tile = tile_index[0];
     llvm::Value* y = tile_index[1];
     llvm::Value* x_tile = tile_index[2];
     llvm::Value* warp_id = ir_builder_.CreateUDiv(
@@ -1516,107 +1557,132 @@ Status IrEmitterUnnested::EmitRowReduction(
     llvm::Value* lane_id = ir_builder_.CreateURem(
         x_tile, ir_builder_.getInt64(kWarpSize), "lane_id");
 
-    // The x-location of the last element in this tile.
-    //   last_x = lane_id + warpSize * (kTileSize - 1 + warp_id * kTileSize);
+    // The x-location of the last element in this z-x-tile.
+    //   last_x = lane_id + warpSize * (x_tile_size - 1 + warp_id *
+    //   x_tile_size);
     llvm::Value* last_x = ir_builder_.CreateNSWAdd(
-        lane_id,
-        ir_builder_.CreateNSWMul(
-            ir_builder_.getInt64(kWarpSize),
-            ir_builder_.CreateNSWAdd(
-                ir_builder_.getInt64(kTileSize - 1),
-                ir_builder_.CreateNSWMul(warp_id,
-                                         ir_builder_.getInt64(kTileSize)))));
-
-    auto emit_tile_element_loop = [=](bool tile_in_bounds) -> Status {
-      std::unique_ptr<llvm_ir::ForLoop> tile_element_loop =
-          llvm_ir::ForLoop::EmitForLoop("element_id_in_tile",
-                                        ir_builder_.getInt64(0),
-                                        ir_builder_.getInt64(kTileSize),
-                                        ir_builder_.getInt64(1), &ir_builder_);
-
-      // Emit the body of the partial reduction loop.
-      llvm_ir::SetToFirstInsertPoint(tile_element_loop->GetBodyBasicBlock(),
-                                     &ir_builder_);
-      // x = lane_id + warpSize * (element_id_in_tile + warp_id * kTileSize);
-      llvm::Value* x = ir_builder_.CreateNSWAdd(
-          lane_id,
-          ir_builder_.CreateNSWMul(
-              ir_builder_.getInt64(kWarpSize),
-              ir_builder_.CreateNSWAdd(
-                  tile_element_loop->GetIndVarValue(),
-                  ir_builder_.CreateNSWMul(warp_id,
-                                           ir_builder_.getInt64(kTileSize)))));
-
-      // Unless we know the tile is entirely in bounds, we have to emit a
-      // x-in-bounds check before reading from the input.
-      if (!tile_in_bounds) {
-        llvm_ir::LlvmIfData if_x_in_bounds_data = llvm_ir::EmitIfThenElse(
-            ir_builder_.CreateICmpULT(x, ir_builder_.getInt64(width)),
-            "x_in_bounds", &ir_builder_);
-
-        // Points ir_builder_ to the then-block.
-        llvm_ir::SetToFirstInsertPoint(if_x_in_bounds_data.true_block,
-                                       &ir_builder_);
-      }
+        lane_id, ir_builder_.CreateNSWMul(
+                     ir_builder_.getInt64(kWarpSize),
+                     ir_builder_.CreateNSWAdd(
+                         ir_builder_.getInt64(x_tile_size - 1),
+                         ir_builder_.CreateNSWMul(
+                             warp_id, ir_builder_.getInt64(x_tile_size)))));
+
+    KernelSupportLibrary ksl(
+        &ir_builder_,
+        /*unroll_mode=*/xla::llvm_ir::UnrollMode::kFullyUnroll,
+        /*prevent_vectorization=*/false);
+
+    // Emit a for-loop that partially reduces the elements in the given
+    // z-x-tile.
+    auto emit_z_x_tile_element_loop = [&](bool x_tile_in_bounds,
+                                          int64 x_tile_loop_bound) -> Status {
+      auto emit_z_tile_element_loop = [&](llvm::Value* z_indvar) -> Status {
+        llvm::Value* z = ir_builder_.CreateNSWAdd(
+            z_indvar, ir_builder_.CreateNSWMul(
+                          ir_builder_.getInt64(z_tile_size), z_tile));
+
+        TF_RETURN_IF_ERROR(ksl.For(
+            "x_tile",
+            /*start=*/0, /*end=*/x_tile_loop_bound, /*step=*/1,
+            [&](llvm::Value* x_indvar) -> Status {
+              // x = lane_id + warpSize * (element_id_in_x_tile + warp_id *
+              // x_tile_size);
+              llvm::Value* x = ir_builder_.CreateNSWAdd(
+                  lane_id,
+                  ir_builder_.CreateNSWMul(
+                      ir_builder_.getInt64(kWarpSize),
+                      ir_builder_.CreateNSWAdd(
+                          x_indvar,
+                          ir_builder_.CreateNSWMul(
+                              warp_id, ir_builder_.getInt64(x_tile_size)))));
+
+              // Unless we know the x-tile is entirely in bounds, we have to
+              // emit a x-in-bounds check before reading from the input.
+              if (!x_tile_in_bounds) {
+                llvm_ir::LlvmIfData if_x_in_bounds_data =
+                    llvm_ir::EmitIfThenElse(ir_builder_.CreateICmpULT(
+                                                x, ir_builder_.getInt64(width)),
+                                            "x_in_bounds", &ir_builder_);
+                // Points ir_builder_ to the then-block.
+                llvm_ir::SetToFirstInsertPoint(if_x_in_bounds_data.true_block,
+                                               &ir_builder_);
+              }
+
+              // Emit code that reads the input element and accumulates it
+              // to the partial reduction result.
+              llvm::Value* input_address =
+                  ir_builder_.CreateAlloca(element_ir_type);
+              {
+                // {z,y,x} is an index to input_3d_tensor_shape
+                // [depth,height,width]. We need to convert that to an index
+                // to input_shape (the shape of the operand of "reduce").
+                // This conversion is composed of a transposition from
+                // input_shape to normalized_input_shape and a reshape from
+                // normalized_input_shape to input_3d_tensor_shape.
+                const Shape normalized_input_shape = ShapeUtil::
+                    MakeShapeWithDescendingLayoutAndSamePhysicalLayout(
+                        input_shape);
+                auto input_shape_min2maj =
+                    LayoutUtil::MinorToMajor(input_shape);
+                const std::vector<int64> transpose_dimension_mapping(
+                    input_shape_min2maj.rbegin(), input_shape_min2maj.rend());
+                const Shape input_3d_tensor_shape =
+                    ShapeUtil::MakeShapeWithDescendingLayout(
+                        input_shape.element_type(), {depth, height, width});
+                const llvm_ir::IrArray::Index input_3d_tensor_index(
+                    {z, y, x}, input_3d_tensor_shape, &ir_builder_);
+                const llvm_ir::IrArray::Index input_index =
+                    input_3d_tensor_index
+                        .SourceIndexOfReshape(input_3d_tensor_shape,
+                                              normalized_input_shape,
+                                              &ir_builder_)
+                        .SourceIndexOfTranspose(
+                            normalized_input_shape, input_shape,
+                            transpose_dimension_mapping, &ir_builder_);
+
+                for (int i = 0; i != num_reduces; ++i) {
+                  TF_ASSIGN_OR_RETURN(llvm::Value* const input_ir_value,
+                                      input_gens[i](input_index));
+                  ir_builder_.CreateStore(input_ir_value, input_address);
+                  TF_RETURN_IF_ERROR(EmitCallToNestedComputation(
+                      *reducers[i],
+                      {partial_reduction_result_addresses[i], input_address},
+                      partial_reduction_result_addresses[i]));
+                }
+                return EmitExtraOutputsForReduce(reduce, input_index,
+                                                 extra_output_gens);
+              }
+            }));
+        return Status::OK();
+      };
 
-      // Emit code that reads the input element and accumulates it to the
-      // partial reduction result.
-      llvm::Value* input_address = ir_builder_.CreateAlloca(element_ir_type);
-      {
-        // {z,y,x} is an index to input_3d_tensor_shape [depth,height,width]. We
-        // need to convert that to an index to input_shape (the shape of the
-        // operand of "reduce"). This conversion is composed of a transposition
-        // from input_shape to normalized_input_shape and a reshape from
-        // normalized_input_shape to input_3d_tensor_shape.
-        const Shape normalized_input_shape =
-            ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(
-                input_shape);
-        auto input_shape_min2maj = LayoutUtil::MinorToMajor(input_shape);
-        const std::vector<int64> transpose_dimension_mapping(
-            input_shape_min2maj.rbegin(), input_shape_min2maj.rend());
-        const Shape input_3d_tensor_shape =
-            ShapeUtil::MakeShapeWithDescendingLayout(input_shape.element_type(),
-                                                     {depth, height, width});
-        const llvm_ir::IrArray::Index input_3d_tensor_index(
-            {z, y, x}, input_3d_tensor_shape, &ir_builder_);
-        const llvm_ir::IrArray::Index input_index =
-            input_3d_tensor_index
-                .SourceIndexOfReshape(input_3d_tensor_shape,
-                                      normalized_input_shape, &ir_builder_)
-                .SourceIndexOfTranspose(normalized_input_shape, input_shape,
-                                        transpose_dimension_mapping,
-                                        &ir_builder_);
-        for (int i = 0; i != num_reduces; ++i) {
-          TF_ASSIGN_OR_RETURN(llvm::Value* const input_ir_value,
-                              input_gens[i](input_index));
-          ir_builder_.CreateStore(input_ir_value, input_address);
-          TF_RETURN_IF_ERROR(EmitCallToNestedComputation(
-              *reducers[i],
-              {partial_reduction_result_addresses[i], input_address},
-              partial_reduction_result_addresses[i]));
-        }
-        return EmitExtraOutputsForReduce(reduce, input_index,
-                                         extra_output_gens);
-      }
+      return ksl.For("z_tile",
+                     /*start=*/0, /*end=*/z_tile_size, /*step=*/1,
+                     emit_z_tile_element_loop);
     };
 
     llvm::Value* tile_in_bounds = ir_builder_.CreateOr(
-        ir_builder_.getInt1(width % (kTileSize * kWarpSize) == 0),
+        ir_builder_.getInt1(width % (x_tile_size * kWarpSize) == 0),
         ir_builder_.CreateICmpULT(last_x, ir_builder_.getInt64(width)));
-    llvm_ir::LlvmIfData if_tile_in_bounds_data =
-        llvm_ir::EmitIfThenElse(tile_in_bounds, "tile_in_bounds", &ir_builder_);
-    llvm_ir::SetToFirstInsertPoint(if_tile_in_bounds_data.true_block,
-                                   &ir_builder_);
-    TF_RETURN_IF_ERROR(emit_tile_element_loop(/*tile_in_bounds=*/true));
-    llvm_ir::SetToFirstInsertPoint(if_tile_in_bounds_data.false_block,
-                                   &ir_builder_);
-    TF_RETURN_IF_ERROR(emit_tile_element_loop(/*tile_in_bounds=*/false));
 
-    // After the if-then-else statement on tile_in_bounds, emit calls to
-    // shfl_down that accumulate the partial reduction results of all threads
-    // from the warp.
-    llvm_ir::SetToFirstInsertPoint(if_tile_in_bounds_data.after_block,
-                                   &ir_builder_);
+    TF_RETURN_IF_ERROR(
+        ksl.If(tile_in_bounds,
+               /*true_block_generator=*/
+               [&]() -> Status {
+                 return emit_z_x_tile_element_loop(/*x_tile_in_bounds=*/true,
+                                                   x_tile_size);
+               },
+               /*false_block_generator=*/
+               [&]() -> Status {
+                 return emit_z_x_tile_element_loop(
+                     /*x_tile_in_bounds=*/false,
+                     CeilOfRatio(width % (x_tile_size * kWarpSize), kWarpSize));
+               }));
+
+    // After accumulating the elements of the z_x_tile, emit calls to
+    // shfl_down that accumulate the partial reduction results of all
+    // threads in a warp.
     int bit_width = llvm_ir::GetSizeInBits(element_ir_type);
     // bitcast cannot be applied to aggregate types (even packed ones), so we
     // instead bitcast addresses of load/store to intN* of the same bit-width.
@@ -1666,16 +1732,28 @@ Status IrEmitterUnnested::EmitRowReduction(
                                              reduce_output_shapes[i]),
                       &ir_builder_),
                   &ir_builder_, "output_element_address");
-      TF_RETURN_IF_ERROR(EmitAtomicOperationForNestedComputation(
-          *reducers[i], output_address, partial_reduction_result_addresses[i]));
+      // A non-atomic update is only safe when this is the sole partial result
+      // for the output element. Every z-tile contributes to the same element,
+      // so there must be exactly one z-tile (z_tile_size == depth); comparing
+      // only the products would miss e.g. depth = 16, width = 4.
+      if (z_tile_size < depth || x_tile_size < width) {
+        TF_RETURN_IF_ERROR(EmitAtomicOperationForNestedComputation(
+            *reducers[i], output_address,
+            partial_reduction_result_addresses[i]));
+      } else {
+        TF_RETURN_IF_ERROR(EmitCallToNestedComputation(
+            *reducers[i],
+            {output_address, partial_reduction_result_addresses[i]},
+            output_address));
+      }
     }
     return Status::OK();
   };
 
   // Emit a parallel loop that iterates through every input tiles.
   Shape tiled_input_shape = ShapeUtil::MakeShapeWithLayout(
-      reduce->shape().element_type(), {depth, height, width_in_tiles},
-      {2, 1, 0});
+      reduce->shape().element_type(),
+      {depth / z_tile_size, height, width_in_tiles}, {2, 1, 0});
   LaunchDimensions launch_dimensions = CalculateLaunchDimensions(
       tiled_input_shape, ir_emitter_context_->device_description());
   CHECK(LastThunk()->kind() == Thunk::Kind::kSequential);
diff --git a/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.cc b/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.cc
index 23d2d4e87d..1f6e3c829f 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.cc
@@ -15,53 +15,57 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h"
 
-#include "tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
 
 namespace xla {
-void KernelSupportLibrary::For(
+Status KernelSupportLibrary::For(
     tensorflow::StringPiece name, llvm::Value* start, llvm::Value* end,
     llvm::Value* step,
-    const std::function<void(llvm::Value*, bool)>& for_body_generator) {
-  If(ir_builder_->CreateICmpSLT(start, end), [&]() {
-    for_body_generator(start, /*is_first_iteration=*/true);
-    For(name, ir_builder_->CreateAdd(start, step), end, step,
-        [&](llvm::Value* iv) { for_body_generator(iv, false); });
+    const std::function<Status(llvm::Value*, bool)>& for_body_generator) {
+  return If(ir_builder_->CreateICmpSLT(start, end), [&]() -> Status {
+    TF_RETURN_IF_ERROR(for_body_generator(start, /*is_first_iteration=*/true));
+    return For(name, ir_builder_->CreateAdd(start, step), end, step,
+               [&](llvm::Value* iv) { return for_body_generator(iv, false); });
   });
 }
 
-void KernelSupportLibrary::For(
+Status KernelSupportLibrary::For(
     tensorflow::StringPiece name, llvm::Value* start, llvm::Value* end,
     llvm::Value* step, bool peel_first_iteration,
-    const std::function<void(llvm::Value*, llvm::Value*)>& for_body_generator) {
+    const std::function<Status(llvm::Value*, llvm::Value*)>&
+        for_body_generator) {
   if (peel_first_iteration) {
-    For(name, start, end, step, true,
-        [&](llvm::Value* indvar, bool is_first_iteration) {
-          for_body_generator(indvar, ir_builder_->getInt1(is_first_iteration));
-        });
+    return For(name, start, end, step, true,
+               [&](llvm::Value* indvar, bool is_first_iteration) -> Status {
+                 return for_body_generator(
+                     indvar, ir_builder_->getInt1(is_first_iteration));
+               });
   } else {
     std::unique_ptr<llvm_ir::ForLoop> loop = llvm_ir::ForLoop::EmitForLoop(
         name, start, end, step, ir_builder_,
-        /*prevent_unrolling=*/prevent_unrolling_,
+        /*unroll_mode=*/unroll_mode_,
         /*prevent_vectorization=*/prevent_vectorization_);
     ir_builder_->SetInsertPoint(&loop->GetBodyBasicBlock()->back());
-    for_body_generator(loop->GetIndVarValue(),
-                       /*is_first_iteration=*/ir_builder_->CreateICmpEQ(
-                           loop->GetIndVarValue(), start));
+    TF_RETURN_IF_ERROR(
+        for_body_generator(loop->GetIndVarValue(),
+                           /*is_first_iteration=*/ir_builder_->CreateICmpEQ(
+                               loop->GetIndVarValue(), start)));
     llvm_ir::SetToLastInsertPoint(loop->GetExitBasicBlock(), ir_builder_);
+    return Status::OK();
   }
 }
 
-void KernelSupportLibrary::If(
-    llvm::Value* condition, const std::function<void()>& true_block_generator,
-    const std::function<void()>& false_block_generator) {
+Status KernelSupportLibrary::If(
+    llvm::Value* condition, const std::function<Status()>& true_block_generator,
+    const std::function<Status()>& false_block_generator) {
   llvm_ir::LlvmIfData if_data =
       llvm_ir::EmitIfThenElse(condition, "", ir_builder_);
   ir_builder_->SetInsertPoint(&if_data.true_block->back());
-  true_block_generator();
+  TF_RETURN_IF_ERROR(true_block_generator());
   ir_builder_->SetInsertPoint(&if_data.false_block->back());
-  false_block_generator();
+  TF_RETURN_IF_ERROR(false_block_generator());
   llvm_ir::SetToLastInsertPoint(if_data.after_block, ir_builder_);
+  return Status::OK();
 }
 
 void KernelSupportLibrary::EmitAndCallOutlinedKernel(
diff --git a/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h b/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h
index 64b935bbf1..e17c649e52 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h
@@ -21,6 +21,7 @@ limitations under the License.
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Value.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
 
@@ -30,13 +31,14 @@ namespace xla {
 class KernelSupportLibrary {
  public:
   // `ir_builder` is the llvm::IRBuilder instance used to generate LLVM IR.
-  // If `prevent_unrolling` is true then unrolling is explicitly disabled on
-  // every loop generated by this instance of KernelSupportLibrary.
-  explicit KernelSupportLibrary(llvm::IRBuilder<>* ir_builder,
-                                bool prevent_unrolling = true,
-                                bool prevent_vectorization = true)
+  // `unroll_mode` specifies the desired LLVM unrolling behavior for every loop
+  // generated by this instance of KernelSupportLibrary.
+  explicit KernelSupportLibrary(
+      llvm::IRBuilder<>* ir_builder,
+      llvm_ir::UnrollMode unroll_mode = llvm_ir::UnrollMode::kNoUnroll,
+      bool prevent_vectorization = true)
       : ir_builder_(ir_builder),
-        prevent_unrolling_(prevent_unrolling),
+        unroll_mode_(unroll_mode),
         prevent_vectorization_(prevent_vectorization) {}
 
   // Generates the following control flow structure:
@@ -46,19 +48,41 @@ class KernelSupportLibrary {
   //     for (i64 i = `start` + `step`; i s< `end`; i += `step`)
   //       `for_body_generator(/*ind_var=*/,i, /*is_first_iteration=*/false)`;
   //   }
-  void For(
+  Status For(
+      tensorflow::StringPiece name, llvm::Value* start, llvm::Value* end,
+      llvm::Value* step,
+      const std::function<Status(llvm::Value* ind_var,
+                                 bool is_first_iteration)>& for_body_generator);
+
+  void ForReturnVoid(
       tensorflow::StringPiece name, llvm::Value* start, llvm::Value* end,
       llvm::Value* step,
       const std::function<void(llvm::Value* ind_var, bool is_first_iteration)>&
-          for_body_generator);
+          for_body_generator) {
+    TF_CHECK_OK(For(
+        name, start, end, step,
+        [&](llvm::Value* ind_var, bool is_first_iteration) -> Status {
+          for_body_generator(ind_var, is_first_iteration);
+          return Status::OK();  // A void body has no error to propagate.
+        }));
+  }
+
+  Status For(tensorflow::StringPiece name, int64 start, int64 end, int64 step,
+             const std::function<Status(llvm::Value* ind_var,
+                                        bool is_first_iteration)>&
+                 for_body_generator) {
+    return For(name, /*start=*/ir_builder_->getInt64(start),
+               /*end=*/ir_builder_->getInt64(end),
+               /*step=*/ir_builder_->getInt64(step), for_body_generator);
+  }
 
-  void For(
+  void ForReturnVoid(
       tensorflow::StringPiece name, int64 start, int64 end, int64 step,
       const std::function<void(llvm::Value* ind_var, bool is_first_iteration)>&
           for_body_generator) {
-    For(name, /*start=*/ir_builder_->getInt64(start),
-        /*end=*/ir_builder_->getInt64(end),
-        /*step=*/ir_builder_->getInt64(step), for_body_generator);
+    ForReturnVoid(name, /*start=*/ir_builder_->getInt64(start),
+                  /*end=*/ir_builder_->getInt64(end),
+                  /*step=*/ir_builder_->getInt64(step), for_body_generator);
   }
 
   // Generates the following control flow structure if `peel_first_iteration` is
@@ -75,46 +99,101 @@ class KernelSupportLibrary {
   //   for (i64 i = `start`; i s< `end`; i += `step`)
   //     `for_body_generator(/*ind_var=*/,i,
   //                         /*is_first_iteration=*/,(i != `start`))`;
-  void For(tensorflow::StringPiece name, llvm::Value* start, llvm::Value* end,
-           llvm::Value* step, bool peel_first_iteration,
-           const std::function<void(llvm::Value* ind_var,
-                                    llvm::Value* is_first_iteration)>&
+  Status For(tensorflow::StringPiece name, llvm::Value* start, llvm::Value* end,
+             llvm::Value* step, bool peel_first_iteration,
+             const std::function<Status(llvm::Value* ind_var,
+                                        llvm::Value* is_first_iteration)>&
+                 for_body_generator);
+
+  void ForReturnVoid(tensorflow::StringPiece name, llvm::Value* start,
+                     llvm::Value* end, llvm::Value* step,
+                     bool peel_first_iteration,
+                     const std::function<void(llvm::Value* ind_var,
+                                              llvm::Value* is_first_iteration)>&
+                         for_body_generator) {
+    TF_CHECK_OK(For(
+        name, start, end, step, peel_first_iteration,
+        [&](llvm::Value* ind_var, llvm::Value* is_first_iteration) -> Status {
+          for_body_generator(ind_var, is_first_iteration);
+          return Status::OK();
+        }));
+  }
+
+  Status For(tensorflow::StringPiece name, llvm::Value* start, llvm::Value* end,
+             int64 step, bool peel_first_iteration,
+             const std::function<Status(llvm::Value* ind_var,
+                                        llvm::Value* is_first_iteration)>&
+                 for_body_generator) {
+    return For(name, /*start=*/start, /*end=*/end,
+               /*step=*/ir_builder_->getInt64(step), peel_first_iteration,
                for_body_generator);
+  }
+
+  void ForReturnVoid(tensorflow::StringPiece name, llvm::Value* start,
+                     llvm::Value* end, int64 step, bool peel_first_iteration,
+                     const std::function<void(llvm::Value* ind_var,
+                                              llvm::Value* is_first_iteration)>&
+                         for_body_generator) {
+    ForReturnVoid(name, /*start=*/start, /*end=*/end,
+                  /*step=*/ir_builder_->getInt64(step), peel_first_iteration,
+                  for_body_generator);
+  }
 
-  void For(tensorflow::StringPiece name, llvm::Value* start, llvm::Value* end,
-           int64 step, bool peel_first_iteration,
-           const std::function<void(llvm::Value* ind_var,
-                                    llvm::Value* is_first_iteration)>&
-               for_body_generator) {
-    For(name, /*start=*/start, /*end=*/end,
-        /*step=*/ir_builder_->getInt64(step), peel_first_iteration,
-        for_body_generator);
+  Status For(
+      tensorflow::StringPiece name, llvm::Value* start, llvm::Value* end,
+      llvm::Value* step,
+      const std::function<Status(llvm::Value* ind_var)>& for_body_generator) {
+    return For(name, start, end, step,
+               /*peel_first_iteration=*/false,
+               [&](llvm::Value* indvar, llvm::Value*) -> Status {
+                 return for_body_generator(indvar);
+               });
   }
 
-  void For(
+  void ForReturnVoid(
       tensorflow::StringPiece name, llvm::Value* start, llvm::Value* end,
       llvm::Value* step,
       const std::function<void(llvm::Value* ind_var)>& for_body_generator) {
-    For(name, start, end, step,
-        /*peel_first_iteration=*/false,
-        [&](llvm::Value* indvar, llvm::Value*) { for_body_generator(indvar); });
+    ForReturnVoid(name, start, end, step,
+                  /*peel_first_iteration=*/false,
+                  [&](llvm::Value* indvar, llvm::Value*) {
+                    return for_body_generator(indvar);
+                  });
+  }
+
+  Status For(
+      tensorflow::StringPiece name, llvm::Value* start, llvm::Value* end,
+      int64 step,
+      const std::function<Status(llvm::Value* ind_var)>& for_body_generator) {
+    return For(name, start, end, ir_builder_->getInt64(step),
+               /*peel_first_iteration=*/false,
+               [&](llvm::Value* indvar, llvm::Value*) -> Status {
+                 return for_body_generator(indvar);
+               });
   }
 
-  void For(
+  void ForReturnVoid(
       tensorflow::StringPiece name, llvm::Value* start, llvm::Value* end,
       int64 step,
       const std::function<void(llvm::Value* ind_var)>& for_body_generator) {
-    For(name, start, end, ir_builder_->getInt64(step),
-        /*peel_first_iteration=*/false,
-        [&](llvm::Value* indvar, llvm::Value*) { for_body_generator(indvar); });
+    ForReturnVoid(name, start, end, ir_builder_->getInt64(step),
+                  for_body_generator);
+  }
+
+  Status For(
+      tensorflow::StringPiece name, int64 start, int64 end, int64 step,
+      const std::function<Status(llvm::Value* ind_var)>& for_body_generator) {
+    return For(name, /*start=*/ir_builder_->getInt64(start),
+               /*end=*/ir_builder_->getInt64(end),
+               /*step=*/ir_builder_->getInt64(step), for_body_generator);
   }
 
-  void For(
+  void ForReturnVoid(
       tensorflow::StringPiece name, int64 start, int64 end, int64 step,
       const std::function<void(llvm::Value* ind_var)>& for_body_generator) {
-    For(name, /*start=*/ir_builder_->getInt64(start),
-        /*end=*/ir_builder_->getInt64(end),
-        /*step=*/ir_builder_->getInt64(step), for_body_generator);
+    ForReturnVoid(name, /*start=*/ir_builder_->getInt64(start),
+                  /*end=*/ir_builder_->getInt64(end),
+                  /*step=*/ir_builder_->getInt64(step), for_body_generator);
   }
 
   // Generates the following control flow structure:
@@ -123,9 +202,25 @@ class KernelSupportLibrary {
   //     `true_block_generator()`;
   //   else
   //      `false_block_generator()`;
-  void If(llvm::Value* condition,
-          const std::function<void()>& true_block_generator,
-          const std::function<void()>& false_block_generator = []() {});
+  Status If(llvm::Value* condition,
+            const std::function<Status()>& true_block_generator,
+            const std::function<Status()>& false_block_generator =
+                []() -> Status { return Status::OK(); });
+
+  void IfReturnVoid(llvm::Value* condition,
+                    const std::function<void()>& true_block_generator,
+                    const std::function<void()>& false_block_generator = []() {
+                    }) {
+    TF_CHECK_OK(If(condition,
+                   [&]() {
+                     true_block_generator();
+                     return Status::OK();
+                   },
+                   [&]() {
+                     false_block_generator();
+                     return Status::OK();
+                   }));
+  }
 
   using ArgumentVector = tensorflow::gtl::ArraySlice<llvm::Value*>;
 
@@ -183,7 +278,7 @@ class KernelSupportLibrary {
 
  private:
   llvm::IRBuilder<>* ir_builder_;
-  bool prevent_unrolling_;
+  llvm_ir::UnrollMode unroll_mode_;
   bool prevent_vectorization_;
 };
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.cc b/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.cc
index 497b48ff22..9f867014fb 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.cc
@@ -34,7 +34,7 @@ namespace llvm_ir {
 
 ForLoop::ForLoop(tensorflow::StringPiece prefix, tensorflow::StringPiece suffix,
                  llvm::Value* start_index, llvm::Value* end_index,
-                 llvm::Value* step, bool prevent_unrolling,
+                 llvm::Value* step, UnrollMode unroll_mode,
                  bool prevent_vectorization)
     : prefix_(std::string(prefix)),
       suffix_(std::string(suffix)),
@@ -42,15 +42,15 @@ ForLoop::ForLoop(tensorflow::StringPiece prefix, tensorflow::StringPiece suffix,
       end_index_(end_index),
       step_(step),
       insert_before_bb_(nullptr),
-      prevent_unrolling_(prevent_unrolling),
+      unroll_mode_(unroll_mode),
       prevent_vectorization_(prevent_vectorization) {}
 
 /* static */ std::unique_ptr<ForLoop> ForLoop::EmitForLoop(
     tensorflow::StringPiece prefix, llvm::Value* start_index,
     llvm::Value* end_index, llvm::Value* step, llvm::IRBuilder<>* ir_builder,
-    bool prevent_unrolling, bool prevent_vectorization) {
+    UnrollMode unroll_mode, bool prevent_vectorization) {
   std::unique_ptr<ForLoop> loop(new ForLoop(prefix, /*suffix=*/"", start_index,
-                                            end_index, step, prevent_unrolling,
+                                            end_index, step, unroll_mode,
                                             prevent_vectorization));
   loop->Emit(ir_builder);
   return loop;
@@ -147,11 +147,12 @@ void ForLoop::Emit(llvm::IRBuilder<>* ir_builder) {
 std::vector<llvm::Metadata*> ForLoop::GetLoopMetadata(
     llvm::IRBuilder<>* ir_builder) {
   const char* const kLlvmLoopUnrollDisableMDName = "llvm.loop.unroll.disable";
+  const char* const kLlvmLoopUnrollFullMDName = "llvm.loop.unroll.full";
   const char* const kLlvmLoopVectorizeMDName = "llvm.loop.vectorize.enable";
   llvm::LLVMContext* ctx = &start_index_->getContext();
 
   std::vector<llvm::Metadata*> result;
-  if (prevent_unrolling_) {
+  if (unroll_mode_ == xla::llvm_ir::UnrollMode::kNoUnroll) {
     result.push_back(llvm::MDNode::get(
         *ctx, {llvm::MDString::get(*ctx, kLlvmLoopUnrollDisableMDName)}));
   }
@@ -162,6 +163,10 @@ std::vector<llvm::Metadata*> ForLoop::GetLoopMetadata(
                llvm::ConstantAsMetadata::get(ir_builder->getFalse())}));
   }
 
+  if (unroll_mode_ == xla::llvm_ir::UnrollMode::kFullyUnroll) {
+    result.push_back(llvm::MDNode::get(
+        *ctx, {llvm::MDString::get(*ctx, kLlvmLoopUnrollFullMDName)}));
+  }
   return result;
 }
 
@@ -178,25 +183,25 @@ llvm::BasicBlock* ForLoop::CreateLoopBB(tensorflow::StringPiece name,
 std::unique_ptr<ForLoop> ForLoopNest::AddLoop(tensorflow::StringPiece suffix,
                                               llvm::Value* start_index,
                                               llvm::Value* end_index,
-                                              bool prevent_unrolling,
+                                              UnrollMode unroll_mode,
                                               bool prevent_vectorization) {
   return AddLoop(suffix, start_index, end_index, ir_builder_->getInt64(1),
-                 prevent_unrolling, prevent_vectorization);
+                 unroll_mode, prevent_vectorization);
 }
 
 std::unique_ptr<ForLoop> ForLoopNest::AddLoop(tensorflow::StringPiece suffix,
                                               llvm::Value* start_index,
                                               llvm::Value* end_index,
                                               llvm::Value* stride,
-                                              bool prevent_unrolling,
+                                              UnrollMode unroll_mode,
                                               bool prevent_vectorization) {
   if (inner_loop_body_bb_ != nullptr) {
     // Create this loop inside the previous one.
     ir_builder_->SetInsertPoint(&*inner_loop_body_bb_->getFirstInsertionPt());
   }
   std::unique_ptr<ForLoop> loop(new ForLoop(
-      /*prefix=*/name_, suffix, start_index, end_index, stride,
-      prevent_unrolling, prevent_vectorization));
+      /*prefix=*/name_, suffix, start_index, end_index, stride, unroll_mode,
+      prevent_vectorization));
   loop->Emit(ir_builder_);
 
   if (outer_loop_preheader_bb_ == nullptr) {
@@ -215,23 +220,23 @@ std::unique_ptr<ForLoop> ForLoopNest::AddLoop(tensorflow::StringPiece suffix,
 std::unique_ptr<ForLoop> ForLoopNest::AddLoop(int64 start_index,
                                               int64 end_index,
                                               tensorflow::StringPiece suffix,
-                                              bool prevent_unrolling,
+                                              UnrollMode unroll_mode,
                                               bool prevent_vectorization) {
   CHECK_LE(start_index, end_index);
   return AddLoop(suffix, ir_builder_->getInt64(start_index),
-                 ir_builder_->getInt64(end_index), prevent_unrolling,
+                 ir_builder_->getInt64(end_index), unroll_mode,
                  prevent_vectorization);
 }
 
 std::unique_ptr<ForLoop> ForLoopNest::AddLoop(int64 start_index,
                                               int64 end_index, int64 stride,
                                               tensorflow::StringPiece suffix,
-                                              bool prevent_unrolling,
+                                              UnrollMode unroll_mode,
                                               bool prevent_vectorization) {
   CHECK_LE(start_index, end_index);
   return AddLoop(suffix, ir_builder_->getInt64(start_index),
                  ir_builder_->getInt64(end_index),
-                 ir_builder_->getInt64(stride), prevent_unrolling,
+                 ir_builder_->getInt64(stride), unroll_mode,
                  prevent_vectorization);
 }
 
diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h b/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h
index d915f95db1..4e403cd994 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_loop.h
@@ -34,6 +34,12 @@ limitations under the License.
 namespace xla {
 namespace llvm_ir {
 
+enum class UnrollMode {
+  kDefaultUnroll,
+  kFullyUnroll,
+  kNoUnroll,
+};
+
 // A class for constructing a for-loop in LLVM IR.
 class ForLoop {
  public:
@@ -69,12 +75,13 @@ class ForLoop {
   // LLVM IR. If non-empty, it is prepended to the name of the induction
   // variable value and each basic block created for the loop.
   //
-  // If `prevent_unrolling` is true then emit metadata that directs LLVM to not
-  // unroll the generated loop.
+  // `unroll_mode` specifies the desired LLVM unrolling behavior for the
+  // generated loop.
   static std::unique_ptr<ForLoop> EmitForLoop(
       tensorflow::StringPiece prefix, llvm::Value* start_index,
       llvm::Value* end_index, llvm::Value* step, llvm::IRBuilder<>* ir_builder,
-      bool prevent_unrolling = false, bool prevent_vectorization = false);
+      UnrollMode unroll_mode = llvm_ir::UnrollMode::kDefaultUnroll,
+      bool prevent_vectorization = false);
 
   // The names of the blocks follow LLVM's conventions. Control flow amongst the
   // blocks for the example C code looks like:
@@ -128,7 +135,7 @@ class ForLoop {
 
   ForLoop(tensorflow::StringPiece prefix, tensorflow::StringPiece suffix,
           llvm::Value* start_index, llvm::Value* end_index, llvm::Value* step,
-          bool prevent_unrolling, bool prevent_vectorization);
+          UnrollMode unroll_mode, bool prevent_vectorization);
 
   // Emit the loop at the insert point of the builder.
   void Emit(llvm::IRBuilder<>* ir_builder);
@@ -161,7 +168,7 @@ class ForLoop {
   llvm::BasicBlock* body_bb_;
   llvm::BasicBlock* exit_bb_;
   llvm::Value* indvar_;
-  bool prevent_unrolling_;
+  UnrollMode unroll_mode_;
   bool prevent_vectorization_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(ForLoop);
@@ -182,34 +189,34 @@ class ForLoopNest {
 
   // Adds a loop to the nest. If no loop has been added yet then emit a loop at
   // the current insert point of the given builder. If one or more loops have
-  // been added then emit loop inside the body of the last added loop.  If
-  // prevent_unrolling is true, then metadata is emitting directing LLVM to not
-  // unroll this loop.
-  std::unique_ptr<ForLoop> AddLoop(tensorflow::StringPiece suffix,
-                                   llvm::Value* start_index,
-                                   llvm::Value* end_index, llvm::Value* stride,
-                                   bool prevent_unrolling = false,
-                                   bool prevent_vectorization = false);
+  // been added then emit loop inside the body of the last added loop.
+  // unroll_mode is used to emit metadata that controls LLVM unrolling.
+  std::unique_ptr<ForLoop> AddLoop(
+      tensorflow::StringPiece suffix, llvm::Value* start_index,
+      llvm::Value* end_index, llvm::Value* stride,
+      UnrollMode unroll_mode = xla::llvm_ir::UnrollMode::kDefaultUnroll,
+      bool prevent_vectorization = false);
 
   // Like the above, except that it defaults to a stride of one.
-  std::unique_ptr<ForLoop> AddLoop(tensorflow::StringPiece suffix,
-                                   llvm::Value* start_index,
-                                   llvm::Value* end_index,
-                                   bool prevent_unrolling = false,
-                                   bool prevent_vectorization = false);
+  std::unique_ptr<ForLoop> AddLoop(
+      tensorflow::StringPiece suffix, llvm::Value* start_index,
+      llvm::Value* end_index,
+      UnrollMode unroll_mode = xla::llvm_ir::UnrollMode::kDefaultUnroll,
+      bool prevent_vectorization = false);
 
   // A convenient wrapper of the other flavor of AddLoop. The given start and
   // end index are constant.
-  std::unique_ptr<ForLoop> AddLoop(int64 start_index, int64 end_index,
-                                   int64 stride, tensorflow::StringPiece suffix,
-                                   bool prevent_unrolling = false,
-                                   bool prevent_vectorization = false);
+  std::unique_ptr<ForLoop> AddLoop(
+      int64 start_index, int64 end_index, int64 stride,
+      tensorflow::StringPiece suffix,
+      UnrollMode unroll_mode = xla::llvm_ir::UnrollMode::kDefaultUnroll,
+      bool prevent_vectorization = false);
 
   // Like the above, except that it defaults to a stride of one.
-  std::unique_ptr<ForLoop> AddLoop(int64 start_index, int64 end_index,
-                                   tensorflow::StringPiece suffix,
-                                   bool prevent_unrolling = false,
-                                   bool prevent_vectorization = false);
+  std::unique_ptr<ForLoop> AddLoop(
+      int64 start_index, int64 end_index, tensorflow::StringPiece suffix,
+      UnrollMode unroll_mode = xla::llvm_ir::UnrollMode::kDefaultUnroll,
+      bool prevent_vectorization = false);
 
   // Add loops to iterate through the indices within the specified
   // shape. The returned index collects the induction variables of the
-- 
GitLab


From 73d6c7bef536d4a15cc1c57d8635d3d670ef34de Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 7 Jun 2018 21:31:57 -0700
Subject: [PATCH 472/610] Wire in the kDomain infrastructure brought in by
 cl/193798254.

PiperOrigin-RevId: 199745064
---
 .../compiler/xla/service/computation_layout.h |  9 +++
 tensorflow/compiler/xla/service/hlo_cse.cc    | 11 +--
 .../compiler/xla/service/hlo_instruction.cc   |  8 +--
 .../compiler/xla/service/hlo_instruction.h    |  5 +-
 .../compiler/xla/service/hlo_sharding.cc      | 27 +++++++
 .../compiler/xla/service/hlo_sharding.h       | 15 +++-
 .../xla/service/hlo_sharding_metadata.cc      | 71 +++++++++----------
 .../compiler/xla/service/tuple_simplifier.cc  | 24 ++-----
 8 files changed, 102 insertions(+), 68 deletions(-)

diff --git a/tensorflow/compiler/xla/service/computation_layout.h b/tensorflow/compiler/xla/service/computation_layout.h
index 53c3a3f7b7..6975f387b4 100644
--- a/tensorflow/compiler/xla/service/computation_layout.h
+++ b/tensorflow/compiler/xla/service/computation_layout.h
@@ -32,12 +32,21 @@ namespace xla {
 // mutable layouts.
 class ComputationLayout {
  public:
+  // Creates a new ComputationLayout with the given result layout.
+  explicit ComputationLayout(ShapeLayout result_layout)
+      : result_layout_(std::move(result_layout)) {}
+
   // Constructs a ComputationLayout from a ProgramShape. The layouts of the
   // parameters and results are set to the default layout. Layouts in the
   // ProgramShape are ignored if ignore_layouts is true.
   explicit ComputationLayout(const ProgramShape& program_shape,
                              bool ignore_layouts = true);
 
+  // Adds a new parameter layout to the computation layout.
+  void add_parameter_layout(ShapeLayout shape_layout) {
+    parameter_layouts_.push_back(std::move(shape_layout));
+  }
+
   // Returns the layout of a particular parameter.
   const ShapeLayout& parameter_layout(int64 param_no) const {
     return parameter_layouts_[param_no];
diff --git a/tensorflow/compiler/xla/service/hlo_cse.cc b/tensorflow/compiler/xla/service/hlo_cse.cc
index dab946a099..a0ee889623 100644
--- a/tensorflow/compiler/xla/service/hlo_cse.cc
+++ b/tensorflow/compiler/xla/service/hlo_cse.cc
@@ -135,17 +135,18 @@ StatusOr<bool> HloCSE::Run(HloModule* module) {
     // instruction for each class.
     tensorflow::gtl::FlatSet<HloInstruction*, decltype(&CseHash),
                              decltype(cse_equal)>
-        representatives(/*N=*/1024, &CseHash, cse_equal);
-
+        representatives(/*N=*/computation->instruction_count() + 1, &CseHash,
+                        cse_equal);
     for (auto instruction : computation->MakeInstructionPostOrder()) {
       // If the instruction has zero operands (constants, parameters, etc.) skip
       // over it.
       if (instruction->operand_count() == 0) {
         continue;
       }
-
-      // Skip instructions which have side effects.
-      if (instruction->HasSideEffect()) {
+      // Skip instructions which have side effects or are a domain (which must
+      // not be CSE-ed).
+      if (instruction->HasSideEffect() ||
+          instruction->opcode() == HloOpcode::kDomain) {
         continue;
       }
 
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc
index 570ad5459a..b6e2056600 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction.cc
@@ -844,12 +844,12 @@ HloInstruction::CreateBroadcastSequence(
   return instruction;
 }
 
-void HloInstruction::set_device_sharding(int64 device) {
-  HloSharding device_sharding = HloSharding::AssignDevice(device);
+void HloInstruction::set_single_sharding(const HloSharding& sharding) {
+  CHECK(!sharding.IsTuple()) << sharding;
   if (ShapeUtil::IsTuple(shape())) {
-    set_sharding(HloSharding::Tuple(device_sharding.GetAsShapeTree(shape())));
+    set_sharding(HloSharding::Tuple(sharding.GetAsShapeTree(shape())));
   } else {
-    set_sharding(device_sharding);
+    set_sharding(sharding);
   }
 }
 
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h
index 6232d55e1b..c08806b33b 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.h
+++ b/tensorflow/compiler/xla/service/hlo_instruction.h
@@ -1119,8 +1119,11 @@ class HloInstruction {
   void set_sharding(const HloSharding& sharding) {
     sharding_ = MakeUnique<HloSharding>(sharding);
   }
+  void set_single_sharding(const HloSharding& sharding);
   // Sets a sharding that assigns the current instruction to device.
-  void set_device_sharding(int64 device);
+  void set_device_sharding(int64 device) {
+    set_single_sharding(HloSharding::AssignDevice(device));
+  }
   // Remove any sharding from this operator.
   void clear_sharding() { sharding_ = nullptr; }
   // Return true if this operator has a sharding assigned.
diff --git a/tensorflow/compiler/xla/service/hlo_sharding.cc b/tensorflow/compiler/xla/service/hlo_sharding.cc
index 58224ef870..4fbb7f69ac 100644
--- a/tensorflow/compiler/xla/service/hlo_sharding.cc
+++ b/tensorflow/compiler/xla/service/hlo_sharding.cc
@@ -141,6 +141,20 @@ StatusOr<ShapeTree<HloSharding>> HloSharding::AsShapeTree(
   }
 }
 
+StatusOr<HloSharding> HloSharding::GetTupleSharding(const Shape& shape) const {
+  if (IsTuple()) {
+    // TODO(b/109903108): An empty tuple has one leaf for ShapeTree, while it
+    // has zero leaves for ShapeUtil. This needs cleanup.
+    int64 shape_leaves =
+        ShapeUtil::IsEmptyTuple(shape) ? 1 : ShapeUtil::GetLeafCount(shape);
+    TF_RET_CHECK(shape_leaves == tuple_elements_.size())
+        << "Shape " << ShapeUtil::HumanString(shape) << " has " << shape_leaves
+        << " leaf nodes while this sharding has " << tuple_elements_.size();
+    return *this;
+  }
+  return Tuple(ShapeTree<HloSharding>(shape, *this));
+}
+
 StatusOr<int64> HloSharding::UniqueDevice() const {
   if (IsTuple()) {
     if (tuple_elements_.empty()) {
@@ -389,6 +403,24 @@ HloSharding HloSharding::GetSubSharding(const Shape& shape,
                                        : sub_shape_tree.element(ShapeIndex({}));
 }
 
+tensorflow::gtl::optional<HloSharding> HloSharding::ExtractSingleSharding()
+    const {
+  if (!IsTuple()) {
+    return *this;
+  }
+  // A tuple sharding with no elements has no common sharding; bail out before
+  // calling front() on an empty vector, which is undefined behavior.
+  if (tuple_elements_.empty()) {
+    return tensorflow::gtl::optional<HloSharding>();
+  }
+  for (int64 i = 1; i < tuple_elements_.size(); ++i) {
+    if (tuple_elements_[0] != tuple_elements_[i]) {
+      return tensorflow::gtl::optional<HloSharding>();
+    }
+  }
+  return tuple_elements_.front();
+}
+
 std::ostream& operator<<(std::ostream& out, const HloSharding& sharding) {
   out << sharding.ToString();
   return out;
diff --git a/tensorflow/compiler/xla/service/hlo_sharding.h b/tensorflow/compiler/xla/service/hlo_sharding.h
index f4a0fb626f..0a213311b4 100644
--- a/tensorflow/compiler/xla/service/hlo_sharding.h
+++ b/tensorflow/compiler/xla/service/hlo_sharding.h
@@ -72,8 +72,7 @@ class HloSharding {
   // elements for every leaf shape contained in the tuple.
   static HloSharding Tuple(const ShapeTree<HloSharding>& sub_shardings) {
     std::vector<HloSharding> flattened_list;
-    flattened_list.reserve(
-        std::distance(sub_shardings.leaf_begin(), sub_shardings.leaf_end()));
+    flattened_list.reserve(sub_shardings.leaf_count());
     for (const auto& index_to_sharding : sub_shardings.leaves()) {
       flattened_list.push_back(index_to_sharding.second);
     }
@@ -172,6 +171,18 @@ class HloSharding {
   // REQUIRES: IsTuple()
   HloSharding GetSubSharding(const Shape& shape, const ShapeIndex& index) const;
 
+  // If the current sharding is a tuple sharding, returns it unchanged (failing
+  // if its element count does not match the leaves of `shape`). Otherwise
+  // returns a tuple sharding for `shape` whose leaves all have this sharding.
+  StatusOr<HloSharding> GetTupleSharding(const Shape& shape) const;
+
+  // Extracts the sharding that is common within the current sharding.
+  // If the current sharding is not a tuple sharding, the current sharding will
+  // be returned. If it is a tuple, and all the tuple elements are common, the
+  // common element will be returned. Otherwise the optional will contain no
+  // value.
+  tensorflow::gtl::optional<HloSharding> ExtractSingleSharding() const;
+
   bool operator==(const HloSharding& other) const {
     return replicated_ == other.replicated_ && maximal_ == other.maximal_ &&
            ShapeUtil::Compatible(tile_shape_, other.tile_shape_) &&
diff --git a/tensorflow/compiler/xla/service/hlo_sharding_metadata.cc b/tensorflow/compiler/xla/service/hlo_sharding_metadata.cc
index 82cff2a4b7..7b4b071af4 100644
--- a/tensorflow/compiler/xla/service/hlo_sharding_metadata.cc
+++ b/tensorflow/compiler/xla/service/hlo_sharding_metadata.cc
@@ -31,32 +31,22 @@ struct PassThrough {
   HloInstruction* operand = nullptr;
 };
 
-void SetDeviceSharding(HloInstruction* instruction, int64 device) {
-  VLOG(4) << "  " << instruction->name() << " to device " << device;
-  instruction->set_device_sharding(device);
-}
-
-tensorflow::gtl::optional<int64> ShardingUniqueDevice(
-    const HloSharding& sharding) {
-  if (sharding.IsTileMaximal()) {
-    auto device = sharding.UniqueDevice();
-    if (device.ok()) {
-      return device.ValueOrDie();
-    }
-  }
-  return tensorflow::gtl::optional<int64>();
+void SetSingleSharding(HloInstruction* instruction,
+                       const HloSharding& sharding) {
+  VLOG(4) << "  " << instruction->name() << " to " << sharding;
+  instruction->set_single_sharding(sharding);
 }
 
 bool ShardingMatches(const HloSharding& sharding1,
                      const HloSharding& sharding2) {
-  auto device1 = ShardingUniqueDevice(sharding1);
-  if (device1) {
-    auto device2 = ShardingUniqueDevice(sharding2);
-    if (device2) {
-      return *device1 == *device2;
+  auto single_sharding1 = sharding1.ExtractSingleSharding();
+  if (single_sharding1) {
+    auto single_sharding2 = sharding2.ExtractSingleSharding();
+    if (single_sharding2) {
+      return *single_sharding1 == *single_sharding2;
     }
   }
-  // Anything which is not tile maximal with unique device, gets a full sharding
+  // Anything which is not unique across all elements, gets a full sharding
   // compare.
   return sharding1 == sharding2;
 }
@@ -119,21 +109,21 @@ Status FixupPassThroughDomainLinks(const DomainMetadata::Domain& domain,
 
 std::unique_ptr<HloSharding> CloneShardingForDomain(
     const HloSharding& sharding) {
-  auto device = ShardingUniqueDevice(sharding);
-  if (!device) {
+  auto single_sharding = sharding.ExtractSingleSharding();
+  if (!single_sharding) {
     return MakeUnique<HloSharding>(sharding);
   }
-  return MakeUnique<HloSharding>(HloSharding::AssignDevice(*device));
+  return MakeUnique<HloSharding>(*single_sharding);
 }
 
-Status ApplyDomainDeviceSharding(const DomainMetadata::Domain& domain,
-                                 int64 device) {
-  VLOG(4) << "Applying device " << device << " sharding";
+Status ApplyDomainSingleSharding(const DomainMetadata::Domain& domain,
+                                 const HloSharding& sharding) {
+  VLOG(4) << "Applying " << sharding << " sharding";
   for (HloInstruction* instruction : domain.instructions) {
     // We only change instructions without sharding, since otherwise we might
     // mess up with eventual HLO passes which has knowledge of it.
     if (!instruction->has_sharding()) {
-      SetDeviceSharding(instruction, device);
+      SetSingleSharding(instruction, sharding);
     } else {
       VLOG(4) << "  " << instruction->name() << " already has sharding "
               << instruction->sharding();
@@ -186,12 +176,15 @@ StatusOr<int64> ApplyDomainShardingPass(const DomainMetadata::Domain& domain,
       const HloSharding* tuple_sharding =
           GetOperandSharding(tuple, domain, sharding);
       if (tuple_sharding != nullptr) {
-        TF_RET_CHECK(tuple_sharding->IsTuple()) << tuple->ToString();
-        HloSharding sub_sharding = tuple_sharding->GetSubSharding(
-            tuple->shape(), {instruction->tuple_index()});
-        VLOG(4) << "  " << instruction->name() << " to sharding "
-                << sub_sharding;
-        instruction->set_sharding(sub_sharding);
+        if (tuple_sharding->IsTuple()) {
+          HloSharding sub_sharding = tuple_sharding->GetSubSharding(
+              tuple->shape(), {instruction->tuple_index()});
+          VLOG(4) << "  " << instruction->name() << " to sharding "
+                  << sub_sharding;
+          instruction->set_sharding(sub_sharding);
+        } else {
+          SetSingleSharding(instruction, *tuple_sharding);
+        }
         ++assigned;
       }
     } else if (instruction->opcode() == HloOpcode::kTuple) {
@@ -242,12 +235,12 @@ StatusOr<int64> ApplyDomainShardingPass(const DomainMetadata::Domain& domain,
 
 Status ApplyDomainSharding(const DomainMetadata::Domain& domain,
                            const HloSharding& sharding) {
-  auto device = ShardingUniqueDevice(sharding);
-  if (device) {
-    // Shortcut the simple case. We have a unique device sharding, so we call
-    // the ApplyDomainDeviceSharding() API which will apply array or tuple
-    // shaped device sharding to the domain instructions.
-    return ApplyDomainDeviceSharding(domain, *device);
+  auto single_sharding = sharding.ExtractSingleSharding();
+  if (single_sharding) {
+    // Shortcut the simple case. We have a unique sharding, so we call
+    // the ApplyDomainSingleSharding() API which will apply array or tuple
+    // shaped sharding to the domain instructions.
+    return ApplyDomainSingleSharding(domain, *single_sharding);
   }
   VLOG(1) << "Assigning non-trivial sharding " << sharding;
   for (;;) {
diff --git a/tensorflow/compiler/xla/service/tuple_simplifier.cc b/tensorflow/compiler/xla/service/tuple_simplifier.cc
index d668855084..e536c8afbf 100644
--- a/tensorflow/compiler/xla/service/tuple_simplifier.cc
+++ b/tensorflow/compiler/xla/service/tuple_simplifier.cc
@@ -69,7 +69,6 @@ StatusOr<bool> TupleSimplifier::Run(HloModule* module) {
       //       Tuple
       //
       HloInstruction* top_tuple = nullptr;
-      HloInstruction* first_gte = nullptr;
       bool can_simplify = true;
       for (int64 operand_number = 0;
            operand_number < instruction->operand_count(); ++operand_number) {
@@ -79,17 +78,10 @@ StatusOr<bool> TupleSimplifier::Run(HloModule* module) {
           can_simplify = false;
           break;
         }
-        if (first_gte == nullptr) {
-          first_gte = operand;
-        } else if (!first_gte->has_compatible_sharding(operand)) {
-          can_simplify = false;
-          break;
-        }
         if (top_tuple == nullptr) {
           top_tuple = operand->mutable_operand(0);
           if (!ShapeUtil::Compatible(top_tuple->shape(),
-                                     instruction->shape()) ||
-              !instruction->has_compatible_sharding(top_tuple)) {
+                                     instruction->shape())) {
             can_simplify = false;
             break;
           }
@@ -118,14 +110,12 @@ StatusOr<bool> TupleSimplifier::Run(HloModule* module) {
         HloInstruction* element_source =
             instruction->mutable_operand(0)->mutable_operand(
                 instruction->tuple_index());
-        if (instruction->has_compatible_sharding(element_source)) {
-          changed = true;
-          TF_RETURN_IF_ERROR(instruction->ReplaceAllUsesWith(element_source));
-          for (HloInstruction* user : element_source->users()) {
-            if (user->opcode() == HloOpcode::kTuple ||
-                user->opcode() == HloOpcode::kGetTupleElement) {
-              worklist.push(user);
-            }
+        changed = true;
+        TF_RETURN_IF_ERROR(instruction->ReplaceAllUsesWith(element_source));
+        for (HloInstruction* user : element_source->users()) {
+          if (user->opcode() == HloOpcode::kTuple ||
+              user->opcode() == HloOpcode::kGetTupleElement) {
+            worklist.push(user);
           }
         }
       }
-- 
GitLab


From 1b058574373555c8f6df056431e433f757573e81 Mon Sep 17 00:00:00 2001
From: Clayne Robison <clayne.b.robison@intel.com>
Date: Thu, 7 Jun 2018 22:03:44 -0700
Subject: [PATCH 473/610] [Intel MKL] Bootstrapping MKL test infrastructure
 (#19707)

* Bootstrapping MKL test infrastructure

* abandoning run_mkl.sh in mkl folder; using shared version
---
 .../ci_build/linux/mkl/basic-mkl-test.sh      | 29 +++++++++++++++++++
 1 file changed, 29 insertions(+)
 create mode 100755 tensorflow/tools/ci_build/linux/mkl/basic-mkl-test.sh

diff --git a/tensorflow/tools/ci_build/linux/mkl/basic-mkl-test.sh b/tensorflow/tools/ci_build/linux/mkl/basic-mkl-test.sh
new file mode 100755
index 0000000000..10a09a415a
--- /dev/null
+++ b/tensorflow/tools/ci_build/linux/mkl/basic-mkl-test.sh
@@ -0,0 +1,29 @@
+#!/usr/bin/env bash
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+#
+# Usage: basic-mkl-test.sh
+
+# Helper function to traverse directories up until given file is found.
+function upsearch () {
+  test / == "$PWD" && return || \
+      test -e "$1" && echo "$PWD" && return || \
+      cd .. && upsearch "$1"
+}
+
+# Set up WORKSPACE.
+WORKSPACE="${WORKSPACE:-$(upsearch WORKSPACE)}"
+
+BUILD_TAG=mkl-ci-test CI_BUILD_USER_FORCE_BADNAME=yes ${WORKSPACE}/tensorflow/tools/ci_build/ci_build.sh cpu tensorflow/tools/ci_build/linux/cpu/run_mkl.sh
-- 
GitLab


From 4bc01f8f63074337c846a1b60a4a2b88d420bd56 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Thu, 7 Jun 2018 22:32:00 -0700
Subject: [PATCH 474/610] Upgrade Eigen version. Remove
 eigen_fix_cuda_compilation.patch because the fixes in the patch have been
 incorporated into the Eigen opensource repository with this commit:
 https://bitbucket.org/eigen/eigen/commits/60ab50654998f1cbe2791d49fea94d0ca5ae08a8

PiperOrigin-RevId: 199749536
---
 tensorflow/workspace.bzl                     |  9 +++--
 third_party/eigen_fix_cuda_compilation.patch | 38 --------------------
 2 files changed, 4 insertions(+), 43 deletions(-)
 delete mode 100644 third_party/eigen_fix_cuda_compilation.patch

diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index b007d3f597..ce4a009974 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -107,13 +107,12 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   tf_http_archive(
       name = "eigen_archive",
       urls = [
-          "https://mirror.bazel.build/bitbucket.org/eigen/eigen/get/6913f0cf7d06.tar.gz",
-          "https://bitbucket.org/eigen/eigen/get/6913f0cf7d06.tar.gz",
+          "https://mirror.bazel.build/bitbucket.org/eigen/eigen/get/267806ed9b4f.tar.gz",
+          "https://bitbucket.org/eigen/eigen/get/267806ed9b4f.tar.gz",
       ],
-      sha256 = "791b836cacd03e20bae5bdd25f1c4a5505a0a9975ba94a61eb4e2631fbd1d53a",
-      strip_prefix = "eigen-eigen-6913f0cf7d06",
+      sha256 = "ade57357093463cab9e4e51cd5749c81483a75451b1471a3ebc73f9c1d14043b",
+      strip_prefix = "eigen-eigen-267806ed9b4f",
       build_file = clean_dep("//third_party:eigen.BUILD"),
-      patch_file = clean_dep("//third_party:eigen_fix_cuda_compilation.patch")
   )
 
   tf_http_archive(
diff --git a/third_party/eigen_fix_cuda_compilation.patch b/third_party/eigen_fix_cuda_compilation.patch
deleted file mode 100644
index b921a7c31d..0000000000
--- a/third_party/eigen_fix_cuda_compilation.patch
+++ /dev/null
@@ -1,38 +0,0 @@
-diff --git a/Eigen/src/Core/ProductEvaluators.h b/Eigen/src/Core/ProductEvaluators.h
---- a/Eigen/src/Core/ProductEvaluators.h
-+++ b/Eigen/src/Core/ProductEvaluators.h
-@@ -137,7 +137,7 @@ struct Assignment<DstXprType, Product<Lh
-   typename enable_if<(Options==DefaultProduct || Options==AliasFreeProduct)>::type>
- {
-   typedef Product<Lhs,Rhs,Options> SrcXprType;
--  static EIGEN_STRONG_INLINE
-+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-   void run(DstXprType &dst, const SrcXprType &src, const internal::assign_op<Scalar,Scalar> &)
-   {
-     Index dstRows = src.rows();
-@@ -390,7 +390,7 @@ struct generic_product_impl<Lhs,Rhs,Dens
-   typedef typename Product<Lhs,Rhs>::Scalar Scalar;
-   
-   template<typename Dst>
--  static EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
-+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void evalTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
-   {
-     // Same as: dst.noalias() = lhs.lazyProduct(rhs);
-     // but easier on the compiler side
-@@ -398,14 +398,14 @@ struct generic_product_impl<Lhs,Rhs,Dens
-   }
-   
-   template<typename Dst>
--  static EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
-+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void addTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
-   {
-     // dst.noalias() += lhs.lazyProduct(rhs);
-     call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::add_assign_op<typename Dst::Scalar,Scalar>());
-   }
-   
-   template<typename Dst>
--  static EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
-+  static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void subTo(Dst& dst, const Lhs& lhs, const Rhs& rhs)
-   {
-     // dst.noalias() -= lhs.lazyProduct(rhs);
-     call_assignment_no_alias(dst, lhs.lazyProduct(rhs), internal::sub_assign_op<typename Dst::Scalar,Scalar>());
-- 
GitLab


From 8666eff2359ccacd528dfda404a1f8ae35762542 Mon Sep 17 00:00:00 2001
From: Saurabh Saxena <srbs@google.com>
Date: Thu, 7 Jun 2018 23:42:58 -0700
Subject: [PATCH 475/610] Add checkpointing support for ReshufflingDataset.
 This allows checkpointing input pipelines with
 .shuffle(reshuffle_each_iteration=True[default]) and .list_files().

PiperOrigin-RevId: 199753836
---
 .../contrib/data/python/kernel_tests/BUILD    |   2 +
 .../dataset_serialization_test_base.py        |  12 +-
 .../kernel_tests/shuffle_dataset_op_test.py   | 100 +++++++-
 .../core/kernels/data/shuffle_dataset_op.cc   | 217 ++++++++++++------
 4 files changed, 244 insertions(+), 87 deletions(-)

diff --git a/tensorflow/contrib/data/python/kernel_tests/BUILD b/tensorflow/contrib/data/python/kernel_tests/BUILD
index fd15103870..be834d7dfd 100644
--- a/tensorflow/contrib/data/python/kernel_tests/BUILD
+++ b/tensorflow/contrib/data/python/kernel_tests/BUILD
@@ -462,6 +462,7 @@ py_test(
     tags = ["no_pip"],
     deps = [
         ":dataset_serialization_test",
+        "//tensorflow/contrib/data/python/ops:iterator_ops",
         "//tensorflow/contrib/data/python/ops:shuffle_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
@@ -469,6 +470,7 @@ py_test(
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
         "//tensorflow/python:framework_ops",
+        "//tensorflow/python:training",
         "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python/data/ops:iterator_ops",
         "//third_party/py/numpy",
diff --git a/tensorflow/contrib/data/python/kernel_tests/dataset_serialization_test_base.py b/tensorflow/contrib/data/python/kernel_tests/dataset_serialization_test_base.py
index 78ecce8f7d..393f08850b 100644
--- a/tensorflow/contrib/data/python/kernel_tests/dataset_serialization_test_base.py
+++ b/tensorflow/contrib/data/python/kernel_tests/dataset_serialization_test_base.py
@@ -467,7 +467,8 @@ class DatasetSerializationTestBase(test.TestCase):
                   ckpt_saved=False,
                   init_before_restore=False,
                   sparse_tensors=False,
-                  verify_exhausted=True):
+                  verify_exhausted=True,
+                  save_checkpoint_at_end=True):
     """Generates elements from input dataset while stopping at break points.
 
     Produces `num_outputs` outputs and saves the state of the iterator in the
@@ -490,6 +491,10 @@ class DatasetSerializationTestBase(test.TestCase):
       sparse_tensors:  Whether dataset is built from SparseTensor(s).
       verify_exhausted: Whether to verify that the iterator has been exhausted
         after producing `num_outputs` elements.
+      save_checkpoint_at_end: Whether to save a checkpoint after producing all
+        outputs. If False, checkpoints are saved at each break point but not at
+        the end. Note that checkpoints overwrite each other so there is always
+        only a single checkpoint available. Defaults to True.
 
     Returns:
       A list of `num_outputs` items.
@@ -526,8 +531,9 @@ class DatasetSerializationTestBase(test.TestCase):
           if i == len(break_points) and verify_exhausted:
             with self.assertRaises(errors.OutOfRangeError):
               sess.run(get_next_op)
-          self._save(sess, saver)
-          ckpt_saved = True
+          if save_checkpoint_at_end or i < len(break_points):
+            self._save(sess, saver)
+            ckpt_saved = True
 
     return outputs
 
diff --git a/tensorflow/contrib/data/python/kernel_tests/shuffle_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/shuffle_dataset_op_test.py
index bcc644c097..1b67a33f04 100644
--- a/tensorflow/contrib/data/python/kernel_tests/shuffle_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/shuffle_dataset_op_test.py
@@ -20,11 +20,13 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.contrib.data.python.kernel_tests import dataset_serialization_test_base
+from tensorflow.contrib.data.python.ops import iterator_ops as contrib_iterator_ops
 from tensorflow.contrib.data.python.ops import shuffle_ops
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.platform import test
+from tensorflow.python.training import saver as saver_lib
 
 
 class ShuffleDatasetSerializationTest(
@@ -50,26 +52,100 @@ class ShuffleDatasetSerializationTest(
     num_repeats = 5
     num_outputs = range_limit * num_repeats
     buffer_sizes = [1, 3, 8, 10, 25, 50]
-    reshuffle_each_iteration = False
     # pylint: disable=cell-var-from-loop
     # pylint: disable=g-long-lambda
-    for buffer_size in buffer_sizes:
-      self.run_core_tests(
-          lambda: self._build_shuffle_dataset(
+    for reshuffle_each_iteration in [True, False]:
+      for buffer_size in buffer_sizes:
+        self.run_core_tests(
+            lambda: self._build_shuffle_dataset(
+                range_limit=range_limit,
+                num_repeats=num_repeats,
+                buffer_size=buffer_size,
+                seed=seed,
+                reshuffle_each_iteration=reshuffle_each_iteration),
+            lambda: self._build_shuffle_dataset(
+                range_limit=range_limit,
+                num_repeats=num_repeats,
+                buffer_size=buffer_size,
+                seed=10,
+                reshuffle_each_iteration=reshuffle_each_iteration),
+            num_outputs)
+    # pylint: enable=cell-var-from-loop
+    # pylint: enable=g-long-lambda
+
+  def testNonDeterministicSeeding(self):
+
+    range_limit = 10
+    num_repeats = 5
+    num_outputs = range_limit * num_repeats
+    buffer_sizes = [1, 3, 8, 10, 25, 50]
+    for reshuffle_each_iteration in [True, False]:
+      for buffer_size in buffer_sizes:
+
+        def ds_fn():
+          # pylint: disable=cell-var-from-loop
+          return self._build_shuffle_dataset(
               range_limit=range_limit,
               num_repeats=num_repeats,
               buffer_size=buffer_size,
-              seed=seed,
-              reshuffle_each_iteration=reshuffle_each_iteration),
-          lambda: self._build_shuffle_dataset(
+              seed=None,  # Iterator seeds are generated non-deterministically.
+              reshuffle_each_iteration=reshuffle_each_iteration)
+          # pylint: enable=cell-var-from-loop
+
+        # We checkpoint the initial state of the Dataset so that we can restore
+        # the seeds in the next run. Since the seeding is non-deterministic
+        # the dataset gets initialized with different seeds each time.
+        expected = self.gen_outputs(
+            ds_fn,
+            break_points=[0],
+            num_outputs=num_outputs,
+            ckpt_saved=False,
+            verify_exhausted=False,
+            save_checkpoint_at_end=False)
+        actual = self.gen_outputs(
+            ds_fn,
+            break_points=self.gen_break_points(num_outputs),
+            num_outputs=num_outputs,
+            ckpt_saved=True,
+            verify_exhausted=False)
+        self.match(expected, actual)
+
+  def testMultipleIterators(self):
+    range_limit = 10
+    num_repeats = 5
+    num_outputs = range_limit * num_repeats
+    buffer_sizes = [1, 3, 8, 10, 25, 50]
+
+    for reshuffle_each_iteration in [True, False]:
+      for buffer_size in buffer_sizes:
+
+        def ds_fn():
+          # pylint: disable=cell-var-from-loop
+          return self._build_shuffle_dataset(
               range_limit=range_limit,
               num_repeats=num_repeats,
               buffer_size=buffer_size,
-              seed=10,
-              reshuffle_each_iteration=reshuffle_each_iteration),
-          num_outputs)
-    # pylint: enable=cell-var-from-loop
-    # pylint: enable=g-long-lambda
+              seed=None,  # Iterator seeds are generated non-deterministically.
+              reshuffle_each_iteration=reshuffle_each_iteration)
+          # pylint: enable=cell-var-from-loop
+
+        with ops.Graph().as_default() as g:
+          ds = ds_fn()
+          iterators = [ds.make_one_shot_iterator(), ds.make_one_shot_iterator()]
+          get_next_ops = [it.get_next() for it in iterators]
+          saveables = [
+              contrib_iterator_ops.make_saveable_from_iterator(it)
+              for it in iterators
+          ]
+          for saveable in saveables:
+            ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, saveable)
+          saver = saver_lib.Saver(allow_empty=True)
+          with self.test_session(graph=g) as sess:
+            self._save(sess, saver)
+            expected = [sess.run(get_next_ops) for _ in range(num_outputs)]
+            self._restore(saver, sess)
+            actual = [sess.run(get_next_ops) for _ in range(num_outputs)]
+            self.match(expected, actual)
 
 
 class ShuffleAndRepeatTest(
diff --git a/tensorflow/core/kernels/data/shuffle_dataset_op.cc b/tensorflow/core/kernels/data/shuffle_dataset_op.cc
index 3438199ebd..b859295fa4 100644
--- a/tensorflow/core/kernels/data/shuffle_dataset_op.cc
+++ b/tensorflow/core/kernels/data/shuffle_dataset_op.cc
@@ -61,10 +61,12 @@ class ShuffleDatasetOpBase : public UnaryDatasetOpKernel {
     }
 
    protected:
-    class Iterator : public DatasetIterator<ShuffleDatasetBase> {
+    template <class T>
+    class Iterator : public DatasetIterator<T> {
      public:
-      explicit Iterator(const Params& params, int64 seed, int64 seed2)
-          : DatasetIterator<ShuffleDatasetBase>(params),
+      explicit Iterator(const typename DatasetIterator<T>::Params& params,
+                        int64 seed, int64 seed2)
+          : DatasetIterator<T>(params),
             input_impl_(nullptr),
             seed_(seed),
             seed2_(seed2),
@@ -85,26 +87,28 @@ class ShuffleDatasetOpBase : public UnaryDatasetOpKernel {
         bool first_call = false;
         if (!input_impl_ && epoch_ == 0) {
           first_call = true;
-          TF_RETURN_IF_ERROR(
-              dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_));
+          TF_RETURN_IF_ERROR(this->dataset()->input_->MakeIterator(
+              ctx, this->prefix(), &input_impl_));
         }
-        while (input_impl_ && num_elements_ < dataset()->buffer_size_) {
+        while (input_impl_ && num_elements_ < this->dataset()->buffer_size_) {
           if (ctx->env()->NowMicros() >
               ((num_log_entries + 1) * kLogIntervalMicros) + start_micros) {
             num_log_entries++;
             LOG(INFO) << "Filling up shuffle buffer (this may take a while): "
-                      << num_elements_ << " of " << dataset()->buffer_size_;
+                      << num_elements_ << " of "
+                      << this->dataset()->buffer_size_;
           }
           std::vector<Tensor> input_element;
           bool end_of_input_sequence = false;
-          while (dataset()->count_ == -1 || epoch_ < dataset()->count_) {
+          while (this->dataset()->count_ == -1 ||
+                 epoch_ < this->dataset()->count_) {
             TF_RETURN_IF_ERROR(input_impl_->GetNext(ctx, &input_element,
                                                     &end_of_input_sequence));
             if (!end_of_input_sequence) {
               first_call = false;
               break;
             }
-            if (first_call && dataset()->count_ == -1) {
+            if (first_call && this->dataset()->count_ == -1) {
               // If the first call to GetNext() fails because the end
               // of sequence has been reached, we terminate the
               // iteration immediately. (Otherwise, this iterator
@@ -115,11 +119,11 @@ class ShuffleDatasetOpBase : public UnaryDatasetOpKernel {
             epoch_++;
             int64 n = slices_.back()->end;
             slices_.emplace_back(new Slice{n, n});
-            TF_RETURN_IF_ERROR(
-                dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_));
+            TF_RETURN_IF_ERROR(this->dataset()->input_->MakeIterator(
+                ctx, this->prefix(), &input_impl_));
           }
           if (!end_of_input_sequence) {
-            buffer_[slices_.back()->end % dataset()->buffer_size_] =
+            buffer_[slices_.back()->end % this->dataset()->buffer_size_] =
                 std::move(input_element);
             num_elements_++;
             slices_.back()->end++;
@@ -144,10 +148,11 @@ class ShuffleDatasetOpBase : public UnaryDatasetOpKernel {
           int64 offset =
               Random() % (slices_.front()->end - slices_.front()->start);
           int64 index =
-              (slices_.front()->start + offset) % dataset()->buffer_size_;
+              (slices_.front()->start + offset) % this->dataset()->buffer_size_;
           *out_tensors = std::move(buffer_[index]);
-          std::swap(buffer_[index],
-                    buffer_[slices_.front()->start % dataset()->buffer_size_]);
+          std::swap(
+              buffer_[index],
+              buffer_[slices_.front()->start % this->dataset()->buffer_size_]);
           slices_.front()->start++;
           num_elements_--;
         } else {
@@ -160,40 +165,44 @@ class ShuffleDatasetOpBase : public UnaryDatasetOpKernel {
      protected:
       Status SaveInternal(IteratorStateWriter* writer) override {
         mutex_lock l(mu_);
-
         // Save state needed to restore the random number generators.
-        TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("num_random_samples"),
-                                               num_random_samples_));
+        TF_RETURN_IF_ERROR(writer->WriteScalar(
+            this->full_name("num_random_samples"), num_random_samples_));
+        TF_RETURN_IF_ERROR(writer->WriteScalar(this->full_name("seed"), seed_));
+        TF_RETURN_IF_ERROR(
+            writer->WriteScalar(this->full_name("seed2"), seed2_));
 
         // Save input iterator if it hasn't been exhausted else write
         // "end_of_input_sequence".
         if (!input_impl_) {
-          TF_RETURN_IF_ERROR(
-              writer->WriteScalar(full_name("end_of_input_sequence"), ""));
+          TF_RETURN_IF_ERROR(writer->WriteScalar(
+              this->full_name("end_of_input_sequence"), ""));
         } else {
-          TF_RETURN_IF_ERROR(SaveParent(writer, input_impl_));
+          TF_RETURN_IF_ERROR(this->SaveParent(writer, input_impl_));
         }
 
         // Save the epoch counter, buffer, and buffer slices.
-        TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("epoch"), epoch_));
-        TF_RETURN_IF_ERROR(
-            writer->WriteScalar(full_name("num_elements"), num_elements_));
         TF_RETURN_IF_ERROR(
-            writer->WriteScalar(full_name("slices_size"), slices_.size()));
+            writer->WriteScalar(this->full_name("epoch"), epoch_));
+        TF_RETURN_IF_ERROR(writer->WriteScalar(this->full_name("num_elements"),
+                                               num_elements_));
+        TF_RETURN_IF_ERROR(writer->WriteScalar(this->full_name("slices_size"),
+                                               slices_.size()));
         for (size_t i = 0; i < slices_.size(); ++i) {
           TF_RETURN_IF_ERROR(writer->WriteScalar(
-              full_name(strings::StrCat("slices_start_", i)),
+              this->full_name(strings::StrCat("slices_start_", i)),
               slices_[i]->start));
           TF_RETURN_IF_ERROR(writer->WriteScalar(
-              full_name(strings::StrCat("slices_end_", i)), slices_[i]->end));
+              this->full_name(strings::StrCat("slices_end_", i)),
+              slices_[i]->end));
           for (size_t j = slices_[i]->start; j < slices_[i]->end; ++j) {
-            size_t index = j % dataset()->buffer_size_;
+            size_t index = j % this->dataset()->buffer_size_;
             TF_RETURN_IF_ERROR(writer->WriteScalar(
-                full_name(strings::StrCat("buffer_", index, "_size")),
+                this->full_name(strings::StrCat("buffer_", index, "_size")),
                 buffer_[index].size()));
             for (size_t k = 0; k < buffer_[index].size(); ++k) {
               TF_RETURN_IF_ERROR(writer->WriteTensor(
-                  full_name(strings::StrCat("buffer_", index, "_", k)),
+                  this->full_name(strings::StrCat("buffer_", index, "_", k)),
                   buffer_[index][k]));
             }
           }
@@ -205,51 +214,54 @@ class ShuffleDatasetOpBase : public UnaryDatasetOpKernel {
       Status RestoreInternal(IteratorContext* ctx,
                              IteratorStateReader* reader) override {
         mutex_lock l(mu_);
-
         // Restore the random number generators.
-        TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("num_random_samples"),
-                                              &num_random_samples_));
+        TF_RETURN_IF_ERROR(reader->ReadScalar(
+            this->full_name("num_random_samples"), &num_random_samples_));
+        TF_RETURN_IF_ERROR(reader->ReadScalar(this->full_name("seed"), &seed_));
+        TF_RETURN_IF_ERROR(
+            reader->ReadScalar(this->full_name("seed2"), &seed2_));
         ResetRngs();
 
         // Restore the input iterator if it wasn't already exhausted.
-        if (!reader->Contains(full_name("end_of_input_sequence"))) {
-          TF_RETURN_IF_ERROR(
-              dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_));
-          TF_RETURN_IF_ERROR(RestoreParent(ctx, reader, input_impl_));
+        if (!reader->Contains(this->full_name("end_of_input_sequence"))) {
+          TF_RETURN_IF_ERROR(this->dataset()->input_->MakeIterator(
+              ctx, this->prefix(), &input_impl_));
+          TF_RETURN_IF_ERROR(this->RestoreParent(ctx, reader, input_impl_));
         } else {
           input_impl_.reset();
         }
 
         // Restore the epoch counter, buffer, and buffer slices.
-        TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("epoch"), &epoch_));
         TF_RETURN_IF_ERROR(
-            reader->ReadScalar(full_name("num_elements"), &num_elements_));
+            reader->ReadScalar(this->full_name("epoch"), &epoch_));
+        TF_RETURN_IF_ERROR(reader->ReadScalar(this->full_name("num_elements"),
+                                              &num_elements_));
         size_t slices_size;
         {
           int64 temp;
           TF_RETURN_IF_ERROR(
-              reader->ReadScalar(full_name("slices_size"), &temp));
+              reader->ReadScalar(this->full_name("slices_size"), &temp));
           slices_size = static_cast<size_t>(temp);
         }
-        buffer_.reset(new std::vector<Tensor>[dataset()->buffer_size_]);
+        buffer_.reset(new std::vector<Tensor>[this->dataset()->buffer_size_]);
         for (size_t i = 0; i < slices_size; ++i) {
           int64 start;
           TF_RETURN_IF_ERROR(reader->ReadScalar(
-              full_name(strings::StrCat("slices_start_", i)), &start));
+              this->full_name(strings::StrCat("slices_start_", i)), &start));
           int64 end;
           TF_RETURN_IF_ERROR(reader->ReadScalar(
-              full_name(strings::StrCat("slices_end_", i)), &end));
+              this->full_name(strings::StrCat("slices_end_", i)), &end));
           slices_.emplace_back(new Slice{start, end});
           for (size_t j = start; j < end; ++j) {
-            size_t index = j % dataset()->buffer_size_;
+            size_t index = j % this->dataset()->buffer_size_;
             int64 list_size;
             TF_RETURN_IF_ERROR(reader->ReadScalar(
-                full_name(strings::StrCat("buffer_", index, "_size")),
+                this->full_name(strings::StrCat("buffer_", index, "_size")),
                 &list_size));
             buffer_[index] = std::vector<Tensor>(list_size);
             for (int k = 0; k < list_size; ++k) {
               TF_RETURN_IF_ERROR(reader->ReadTensor(
-                  full_name(strings::StrCat("buffer_", index, "_", k)),
+                  this->full_name(strings::StrCat("buffer_", index, "_", k)),
                   &buffer_[index][k]));
             }
           }
@@ -289,8 +301,8 @@ class ShuffleDatasetOpBase : public UnaryDatasetOpKernel {
       mutex mu_;
       std::unique_ptr<std::vector<Tensor>[]> buffer_ GUARDED_BY(mu_);
       std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
-      const int64 seed_ GUARDED_BY(mu_);
-      const int64 seed2_ GUARDED_BY(mu_);
+      int64 seed_ GUARDED_BY(mu_);
+      int64 seed2_ GUARDED_BY(mu_);
       int64 epoch_ GUARDED_BY(mu_);
       int64 num_elements_ GUARDED_BY(mu_);
       std::deque<std::unique_ptr<Slice>> slices_ GUARDED_BY(mu_);
@@ -360,6 +372,7 @@ class ShuffleDatasetOp : public ShuffleDatasetOpBase {
           generator_(&parent_generator_) {}
 
     string DebugString() const override {
+      mutex_lock l(mu_);
       return strings::StrCat("ShuffleDatasetOp(", buffer_size_, ", ", seed_,
                              ", ", seed2_, ")::ReshufflingDataset");
     }
@@ -370,38 +383,96 @@ class ShuffleDatasetOp : public ShuffleDatasetOpBase {
       int64 iterator_seed2;
       {
         mutex_lock l(mu_);
-        iterator_seed = generator_();
-        iterator_seed2 = generator_();
+        iterator_seed = Random();
+        iterator_seed2 = Random();
       }
-      return std::unique_ptr<IteratorBase>(new ShuffleDatasetBase::Iterator(
-          {this, strings::StrCat(prefix, "::Shuffle")}, iterator_seed,
-          iterator_seed2));
+      return std::unique_ptr<IteratorBase>(
+          new Iterator({this, strings::StrCat(prefix, "::Shuffle")},
+                       iterator_seed, iterator_seed2));
     }
 
    protected:
+    class Iterator : public ShuffleDatasetBase::Iterator<ReshufflingDataset> {
+     public:
+      explicit Iterator(const Params& params, int64 seed, int64 seed2)
+          : ShuffleDatasetBase::Iterator<ReshufflingDataset>(params, seed,
+                                                             seed2) {}
+
+     protected:
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        mutex_lock l(dataset()->mu_);
+
+        // Save RNG state of Dataset.
+        TF_RETURN_IF_ERROR(
+            writer->WriteScalar(full_name("ds_num_random_samples"),
+                                dataset()->num_random_samples_));
+
+        // Save the Iterator.
+        return ShuffleDatasetBase::Iterator<ReshufflingDataset>::SaveInternal(
+            writer);
+      }
+
+      Status RestoreInternal(IteratorContext* ctx,
+                             IteratorStateReader* reader) override {
+        mutex_lock l(dataset()->mu_);
+
+        // Restore RNG state of Dataset.
+        TF_RETURN_IF_ERROR(
+            reader->ReadScalar(full_name("ds_num_random_samples"),
+                               &dataset()->num_random_samples_));
+        dataset()->ResetRngs();
+
+        // Restore the Iterator.
+        return ShuffleDatasetBase::Iterator<
+            ReshufflingDataset>::RestoreInternal(ctx, reader);
+      }
+    };
+
     Status AsGraphDefInternal(OpKernelContext* ctx, DatasetGraphDefBuilder* b,
                               Node** output) const override {
-      return errors::Unimplemented(
-          "Checkpointing ShufflingDataset with reshuffle_each_iteration=true "
-          "is not supported.\n"
-          "If you have a ds.shuffle(buffer_size).repeat(count) in your input "
-          "pipeline, replace it with "
-          "ds.apply(tf.contrib.data.shuffle_and_repeat(buffer_size, count)).\n"
-          "If you iterate over your dataset once, change shuffle(buffer_size) "
-          "to shuffle(buffer_size, reshuffle_each_iteration=False).\n"
-          "If you are using Dataset.list_files(pattern), change it to "
-          "Dataset.list_files(pattern, shuffle=False) and manually shuffle "
-          "the list of files using shuffle_and_repeat as above or using "
-          "ds.shuffle with reshuffle_each_iteration=False.");
+      mutex_lock l(mu_);
+      Node* input_graph_node = nullptr;
+      TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_graph_node));
+      Node* buffer_size = nullptr;
+      Node* seed = nullptr;
+      Node* seed2 = nullptr;
+      AttrValue reshuffle_each_iteration;
+
+      TF_RETURN_IF_ERROR(b->AddScalar(buffer_size_, &buffer_size));
+      TF_RETURN_IF_ERROR(b->AddScalar(seed_, &seed));
+      TF_RETURN_IF_ERROR(b->AddScalar(seed2_, &seed2));
+      b->BuildAttrValue(true, &reshuffle_each_iteration);
+      TF_RETURN_IF_ERROR(b->AddDataset(
+          this, {input_graph_node, buffer_size, seed, seed2},  // Inputs
+          {std::make_pair("reshuffle_each_iteration",
+                          reshuffle_each_iteration)},  // Attrs
+          output));
+      return Status::OK();
     }
 
    private:
-    const int64 seed_;
-    const int64 seed2_;
+    random::SingleSampleAdapter<random::PhiloxRandom>::ResultType Random() const
+        EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+      num_random_samples_++;
+      auto out = generator_();
+      return out;
+    }
+
+    void ResetRngs() const EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+      // Reset the generators based on the current seeds.
+      parent_generator_ = random::PhiloxRandom(seed_, seed2_);
+      generator_ =
+          random::SingleSampleAdapter<random::PhiloxRandom>(&parent_generator_);
+      generator_.Skip(num_random_samples_);
+    }
+
+    mutable int64 seed_ GUARDED_BY(mu_);
+    mutable int64 seed2_ GUARDED_BY(mu_);
     mutable mutex mu_;
     mutable random::PhiloxRandom parent_generator_ GUARDED_BY(mu_);
     mutable random::SingleSampleAdapter<random::PhiloxRandom> generator_
         GUARDED_BY(mu_);
+    mutable int64 num_random_samples_ GUARDED_BY(mu_) = 0;
   };
 
   // A dataset that uses the same fixed seed for all iterators created from it.
@@ -421,8 +492,9 @@ class ShuffleDatasetOp : public ShuffleDatasetOpBase {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(new ShuffleDatasetBase::Iterator(
-          {this, strings::StrCat(prefix, "::Shuffle")}, seed_, seed2_));
+      return std::unique_ptr<IteratorBase>(
+          new ShuffleDatasetBase::Iterator<ShuffleDatasetBase>(
+              {this, strings::StrCat(prefix, "::Shuffle")}, seed_, seed2_));
     }
 
    protected:
@@ -504,9 +576,10 @@ class ShuffleAndRepeatDatasetOp : public ShuffleDatasetOpBase {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(new ShuffleDatasetBase::Iterator(
-          {this, strings::StrCat(prefix, "::ShuffleAndRepeat")}, seed_,
-          seed2_));
+      return std::unique_ptr<IteratorBase>(
+          new ShuffleDatasetBase::Iterator<ShuffleDatasetBase>(
+              {this, strings::StrCat(prefix, "::ShuffleAndRepeat")}, seed_,
+              seed2_));
     }
 
    protected:
-- 
GitLab


From f6d62598848d1804cf6c834b51c2a9f7c049ba59 Mon Sep 17 00:00:00 2001
From: Thomas Joerg <tjoerg@google.com>
Date: Fri, 8 Jun 2018 01:53:08 -0700
Subject: [PATCH 476/610] [XLA] Base class for fusing sibling instructions with
 multiple outputs.

PiperOrigin-RevId: 199765487
---
 tensorflow/compiler/xla/service/BUILD         |  13 +
 .../xla/service/multi_output_fusion.cc        | 342 ++++++++++++++++++
 .../xla/service/multi_output_fusion.h         | 160 ++++++++
 3 files changed, 515 insertions(+)
 create mode 100644 tensorflow/compiler/xla/service/multi_output_fusion.cc
 create mode 100644 tensorflow/compiler/xla/service/multi_output_fusion.h

diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index 29718e057b..6f34703fec 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -1148,6 +1148,19 @@ tf_cc_test(
     ],
 )
 
+cc_library(
+    name = "multi_output_fusion",
+    srcs = ["multi_output_fusion.cc"],
+    hdrs = ["multi_output_fusion.h"],
+    deps = [
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xla/service:hlo_pass",
+        "//tensorflow/core:lib",
+    ],
+)
+
 cc_library(
     name = "hlo_creation_utils",
     srcs = ["hlo_creation_utils.cc"],
diff --git a/tensorflow/compiler/xla/service/multi_output_fusion.cc b/tensorflow/compiler/xla/service/multi_output_fusion.cc
new file mode 100644
index 0000000000..29f787b86b
--- /dev/null
+++ b/tensorflow/compiler/xla/service/multi_output_fusion.cc
@@ -0,0 +1,342 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/multi_output_fusion.h"
+
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/core/lib/gtl/flatmap.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace xla {
+
+StatusOr<bool> MultiOutputFusion::Run(HloModule* module) {
+  bool changed = false;
+
+  for (auto* computation : module->MakeNonfusionComputations()) {
+    computation_ = computation;
+    reachability_ = computation_->ComputeReachability();
+    candidates_.clear();
+    candidates_index_.clear();
+    all_fusion_candidates_.clear();
+
+    int64 index = 0;
+    for (auto it : computation_->MakeInstructionPostOrder()) {
+      candidates_.emplace_back(it);
+      InsertOrDie(&candidates_index_, it, index++);
+    }
+
+    // Create the initial candidate list for each Node.
+    for (auto& node : candidates_) {
+      HloInstruction* instruction = node.hlo;
+      int64 instruction_id = get_candidate_id(instruction);
+      FusionCandidate& instr_node = candidates_[instruction_id];
+      if (!IsFusible(instruction)) {
+        continue;
+      }
+      all_fusion_candidates_.push_back(instruction);
+
+      std::vector<HloInstruction*> candidates;
+      tensorflow::gtl::FlatSet<HloInstruction*> candidates_set;
+      VLOG(10) << "Looking at instruction: " << instruction->name();
+      for (auto operand : instruction->operands()) {
+        // Filter out the non-interesting instructions -- they
+        // will not generate the savings.
+        if (!IsProfitableOperand(operand)) {
+          VLOG(10) << "Operand not profitable: " << operand->name();
+          continue;
+        }
+        VLOG(10) << "Operand profitable: " << operand->name();
+        for (auto user : operand->users()) {
+          VLOG(10) << "User: " << user->name();
+          if (user == instruction || !IsFusible(user)) {
+            VLOG(10) << "User is not fusible, or is the instruction itself: "
+                     << user->name();
+            continue;
+          }
+          int64 user_id = get_candidate_id(user);
+          if (is_connected(instruction, user)) {
+            VLOG(10) << "User is connected: " << user->name();
+            continue;
+          }
+          if (instruction_id < user_id &&
+              user->opcode() == HloOpcode::kFusion) {
+            VLOG(10) << "User ID for user: " << user->name() << " is "
+                     << user_id << " which is higher than " << instruction_id;
+            continue;
+          }
+          if (!LegalToFuse(instruction, user)) {
+            VLOG(10) << "User not legal to fuse: " << user->name();
+            continue;
+          }
+          if (candidates_set.insert(user).second) {
+            VLOG(10) << "User added to candidate list: " << user->name();
+            candidates.push_back(user);
+          }
+        }
+      }
+
+      // Iterate over candidates rather than candidates_set to avoid
+      // nondeterminism.
+      for (auto candidate : candidates) {
+        int64 profit = GetProfit(instruction, candidate);
+        if (profit > 0) {
+          FusionCandidate& candidate_node =
+              candidates_[get_candidate_id(candidate)];
+          instr_node.fusibles.emplace_back(candidate, profit);
+          candidate_node.fusibles.emplace_back(instruction, profit);
+          worklist_.emplace(instruction, candidate, profit);
+        }
+      }
+    }
+    if (Perform()) {
+      changed = true;
+    }
+  }
+  return changed;
+}
+
+HloInstruction* MultiOutputFusion::Fuse(HloInstruction* instr1,
+                                        HloInstruction* instr2) {
+  HloInstruction* remaining = instr1;
+  HloInstruction* fused = instr2;
+  // Make sure that if only one of the instructions is a fusion, or if only one
+  // of the instructions is a multi-output fusion, it's what will be fused into.
+  //
+  // An invariant is that no bitcast nodes will show up in the middle of a
+  // fusion node. This invariant must hold in order for us to lower it. Given
+  // that, we require that during multi-output fusion, a fusion node ending with
+  // bitcast preserves its structure as a nested fusion instead of being
+  // merged and flattened.
+  if (fused->opcode() == HloOpcode::kFusion &&
+      fused->fused_expression_root()->opcode() != HloOpcode::kBitcast) {
+    std::swap(remaining, fused);
+  }
+  if (fused->IsMultiOutputFusion()) {
+    std::swap(remaining, fused);
+  }
+
+  if (fused->opcode() == HloOpcode::kFusion &&
+      fused->fused_expression_root()->opcode() != HloOpcode::kBitcast) {
+    remaining->MergeFusionInstructionIntoMultiOutput(fused);
+  } else {
+    if (remaining->opcode() == HloOpcode::kFusion &&
+        remaining->fused_expression_root()->opcode() == HloOpcode::kBitcast) {
+      auto parent_computation = remaining->parent();
+      // Create a nested fusion node.
+      auto remaining_nested_fused =
+          parent_computation->AddInstruction(HloInstruction::CreateFusion(
+              remaining->shape(), HloInstruction::FusionKind::kLoop,
+              remaining));
+      TF_CHECK_OK(parent_computation->ReplaceInstruction(
+          remaining, remaining_nested_fused));
+      remaining = remaining_nested_fused;
+    }
+    remaining->FuseInstructionIntoMultiOutput(fused);
+  }
+
+  return remaining;
+}
+
+void MultiOutputFusion::Update(HloInstruction* instr1, HloInstruction* instr2) {
+  HloInstruction* fusion = instr1;
+  HloInstruction* fused = instr2;
+  if (is_fused(instr1)) {
+    fusion = instr2;
+    fused = instr1;
+  }
+
+  // Insert the newly created instruction (if any), to candidates_.
+  for (auto use : fusion->users()) {
+    if (candidates_index_.find(use) == candidates_index_.end()) {
+      int64 index = candidates_.size();
+      candidates_.emplace_back(use);
+      InsertOrDie(&candidates_index_, use, index++);
+    }
+  }
+  FusionCandidate& fusion_node = candidates_[get_candidate_id(fusion)];
+  FusionCandidate& fused_node = candidates_[get_candidate_id(fused)];
+
+  // Update the reachability graph.
+  UpdateReachability(fusion, fused, all_fusion_candidates_,
+                     [this](HloInstruction* instr) { return is_fused(instr); });
+
+  // Update the fusible list for fusion. Variable new_fusibles keeps
+  // track of the new or changed entries.
+  std::vector<std::pair<HloInstruction*, int64>> new_fusibles;
+  tensorflow::gtl::FlatSet<HloInstruction*> in_list;
+  auto it = fusion_node.fusibles.begin();
+  while (it != fusion_node.fusibles.end()) {
+    HloInstruction* instr = it->first;
+    if (is_fused(instr) || is_connected(fusion, instr)) {
+      it = fusion_node.fusibles.erase(it);
+      continue;
+    }
+    in_list.insert(instr);
+    int64 profit = GetProfit(instr, fusion);
+    if (profit > it->second) {
+      it->second = profit;
+      new_fusibles.emplace_back(instr, profit);
+    }
+    ++it;
+  }
+
+  // Fused_node has been fused into fusion_node. Take the fusion candidates
+  // (fusibles) from fused_nodes and add them to the fusion_node's. Filter
+  // out those fusibles that are no longer valid (or already in the list).
+  for (const auto& it : fused_node.fusibles) {
+    HloInstruction* instr = it.first;
+    if (instr == fusion || is_fused(instr) || is_connected(fusion, instr)) {
+      continue;
+    }
+    if (in_list.count(instr) > 0) {
+      continue;
+    }
+    int64 profit = GetProfit(instr, fusion);
+    fusion_node.fusibles.emplace_back(instr, profit);
+    new_fusibles.emplace_back(instr, profit);
+  }
+  fused_node.fusibles.clear();
+
+  // Update the worklist_.
+  for (auto it : new_fusibles) {
+    worklist_.emplace(fusion, it.first, it.second);
+  }
+}
+
+bool MultiOutputFusion::LegalToFuse(HloInstruction* instr1,
+                                    HloInstruction* instr2) {
+  if (instr1 == instr2) {
+    return false;
+  }
+  if (instr1->opcode() != HloOpcode::kFusion) {
+    return false;
+  }
+
+  // Fusing nodes with 0 user makes no sense and the rest of the implementation
+  // doesn't support it either.
+  if (instr1->user_count() == 0 || instr2->user_count() == 0) {
+    return false;
+  }
+
+  // Check whether any user of a multi-output fusion is not a get-tuple-element.
+  // If this is the case, we bail out because the transformation assumes
+  // the users are get-tuple-element.
+  auto multioutput_user_is_not_gte = [](HloInstruction* instr) {
+    if (!instr->IsMultiOutputFusion()) {
+      return false;
+    }
+    for (auto user : instr->users()) {
+      if (user->opcode() != HloOpcode::kGetTupleElement) {
+        return true;
+      }
+    }
+    return false;
+  };
+  if (multioutput_user_is_not_gte(instr1) ||
+      multioutput_user_is_not_gte(instr2)) {
+    return false;
+  }
+
+  if (is_connected(instr1, instr2)) {
+    return false;
+  }
+  if (!ShapesCompatibleForFusion(instr1, instr2)) {
+    return false;
+  }
+
+  return true;
+}
+
+void MultiOutputFusion::UpdateReachability(
+    HloInstruction* instr1, HloInstruction* instr2,
+    tensorflow::gtl::ArraySlice<HloInstruction*> instrs_to_update,
+    const std::function<bool(HloInstruction*)>& skip) {
+  for (auto instr : instrs_to_update) {
+    if (skip != nullptr && skip(instr)) {
+      continue;
+    }
+    if (reachability_->IsReachable(instr2, instr) &&
+        reachability_->IsReachable(instr1, instr)) {
+      // If a candidate was already reachable by both, no update needed.
+      continue;
+    }
+    if (reachability_->IsReachable(instr2, instr)) {
+      reachability_->FastSetReachabilityToUnion({instr, instr1}, instr);
+    }
+    if (reachability_->IsReachable(instr1, instr)) {
+      reachability_->FastSetReachabilityToUnion({instr, instr2}, instr);
+    }
+  }
+}
+
+bool MultiOutputFusion::Perform() {
+  bool changed = false;
+  // Pick the top candidate from queue and try to merge.
+  while (!worklist_.empty()) {
+    if (fuel_ <= 0) {
+      VLOG(2) << "No fusing: run out of fuel.";
+      break;
+    }
+    ToBeFused candidate = worklist_.top();
+    worklist_.pop();
+
+    HloInstruction* instr1 = candidate.instr1;
+    HloInstruction* instr2 = candidate.instr2;
+
+    if (is_fused(instr1) || is_fused(instr2)) {
+      continue;
+    }
+
+    VLOG(1) << "Considering candidate profit_score=" << candidate.score
+            << "\n\t\tinstr1 = " << instr1->ToString()
+            << "\n\t\tinstr2 = " << instr2->ToString();
+
+    if (LegalToFuse(instr1, instr2)) {
+      VLOG(1) << "Fuse!";
+      VLOG(2) << "Before multi_output_fusion:";
+      VLOG(2) << "instr1: " << instr1->ToString();
+      VLOG(2) << "\n"
+              << instr1->fused_instructions_computation()->ToString(
+                     HloPrintOptions().set_indent_amount(1));
+      VLOG(2) << "instr2: " << instr2->ToString();
+      if (instr2->opcode() == HloOpcode::kFusion) {
+        VLOG(2) << "\n"
+                << instr2->fused_instructions_computation()->ToString(
+                       HloPrintOptions().set_indent_amount(1));
+      }
+      HloInstruction* ret = Fuse(instr1, instr2);
+      set_is_fused(ret == instr1 ? instr2 : instr1);
+      Update(instr1, instr2);
+      changed = true;
+      VLOG(2) << "After fusion, \t this: " << ret->name() << "\n"
+              << ret->fused_instructions_computation()->ToString(
+                     HloPrintOptions().set_indent_amount(1));
+      // Each successful fusion consumes one unit of optimization fuel.
+      --fuel_;
+    }
+  }
+  if (DoProducerConsumerMultiOutputFusion(computation_)) {
+    changed = true;
+  }
+  return changed;
+}
+
+bool MultiOutputFusion::DoProducerConsumerMultiOutputFusion(
+    HloComputation* /*computation*/) {
+  return false;
+}
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/multi_output_fusion.h b/tensorflow/compiler/xla/service/multi_output_fusion.h
new file mode 100644
index 0000000000..cfdf83cfe8
--- /dev/null
+++ b/tensorflow/compiler/xla/service/multi_output_fusion.h
@@ -0,0 +1,160 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_MULTI_OUTPUT_FUSION_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_MULTI_OUTPUT_FUSION_H_
+
+#include <queue>
+#include <vector>
+
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
+
+namespace xla {
+
+// This class implements the fusing of sibling fusion instructions that share
+// common operands.
+// It constructs the following associated data structures.
+//  (1) candidates_: stores the instruction and the set of instructions it can
+//      fuse to.
+//  (2) candidates_index_: maps instruction to id.
+//  (3) reachability_: reachability map in this computation.
+//  (4) all_fusion_candidates_: the vector of candidate instructions.
+//  (5) worklist_: a priority queue that contains pairs of instructions to be
+//      fused and their fusion profit scores.
+//
+//  Function Perform() applies the optimization. It picks up the most profitable
+//  pair in the worklist_, check if it's legal to fuse and fuse the pair.
+//  After fusion, it updates the associated structure such as reachability_,
+//  candidates_ and worklist_.
+//  Note that the reachability map is updated based on the original computation.
+//  This works because the reachability is monotonically increasing with
+//  instruction fusion.
+class MultiOutputFusion : public HloPassInterface {
+ public:
+  MultiOutputFusion(int64 fuel) : fuel_(fuel) {}
+
+  tensorflow::StringPiece name() const override {
+    return "multi_output_fusion";
+  }
+
+  // Run multi-output fusion on the given module. Returns whether the module
+  // was changed.
+  StatusOr<bool> Run(HloModule* module) override;
+
+ protected:
+  // Main entry for the optimization. Returns true if the optimization happens.
+  bool Perform();
+
+  // Test if instr1 and instr2 have the compatible shapes that can be legally
+  // fused.
+  virtual bool ShapesCompatibleForFusion(HloInstruction* instr1,
+                                         HloInstruction* instr2) = 0;
+
+  // Whether the instruction is a candidate for fusion.
+  virtual bool IsFusible(HloInstruction* instr) = 0;
+
+  // This function estimates the savings by merging instr1 and instr2 into one
+  // multi-output fusion instruction.
+  virtual int64 GetProfit(HloInstruction* instr1, HloInstruction* instr2) = 0;
+
+  // Whether fusing the instruction can reduce cost.
+  virtual bool IsProfitableOperand(HloInstruction* instr) = 0;
+
+  // Test if it's legal to fuse instr1 and instr2 into one fusion instruction.
+  virtual bool LegalToFuse(HloInstruction* instr1, HloInstruction* instr2);
+
+  // Update the reachability map after fusing instr1 and instr2.
+  void UpdateReachability(
+      HloInstruction* instr1, HloInstruction* instr2,
+      tensorflow::gtl::ArraySlice<HloInstruction*> instrs_to_update,
+      const std::function<bool(HloInstruction*)>& skip = nullptr);
+
+  // Hook for multi-output fusion along producer-consumer edges.
+  // Returns whether any instructions were fused.
+  //
+  // TODO(b/80420762): Perform producer-consumer multi-output fusion in
+  // InstructionFusion instead.
+  virtual bool DoProducerConsumerMultiOutputFusion(HloComputation* computation);
+
+ private:
+  // Fuse HloInstruction instr1 and instr2 and return the fused instruction.
+  // The other instruction is removed from its parent computation.
+  HloInstruction* Fuse(HloInstruction* instr1, HloInstruction* instr2);
+
+  // Update the internal data structures after instr1 and instr2 are fused into
+  // one fusion instruction.
+  void Update(HloInstruction* instr1, HloInstruction* instr2);
+
+  // Optimization fuel is a compiler debugging technique that makes an
+  // optimization pass stop what it is doing after having made N changes to the
+  // program, where N is the fuel. By varying N, this can be used to find the
+  // first single change that makes a test fail.
+  int64 fuel_;
+
+  // Computation for the pass.
+  HloComputation* computation_;
+
+  // An internal data structure for each instruction in current computation.
+  // When an instruction is removed, member 'hlo' is set to nullptr.
+  struct FusionCandidate {
+    HloInstruction* hlo;
+    std::list<std::pair<HloInstruction*, int64>> fusibles;
+    explicit FusionCandidate(HloInstruction* hlo) : hlo(hlo) {}
+  };
+  std::vector<FusionCandidate> candidates_;
+
+  // Maps an instruction to its index in candidates_.
+  tensorflow::gtl::FlatMap<HloInstruction*, int> candidates_index_;
+
+  // The reachability map of current computation.
+  std::unique_ptr<HloReachabilityMap> reachability_;
+
+  // This stores all the candidate instructions in current computation.
+  std::vector<HloInstruction*> all_fusion_candidates_;
+
+  // The pair of candidates to be fused and the profit score.
+  struct ToBeFused {
+    HloInstruction* instr1;
+    HloInstruction* instr2;
+    int64 score;
+    ToBeFused(HloInstruction* instr1, HloInstruction* instr2, int64 score)
+        : instr1(instr1), instr2(instr2), score(score) {}
+    bool operator<(const ToBeFused& rhs) const { return score < rhs.score; }
+  };
+  std::priority_queue<ToBeFused> worklist_;
+
+  int64 get_candidate_id(HloInstruction* instr) {
+    return FindOrDie(candidates_index_, instr);
+  }
+
+  bool is_fused(HloInstruction* instr) {
+    return candidates_[get_candidate_id(instr)].hlo == nullptr;
+  }
+
+  void set_is_fused(HloInstruction* instr) {
+    candidates_[get_candidate_id(instr)].hlo = nullptr;
+  }
+
+  bool is_connected(HloInstruction* instr1, HloInstruction* instr2) {
+    return reachability_->IsConnected(instr1, instr2);
+  }
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_MULTI_OUTPUT_FUSION_H_
-- 
GitLab


From c2493ed5aa9eaf375d88331c7cdb70e428614dc8 Mon Sep 17 00:00:00 2001
From: Akshay Agrawal <akshayka@google.com>
Date: Fri, 8 Jun 2018 02:22:02 -0700
Subject: [PATCH 477/610] Make tfe.py_func once differentiable.

With this change, it is now possible to embed differentiable eager code --- running on either CPU or GPU --- in graphs. Higher-order derivatives are not yet supported.

PiperOrigin-RevId: 199768301
---
 .../python/kernel_tests/py_func_test.py       |  81 ++++++++++-
 tensorflow/python/ops/script_ops.py           | 128 +++++++++++++-----
 2 files changed, 175 insertions(+), 34 deletions(-)

diff --git a/tensorflow/python/kernel_tests/py_func_test.py b/tensorflow/python/kernel_tests/py_func_test.py
index dc7399f040..824610323c 100644
--- a/tensorflow/python/kernel_tests/py_func_test.py
+++ b/tensorflow/python/kernel_tests/py_func_test.py
@@ -26,6 +26,7 @@ from six.moves import queue
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
 from tensorflow.python.client import session as session_lib
+from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
 from tensorflow.python.eager import function
 from tensorflow.python.framework import constant_op
@@ -34,6 +35,7 @@ from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import script_ops
@@ -438,7 +440,7 @@ class PyFuncTest(test.TestCase):
         c = constant_op.constant([1.], dtypes.float32)
         _ = script_ops.py_func(lambda x: x + 1, [c], [dtypes.float32])
         _ = script_ops.eager_py_func(lambda x: x + 1, [c], [dtypes.float32])
-    self.assertTrue(script_ops._py_funcs.size() < 100)
+    self.assertLess(script_ops._py_funcs.size(), 100)
 
   # ----- Tests for eager_py_func -----
   @test_util.run_in_graph_and_eager_modes()
@@ -515,8 +517,7 @@ class PyFuncTest(test.TestCase):
   @test_util.run_in_graph_and_eager_modes()
   def testEagerReturningVariableRaisesError(self):
     def return_variable():
-      variable = resource_variable_ops.ResourceVariable(0.0)
-      return variable
+      return resource_variable_ops.ResourceVariable(0.0)
 
     with self.assertRaisesRegexp(errors.UnknownError,
                                  "Attempting to return a variable"):
@@ -524,6 +525,80 @@ class PyFuncTest(test.TestCase):
           return_variable, inp=[], Tout=dtypes.float32)
       self.evaluate(output)
 
+  @test_util.run_in_graph_and_eager_modes()
+  def testEagerGradientTape(self):
+
+    def f(x):
+      return x**2
+
+    x = constant_op.constant(3.0)
+    with backprop.GradientTape() as tape:
+      tape.watch(x)
+      y = script_ops.eager_py_func(f, inp=[x], Tout=dtypes.float32)
+    dy_dx = tape.gradient(y, x)
+    self.assertEqual(self.evaluate(dy_dx), 6.0)
+
+  def testEagerGradientGraph(self):
+
+    def f(x):
+      return x**2
+
+    x = constant_op.constant(3.0)
+    y = script_ops.eager_py_func(f, inp=[x], Tout=dtypes.float32)
+    dy_dx = gradients_impl.gradients(y, x)[0]
+    self.assertEqual(self.evaluate(dy_dx), 6.0)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testEagerGradientTapeMultipleArgs(self):
+
+    def f(x, y):
+      return x**2 + y**2
+
+    x = constant_op.constant(3.0)
+    y = constant_op.constant(4.0)
+    with backprop.GradientTape() as tape:
+      tape.watch(x)
+      tape.watch(y)
+      z = script_ops.eager_py_func(f, inp=[x, y], Tout=dtypes.float32)
+
+    dz_dx, dz_dy = tape.gradient(z, [x, y])
+    self.assertEqual(self.evaluate(dz_dx), 6.0)
+    self.assertEqual(self.evaluate(dz_dy), 8.0)
+
+  def testEagerGradientGraphMultipleArgs(self):
+
+    def f(x, y):
+      return x**2 + y**2
+
+    x = constant_op.constant(3.0)
+    y = constant_op.constant(4.0)
+    z = script_ops.eager_py_func(f, inp=[x, y], Tout=dtypes.float32)
+
+    dz_dx, dz_dy = gradients_impl.gradients(z, [x, y])
+    self.assertEqual(self.evaluate(dz_dx), 6.0)
+    self.assertEqual(self.evaluate(dz_dy), 8.0)
+
+  def testEagerGradientGraphLogHuber(self):
+
+    def log_huber(x, m):
+      if math_ops.abs(x) <= m:
+        return x**2
+      else:
+        return m**2 * (1 - 2 * math_ops.log(m) + math_ops.log(x**2))
+
+    x = array_ops.placeholder(dtypes.float32)
+    m = array_ops.placeholder(dtypes.float32)
+
+    y = script_ops.eager_py_func(
+        func=log_huber, inp=[x, m], Tout=dtypes.float32)
+    dy_dx = gradients_impl.gradients(y, x)[0]
+
+    with self.test_session() as sess:
+      # Takes the first branch of log_huber.
+      y, dy_dx = sess.run([y, dy_dx], feed_dict={x: 1.0, m: 2.0})
+      self.assertEqual(y, 1.0)
+      self.assertEqual(dy_dx, 2.0)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/ops/script_ops.py b/tensorflow/python/ops/script_ops.py
index f87c5dc5e3..128b43a7ae 100644
--- a/tensorflow/python/ops/script_ops.py
+++ b/tensorflow/python/ops/script_ops.py
@@ -12,7 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-
 """Script Language Operators. See the @{$python/script_ops} guide."""
 
 # pylint: disable=g-bad-name
@@ -29,30 +28,54 @@ import numpy as np
 import six
 
 from tensorflow.python import pywrap_tensorflow
+from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import gen_script_ops
 from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.util import compat
 from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import tf_export
 
+# Map from EagerPyFunc token to tuple (tape, eager args, eager outputs);
+# used for differentiation.
+tape_cache = {}
+
 
 class EagerFunc(object):
   """A wrapper for a function owned by an EagerPyFunc."""
 
-  def __init__(self, func, Tout):
+  def __init__(self, func, Tout, is_grad_func):
     """Constructs an EagerFunc.
 
     Args:
       func: The function to wrap.
       Tout: A list of datatypes for the output; an empty list if the output is
             None.
+      is_grad_func: Whether this EagerFunc is the gradient of another
+        EagerPyFunc.
     """
     self._func = func
     self._out_dtypes = Tout
+    self._is_grad_func = is_grad_func
 
   def _convert(self, value, dtype):
+    """Converts `value` to a tensor of type `dtype`, with error checking.
+
+    Args:
+      value: The tensor to convert.
+      dtype: The desired dtype.
+
+    Returns:
+      A tensor of type `dtype`, or a zeros tensor if value is None and
+      this function is in fact a gradient function.
+
+    Raises:
+      RuntimeError: if `value` is a variable.
+    """
+
     if isinstance(value, resource_variable_ops.ResourceVariable):
       raise RuntimeError(
           "Attempting to return a variable from an eagerly executed py_func. "
@@ -60,22 +83,40 @@ class EagerFunc(object):
           "be returned; to return the value of a variable, make sure to obtain "
           "the Tensor backing it by calling `.read_value()` on the variable in "
           "question: %s" % value)
+    if value is None and self._is_grad_func:
+      # Gradient functions may legitimately return a list that contains
+      # both Tensors and Python Nones. Unfortunately this breaks the
+      # OpKernel, so for now we replace None objects with zeros, which is
+      # mathematically correct but will prevent short-circuiting gradient
+      # computations.
+      #
+      # TODO(akshayka): Make it possible to return a list of both Tensors and
+      # Nones from an EagerPyFunc.
+      return constant_op.constant(0.0, dtype=dtype)
     return ops.convert_to_tensor(value, dtype=dtype)
 
-  def __call__(self, on_gpu, args):
+  def __call__(self, on_gpu, token, args):
     """Passes `args` to `self._func`, which is executed eagerly."""
+
     with context.eager_mode():
-      ret = self._func(*args)
-      maybe_copy_to_gpu = lambda x: x if not on_gpu else x.gpu()
-      if isinstance(ret, (tuple, list)):
-        return [
-            maybe_copy_to_gpu(self._convert(x, dtype=dtype))
-            for (x, dtype) in zip(ret, self._out_dtypes)
-        ]
-      elif ret is None:
-        return ret
-      else:
-        return maybe_copy_to_gpu(self._convert(ret, dtype=self._out_dtypes[0]))
+      with backprop.GradientTape() as tape:
+        for tensor in args:
+          tape.watch(tensor)
+        ret = self._func(*args)
+        # NB: The tape needs to watch copies across devices.
+        maybe_copy_to_gpu = lambda x: x if not on_gpu else x.gpu()
+        if isinstance(ret, (tuple, list)):
+          outputs = [
+              maybe_copy_to_gpu(self._convert(x, dtype=dtype))
+              for (x, dtype) in zip(ret, self._out_dtypes)
+          ]
+        elif ret is None:
+          outputs = None
+        else:
+          outputs = maybe_copy_to_gpu(
+              self._convert(ret, dtype=self._out_dtypes[0]))
+      tape_cache[compat.as_bytes(token)] = (tape, args, outputs)
+      return outputs
 
 
 class FuncRegistry(object):
@@ -149,7 +190,14 @@ class FuncRegistry(object):
     if func is None:
       raise ValueError("callback %s is not found" % token)
     if isinstance(func, EagerFunc):
-      return func(on_gpu, args)
+      # NB: Different invocations of the same py_func will share the same
+      # token, and the entries they stash in the tape_cache will collide.
+      # In practice, when executing a graph, this should only happen if
+      # the py_func is in a while_loop whose iterations are run in parallel
+      # or if the graph is being driven by concurrent session.run() calls.
+      #
+      # TODO(akshayka): Key the tape cache in a thread-safe way.
+      return func(on_gpu, token, args)
     else:
       ret = func(*args)
       # Strings seem to lead to a memory leak here if they're not wrapped in a
@@ -193,7 +241,8 @@ class CleanupFunc(object):
       _py_funcs.remove(self._token)
 
 
-def _internal_py_func(func, inp, Tout, stateful=None, eager=False, name=None):
+def _internal_py_func(func, inp, Tout, stateful=None, eager=False,
+                      is_grad_func=False, name=None):
   """See documentation for py_func and eager_py_func."""
 
   is_list_or_tuple = False
@@ -203,7 +252,7 @@ def _internal_py_func(func, inp, Tout, stateful=None, eager=False, name=None):
     Tout = [Tout]
 
   if eager:
-    func = EagerFunc(func, Tout)
+    func = EagerFunc(func, Tout, is_grad_func)
 
   token = _py_funcs.insert(func)
   # We tie the registered function's lifetime with the current default graph,
@@ -242,34 +291,55 @@ def _internal_py_func(func, inp, Tout, stateful=None, eager=False, name=None):
   return result if is_list_or_tuple else result[0]
 
 
+# TODO(akshayka): Implement higher-order derivatives.
+@ops.RegisterGradient("EagerPyFunc")
+def _EagerPyFuncGrad(op, dy):
+  """Computes the gradient of an EagerPyFunc."""
+
+  token = op.get_attr("token")
+
+  def eagerly_executed_grad(dy):
+    tape, eager_inputs, eager_outputs = tape_cache.pop(compat.as_bytes(token))
+    return tape.gradient(eager_outputs, eager_inputs, output_gradients=dy)
+
+  with ops.control_dependencies(op.outputs):
+    return _internal_py_func(
+        func=eagerly_executed_grad,
+        inp=[dy] if isinstance(dy, ops.Tensor) else dy,
+        Tout=[tensor.dtype for tensor in op.inputs],
+        eager=True, is_grad_func=True)
+
+
 def eager_py_func(func, inp, Tout, name=None):
   """Wraps a python function into a TensorFlow op that executes it eagerly.
 
   This function allows expressing computations in a TensorFlow graph as
   Python functions. In particular, it wraps a Python function `func`
-  in a TensorFlow operation that executes it with eager exeuction enabled. As a
-  consequence, `tf.contrib.eager.py_func` makes it possible to express control
-  flow using Python constructs (`if`, `while`, `for`, etc.), instead of
-  TensorFlow control flow constructs (@{tf.cond}, @{tf.while_loop}). For
-  example, you might use `tf.contrib.eager.py_func` to implement the log huber
-  function:
+  in a once-differentiable TensorFlow operation that executes it with eager
+  execution enabled. As a consequence, `tf.contrib.eager.py_func` makes it
+  possible to express control flow using Python constructs (`if`, `while`,
+  `for`, etc.), instead of TensorFlow control flow constructs (@{tf.cond},
+  @{tf.while_loop}). For example, you might use `tf.contrib.eager.py_func` to
+  implement the log huber function:
 
   ```python
   def log_huber(x, m):
     if tf.abs(x) <= m:
-      return x ** 2
+      return x**2
     else:
-      return m ** 2 * (1 - 2 * tf.log(m) + tf.log(x ** 2))
+      return m**2 * (1 - 2 * tf.log(m) + tf.log(x**2))
 
   x = tf.placeholder(tf.float32)
   m = tf.placeholder(tf.float32)
 
   y = tf.contrib.eager.py_func(func=log_huber, inp=[x, m], Tout=tf.float32)
+  dy_dx = tf.gradients(y, x)[0]
 
   with tf.Session() as sess:
     # The session executes `log_huber` eagerly. Given the feed values below,
-    # it will take the second branch, so `output` evaluates to 7.24372.
-    output = sess.run(y, feed_dict={x: 3.0, m: 2.0})
+    # it will take the first branch, so `y` evaluates to 1.0 and
+    # `dy_dx` evaluates to 2.0.
+    y, dy_dx = sess.run([y, dy_dx], feed_dict={x: 1.0, m: 2.0})
   ```
 
   You can also use `tf.contrib.eager.py_func` to debug your models at runtime
@@ -288,10 +358,6 @@ def eager_py_func(func, inp, Tout, name=None):
   that take Tensors as inputs, execute TensorFlow operations in their bodies,
   and return Tensors as outputs.
 
-  `tf.contrib.eager.py_func` is not differentiable, though a gradient may be
-  implemented in the future; if you would like to differentiate through it,
-  please file an issue on Github.
-
   Like @{tf.py_func}, `tf.contrib.eager.py_func` has the following limitations
   with respect to serialization and distribution:
 
-- 
GitLab


From 16c1d25110e48b8cecbf61ea8e15a7c9da26dd83 Mon Sep 17 00:00:00 2001
From: Alexandre Passos <apassos@google.com>
Date: Fri, 8 Jun 2018 02:49:33 -0700
Subject: [PATCH 478/610] Removes error message from queues in eager (leaves
 the one in queuerunners).

There's no real reason to not support queues in eager for people using them
without using queue runners.

PiperOrigin-RevId: 199770626
---
 .../common_runtime/eager/kernel_and_device.cc |  1 +
 .../common_runtime/eager/kernel_and_device.h  |  6 +++
 .../python/kernel_tests/fifo_queue_test.py    | 20 ++++++--
 tensorflow/python/ops/data_flow_ops.py        | 46 +++++--------------
 4 files changed, 34 insertions(+), 39 deletions(-)

diff --git a/tensorflow/core/common_runtime/eager/kernel_and_device.cc b/tensorflow/core/common_runtime/eager/kernel_and_device.cc
index 2a43a31c02..b410ea175b 100644
--- a/tensorflow/core/common_runtime/eager/kernel_and_device.cc
+++ b/tensorflow/core/common_runtime/eager/kernel_and_device.cc
@@ -79,6 +79,7 @@ Status KernelAndDevice::Run(std::vector<Tensor>* input_tensors,
   params.function_library = flib_;
   params.slice_reader_cache = &slice_reader_cache_;
   params.rendezvous = rendez_;
+  params.cancellation_manager = &cm_;
   if (stats != nullptr) {
     params.track_allocations = true;
   }
diff --git a/tensorflow/core/common_runtime/eager/kernel_and_device.h b/tensorflow/core/common_runtime/eager/kernel_and_device.h
index f78d197fd5..c41a0972b1 100644
--- a/tensorflow/core/common_runtime/eager/kernel_and_device.h
+++ b/tensorflow/core/common_runtime/eager/kernel_and_device.h
@@ -22,6 +22,7 @@ limitations under the License.
 #include <unordered_map>
 
 #include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/framework/cancellation.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/types.h"
@@ -76,6 +77,11 @@ class KernelAndDevice {
   const DataTypeVector& output_dtypes() { return output_dtypes_; }
 
  private:
+  // TODO(apassos) Consider a shared cancellation manager. Note that this
+  // cancellation manager is not useful to actually cancel anything, and is
+  // provided here only for the few kernels which can't handle one being
+  // missing.
+  CancellationManager cm_;
   std::unique_ptr<OpKernel> kernel_;
   Device* device_;
   FunctionLibraryRuntime* flib_;
diff --git a/tensorflow/python/kernel_tests/fifo_queue_test.py b/tensorflow/python/kernel_tests/fifo_queue_test.py
index ce73e7ad3e..14a336c688 100644
--- a/tensorflow/python/kernel_tests/fifo_queue_test.py
+++ b/tensorflow/python/kernel_tests/fifo_queue_test.py
@@ -31,6 +31,7 @@ from tensorflow.python.framework import dtypes as dtypes_lib
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import data_flow_ops
@@ -125,12 +126,21 @@ class FIFOQueueTest(test.TestCase):
       q.enqueue_many([[1, 2, 3, 4], [[1, 1], [2, 2], [3, 3], [4, 4]]]).run()
       self.assertEqual(4, q.size().eval())
 
+  @test_util.run_in_graph_and_eager_modes()
   def testMultipleDequeues(self):
-    with self.test_session() as session:
-      q = data_flow_ops.FIFOQueue(10, [dtypes_lib.int32], shapes=[()])
-      q.enqueue_many([[1, 2, 3]]).run()
-      a, b, c = session.run([q.dequeue(), q.dequeue(), q.dequeue()])
-      self.assertAllEqual(set([1, 2, 3]), set([a, b, c]))
+    q = data_flow_ops.FIFOQueue(10, [dtypes_lib.int32], shapes=[()])
+    self.evaluate(q.enqueue_many([[1, 2, 3]]))
+    a, b, c = self.evaluate([q.dequeue(), q.dequeue(), q.dequeue()])
+    self.assertAllEqual(set([1, 2, 3]), set([a, b, c]))
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testQueuesDontShare(self):
+    q = data_flow_ops.FIFOQueue(10, [dtypes_lib.int32], shapes=[()])
+    self.evaluate(q.enqueue(1))
+    q2 = data_flow_ops.FIFOQueue(10, [dtypes_lib.int32], shapes=[()])
+    self.evaluate(q2.enqueue(2))
+    self.assertAllEqual(self.evaluate(q2.dequeue()), 2)
+    self.assertAllEqual(self.evaluate(q.dequeue()), 1)
 
   def testEnqueueDictWithoutNames(self):
     with self.test_session():
diff --git a/tensorflow/python/ops/data_flow_ops.py b/tensorflow/python/ops/data_flow_ops.py
index 62c5adc385..abf597ca55 100644
--- a/tensorflow/python/ops/data_flow_ops.py
+++ b/tensorflow/python/ops/data_flow_ops.py
@@ -35,6 +35,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_data_flow_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
 # go/tf-wildcard-import
 # pylint: disable=wildcard-import
 from tensorflow.python.ops.gen_data_flow_ops import *
@@ -129,11 +130,6 @@ class QueueBase(object):
   @{tf.RandomShuffleQueue} for concrete
   implementations of this class, and instructions on how to create
   them.
-
-  @compatibility(eager)
-  Queues are not compatible with eager execution. Instead, please
-  use `tf.data` to get data into your model.
-  @end_compatibility
   """
 
   def __init__(self, dtypes, shapes, names, queue_ref):
@@ -157,12 +153,7 @@ class QueueBase(object):
 
     Raises:
       ValueError: If one of the arguments is invalid.
-      RuntimeError: If eager execution is enabled.
     """
-    if context.executing_eagerly():
-      raise RuntimeError(
-          "Queues are not supported when eager execution is enabled. "
-          "Instead, please use tf.data to get data into your model.")
     self._dtypes = dtypes
     if shapes is not None:
       if len(shapes) != len(dtypes):
@@ -179,6 +170,8 @@ class QueueBase(object):
     self._queue_ref = queue_ref
     if context.executing_eagerly():
       self._name = context.context().scope_name
+      self._resource_deleter = resource_variable_ops.EagerResourceDeleter(
+          queue_ref, None)
     else:
       self._name = self._queue_ref.op.name.split("/")[-1]
 
@@ -605,6 +598,11 @@ class QueueBase(object):
     else:
       return gen_data_flow_ops.queue_size(self._queue_ref, name=name)
 
+def _shared_name(shared_name):
+  if context.executing_eagerly():
+    return str(ops.uid())
+  return shared_name
+
 
 @tf_export("RandomShuffleQueue")
 class RandomShuffleQueue(QueueBase):
@@ -612,11 +610,6 @@ class RandomShuffleQueue(QueueBase):
 
   See @{tf.QueueBase} for a description of the methods on
   this class.
-
-  @compatibility(eager)
-  Queues are not compatible with eager execution. Instead, please
-  use `tf.data` to get data into your model.
-  @end_compatibility
   """
 
   def __init__(self,
@@ -690,7 +683,7 @@ class RandomShuffleQueue(QueueBase):
         min_after_dequeue=min_after_dequeue,
         seed=seed1,
         seed2=seed2,
-        shared_name=shared_name,
+        shared_name=_shared_name(shared_name),
         name=name)
 
     super(RandomShuffleQueue, self).__init__(dtypes, shapes, names, queue_ref)
@@ -702,11 +695,6 @@ class FIFOQueue(QueueBase):
 
   See @{tf.QueueBase} for a description of the methods on
   this class.
-
-  @compatibility(eager)
-  Queues are not compatible with eager execution. Instead, please
-  use `tf.data` to get data into your model.
-  @end_compatibility
   """
 
   def __init__(self,
@@ -752,7 +740,7 @@ class FIFOQueue(QueueBase):
         component_types=dtypes,
         shapes=shapes,
         capacity=capacity,
-        shared_name=shared_name,
+        shared_name=_shared_name(shared_name),
         name=name)
 
     super(FIFOQueue, self).__init__(dtypes, shapes, names, queue_ref)
@@ -767,11 +755,6 @@ class PaddingFIFOQueue(QueueBase):
 
   See @{tf.QueueBase} for a description of the methods on
   this class.
-
-  @compatibility(eager)
-  Queues are not compatible with eager execution. Instead, please
-  use `tf.data` to get data into your model.
-  @end_compatibility
   """
 
   def __init__(self,
@@ -831,7 +814,7 @@ class PaddingFIFOQueue(QueueBase):
         component_types=dtypes,
         shapes=shapes,
         capacity=capacity,
-        shared_name=shared_name,
+        shared_name=_shared_name(shared_name),
         name=name)
 
     super(PaddingFIFOQueue, self).__init__(dtypes, shapes, names, queue_ref)
@@ -843,11 +826,6 @@ class PriorityQueue(QueueBase):
 
   See @{tf.QueueBase} for a description of the methods on
   this class.
-
-  @compatibility(eager)
-  Queues are not compatible with eager execution. Instead, please
-  use `tf.data` to get data into your model.
-  @end_compatibility
   """
 
   def __init__(self,
@@ -899,7 +877,7 @@ class PriorityQueue(QueueBase):
         component_types=types,
         shapes=shapes,
         capacity=capacity,
-        shared_name=shared_name,
+        shared_name=_shared_name(shared_name),
         name=name)
 
     priority_dtypes = [_dtypes.int64] + types
-- 
GitLab


From 1c241ba791f578a67c80e932cbbb06b5af5ca81a Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 8 Jun 2018 04:12:07 -0700
Subject: [PATCH 479/610] Fix RemoveUnusedNodes generating invalid graphs for
 PlaceholderWithDefault inputs

PiperOrigin-RevId: 199776409
---
 .../graph_transforms/fold_constants_lib.cc    | 26 +++++++++++
 .../graph_transforms/fold_constants_test.cc   | 46 -------------------
 2 files changed, 26 insertions(+), 46 deletions(-)

diff --git a/tensorflow/tools/graph_transforms/fold_constants_lib.cc b/tensorflow/tools/graph_transforms/fold_constants_lib.cc
index 85660f94a8..f858411876 100644
--- a/tensorflow/tools/graph_transforms/fold_constants_lib.cc
+++ b/tensorflow/tools/graph_transforms/fold_constants_lib.cc
@@ -117,6 +117,31 @@ Status ReplaceSendRecvs(const GraphDef& original_graph_def,
   return Status::OK();
 }
 
+Status RewriteInputsAsPlaceholders(const TransformFuncContext& context,
+                                   GraphDef* graph_def) {
+  std::unordered_set<string> input_names;
+  for (const string& input_name : context.input_names) {
+    input_names.insert(ParseTensorName(input_name).first.ToString());
+  }
+
+  for (NodeDef& node : *graph_def->mutable_node()) {
+    if (input_names.find(node.name()) == input_names.end()) {
+      continue;
+    }
+    if (node.op() == "PlaceholderWithDefault") {
+      node.set_op("Placeholder");
+      node.clear_input();
+    } else if (node.op() != "Placeholder") {
+      return errors::InvalidArgument(
+          "Input '", node.name(),
+          "' was expected to be a Placeholder or PlaceholderWithDefault op, "
+          "but was ",
+          node.op());
+    }
+  }
+  return Status::OK();
+}
+
 Status RemoveUnusedNodes(const GraphDef& input_graph_def,
                          const TransformFuncContext& context,
                          GraphDef* output_graph_def) {
@@ -165,6 +190,7 @@ Status RemoveUnusedNodes(const GraphDef& input_graph_def,
       input_graph_def,
       [&](const NodeDef& node) { return used_nodes.count(node.name()) > 0; },
       output_graph_def);
+  TF_RETURN_IF_ERROR(RewriteInputsAsPlaceholders(context, output_graph_def));
 
   return Status::OK();
 }
diff --git a/tensorflow/tools/graph_transforms/fold_constants_test.cc b/tensorflow/tools/graph_transforms/fold_constants_test.cc
index a082399a87..dcdc3c2906 100644
--- a/tensorflow/tools/graph_transforms/fold_constants_test.cc
+++ b/tensorflow/tools/graph_transforms/fold_constants_test.cc
@@ -330,48 +330,6 @@ class ConstantFoldingTest : public ::testing::Test {
     EXPECT_EQ(0, node_map.count("unused"));
   }
 
-  void TestRemoveUnusedNodesMultipleOutputs() {
-    using namespace ::tensorflow::ops;  // NOLINT(build/namespaces)
-    auto root = tensorflow::Scope::NewRootScope();
-
-    //    a    b
-    //     \  /
-    //    shape_n
-    //     \  /
-    //       c
-    auto a = Placeholder(root.WithOpName("a"), DT_FLOAT);
-    auto b = Placeholder(root.WithOpName("b"), DT_FLOAT);
-    auto shape_n = ShapeN(root.WithOpName("shape_n"), {Output(a), Output(b)});
-    auto c = Add(root.WithOpName("c"), shape_n[0], shape_n[1]);
-
-    GraphDef graph_def;
-    TF_ASSERT_OK(root.ToGraphDef(&graph_def));
-    GraphDef result_graph_def;
-    TF_ASSERT_OK(graph_transforms::RemoveUnusedNodes(
-        graph_def, {{shape_n[0].name()}, {"c"}}, &result_graph_def));
-
-    // Only one output of shape_n node is fed input. Hence the graph search
-    // should propagate to inputs of shape_n. Nothing to remove here.
-    std::map<string, const NodeDef*> node_map;
-    graph_transforms::MapNamesToNodes(result_graph_def, &node_map);
-    EXPECT_EQ(1, node_map.count("a"));
-    EXPECT_EQ(1, node_map.count("b"));
-    EXPECT_EQ(1, node_map.count("c"));
-
-    result_graph_def.Clear();
-    TF_ASSERT_OK(graph_transforms::RemoveUnusedNodes(
-        graph_def, {{shape_n[0].name(), shape_n[1].name()}, {"c"}},
-        &result_graph_def));
-
-    // Both outputs of shape_n node are fed inputs. shape_n does not function
-    // and inputs to shape_n should be removed.
-    node_map.clear();
-    graph_transforms::MapNamesToNodes(result_graph_def, &node_map);
-    EXPECT_EQ(0, node_map.count("a"));
-    EXPECT_EQ(0, node_map.count("b"));
-    EXPECT_EQ(1, node_map.count("c"));
-  }
-
   void TestMaxConstantSizeInBytes() {
     auto root = tensorflow::Scope::NewRootScope();
 
@@ -431,10 +389,6 @@ TEST_F(ConstantFoldingTest, TestReplaceSendRecvsPrefixNames) {
 
 TEST_F(ConstantFoldingTest, TestRemoveUnusedNodes) { TestRemoveUnusedNodes(); }
 
-TEST_F(ConstantFoldingTest, TestRemoveUnusedNodesMultipleOutputs) {
-  TestRemoveUnusedNodesMultipleOutputs();
-}
-
 TEST_F(ConstantFoldingTest, TestMaxConstantSizeInBytes) {
   TestMaxConstantSizeInBytes();
 }
-- 
GitLab


From 6c1b8e8123bc6bd191d81ab9e095d340e31870bf Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 8 Jun 2018 05:13:02 -0700
Subject: [PATCH 480/610] Detect configurations that would be hitting bugs in
 cuDNN and report an error.

PiperOrigin-RevId: 199780350
---
 tensorflow/stream_executor/cuda/cuda_dnn.cc | 59 +++++++++++++++++++--
 1 file changed, 56 insertions(+), 3 deletions(-)

diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc
index f6564df0d0..48afc06e32 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.cc
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc
@@ -2291,9 +2291,7 @@ class CudnnEnvVar {
 // algorithm through an env-var "TF_ENABLE_FFT_TILING_FORWARD=1".
 struct FftTilingForward {
   static constexpr const char* kName = "TF_ENABLE_FFT_TILING_FORWARD";
-  // TODO(csigg): Enabling this algo causes XLA test failures, for example in
-  // platforms/xla/tests/internal:convolution_test_gpu. See b/80018418.
-  static constexpr bool kDefaultFlag = false;  // CUDNN_VERSION >= 7000;
+  static constexpr bool kDefaultFlag = CUDNN_VERSION >= 7000;
 };
 
 // A helper struct to decide whether to enable the WINOGRAD_NONFUSED algorithms.
@@ -2426,6 +2424,33 @@ port::Status CudnnSupport::DoConvolveImpl(
     }
   }
 
+  // Report an error if we might be hitting a cuDNN bug that accesses illegal
+  // memory. See nvbugs/2138754, b/80018418.
+  SE_RETURN_IF_ERROR([&] {
+    if (algo_desc.algo_id() != CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING) {
+      return port::Status::OK();
+    }
+    if (input_descriptor.ndims() < 3) {
+      return port::Status::OK();
+    }
+    // Checks that a*b is within the valid range (as provided by NVIDIA).
+    auto check_sizes = [](size_t a, size_t b) {
+      if ((a * b * 4608 - 1) >> 31 == 0) {
+        return port::Status::OK();
+      }
+      return port::Status(
+          port::error::FAILED_PRECONDITION,
+          "This configuration potentially accesses illegal memory.");
+    };
+    SE_RETURN_IF_ERROR(check_sizes(input_descriptor.feature_map_count(),
+                                   output_descriptor.feature_map_count()));
+    SE_RETURN_IF_ERROR(check_sizes(input_descriptor.count(),
+                                   input_descriptor.feature_map_count()));
+    SE_RETURN_IF_ERROR(check_sizes(input_descriptor.count(),
+                                   output_descriptor.feature_map_count()));
+    return port::Status::OK();
+  }());
+
   RETURN_IF_CUDNN_ERROR(cudnnConvolutionForward(
       cudnn.handle(),
       /*alpha=*/alpha, /*srcDesc=*/input_nd.handle(),
@@ -3192,6 +3217,34 @@ port::Status CudnnSupport::DoConvolveBackwardFilterImpl(
     }
   }
 
+  // Report an error if we might be hitting a cuDNN bug that produces incorrect
+  // results. See nvbugs/2072856
+  SE_RETURN_IF_ERROR([&] {
+    if (algo_desc.algo_id() != CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING) {
+      return port::Status::OK();
+    }
+    if (output_descriptor.height() > 1 && output_descriptor.width() > 1) {
+      return port::Status::OK();
+    }
+    int convolution_size = output_descriptor.height() > 1
+                               ? filter_descriptor.input_filter_height()
+                               : filter_descriptor.input_filter_width();
+    if (convolution_size <= 32) {
+      return port::Status::OK();
+    }
+    cudnnConvolutionMode_t convolution_mode;
+    cudnnDataType_t compute_type;
+    RETURN_IF_CUDNN_ERROR(cudnnGetConvolutionNdDescriptor(
+        conv.handle(), 0, nullptr, nullptr, nullptr, nullptr, &convolution_mode,
+        &compute_type));
+    if (convolution_mode != CUDNN_CONVOLUTION) {
+      return port::Status::OK();
+    }
+    return port::Status(
+        port::error::FAILED_PRECONDITION,
+        "This configuration potentially produces incorrect results.");
+  }());
+
   RETURN_IF_CUDNN_ERROR(cudnnConvolutionBackwardFilter(
       cudnn.handle(),
       /*alpha=*/alpha,
-- 
GitLab


From cd00aa747a6e6e023910998a744c0f43e1afddbf Mon Sep 17 00:00:00 2001
From: Adria Puigdomenech <adriap@google.com>
Date: Fri, 8 Jun 2018 05:42:27 -0700
Subject: [PATCH 481/610] Obtain use_locking for resource variables in
 scatter_nd_add.

PiperOrigin-RevId: 199782188
---
 tensorflow/core/kernels/scatter_nd_op.cc | 20 +++++---------------
 1 file changed, 5 insertions(+), 15 deletions(-)

diff --git a/tensorflow/core/kernels/scatter_nd_op.cc b/tensorflow/core/kernels/scatter_nd_op.cc
index bdc268cf49..43c5b29509 100644
--- a/tensorflow/core/kernels/scatter_nd_op.cc
+++ b/tensorflow/core/kernels/scatter_nd_op.cc
@@ -143,14 +143,10 @@ class ScatterNdUpdateOp : public OpKernel {
 
   void Compute(OpKernelContext* c) override {
     if (dtype_ == DT_RESOURCE) {
-      if (use_exclusive_lock_) {
-        Var* v;
-        OP_REQUIRES_OK(c, LookupResource(c, HandleFromInput(c, 0), &v));
-        mutex_lock m(*v->mu());
-        DoCompute(c);
-      } else {
-        DoCompute(c);
-      }
+      Var* v;
+      OP_REQUIRES_OK(c, LookupResource(c, HandleFromInput(c, 0), &v));
+      mutex_lock m(*v->mu());
+      DoCompute(c);
     } else if (use_exclusive_lock_) {
       // If we're here, it means the input type is a ref.
       DCHECK(IsRefType(c->input_dtype(0)));
@@ -176,13 +172,7 @@ class ScatterNdUpdateOp : public OpKernel {
       Var* v;
       OP_REQUIRES_OK(c, LookupResource(c, HandleFromInput(c, 0), &v));
       Tensor* t = v->tensor();
-      if (!use_exclusive_lock_) {
-        // We're not holding the lock in the outer scope so need it here.
-        mutex_lock m(*v->mu());
-        OP_REQUIRES_OK(c, PrepareToUpdateVariable<Device, T>(c, t));
-      } else {
-        OP_REQUIRES_OK(c, PrepareToUpdateVariable<Device, T>(c, t));
-      }
+      OP_REQUIRES_OK(c, PrepareToUpdateVariable<Device, T>(c, t));
       params = *t;
       params_shape = params.shape();
     } else if (IsRefType(c->input_dtype(0))) {
-- 
GitLab


From 7b5d9e86e77bb750d5b794f1673fc08d4d289ec7 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 8 Jun 2018 08:12:15 -0700
Subject: [PATCH 482/610] Fix a typo in toco flags description.

PiperOrigin-RevId: 199795176
---
 tensorflow/contrib/lite/toco/model_cmdline_flags.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/lite/toco/model_cmdline_flags.cc b/tensorflow/contrib/lite/toco/model_cmdline_flags.cc
index 0f104d5e2d..4c9f1aa4b0 100644
--- a/tensorflow/contrib/lite/toco/model_cmdline_flags.cc
+++ b/tensorflow/contrib/lite/toco/model_cmdline_flags.cc
@@ -48,7 +48,7 @@ bool ParseModelFlagsFromCommandLineFlags(
            "that information from the input file."),
       Flag("input_arrays", parsed_flags.input_arrays.bind(),
            parsed_flags.input_arrays.default_value(),
-           "Names of the output arrays, comma-separated. If not specified, "
+           "Names of the input arrays, comma-separated. If not specified, "
            "will try to read that information from the input file."),
       Flag("output_array", parsed_flags.output_array.bind(),
            parsed_flags.output_array.default_value(),
-- 
GitLab


From ef1555172d452539d749340cdb076f0a24f6c505 Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Fri, 8 Jun 2018 09:00:06 -0700
Subject: [PATCH 483/610] [tf.data] Improve the error message for
 `Dataset.padded_batch()`.

Previously, we accepted the `padded_shapes` argument without validating that
it was compatible with the `input_dataset.output_shapes`. In many cases, we have
enough static shape information to do this, and so we now raise an actionable
error at the point where the mistake is committed, rather than at runtime.

PiperOrigin-RevId: 199800348
---
 tensorflow/contrib/data/python/ops/BUILD      |  1 +
 .../contrib/data/python/ops/batching.py       |  3 +-
 .../python/training/tensor_queue_dataset.py   |  7 +-
 .../kernel_tests/batch_dataset_op_test.py     | 38 ++++++++
 tensorflow/python/data/ops/dataset_ops.py     | 91 ++++++++++++++++---
 tensorflow/python/data/util/BUILD             |  1 +
 tensorflow/python/data/util/convert.py        | 37 ++++++++
 tensorflow/python/data/util/convert_test.py   | 73 +++++++++++++++
 8 files changed, 236 insertions(+), 15 deletions(-)

diff --git a/tensorflow/contrib/data/python/ops/BUILD b/tensorflow/contrib/data/python/ops/BUILD
index fc8ec5961c..33b7a75046 100644
--- a/tensorflow/contrib/data/python/ops/BUILD
+++ b/tensorflow/contrib/data/python/ops/BUILD
@@ -144,6 +144,7 @@ py_library(
         "//tensorflow/python:tensor_shape",
         "//tensorflow/python:tensor_util",
         "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/util:convert",
         "//tensorflow/python/data/util:nest",
         "//tensorflow/python/data/util:sparse",
     ],
diff --git a/tensorflow/contrib/data/python/ops/batching.py b/tensorflow/contrib/data/python/ops/batching.py
index b9393de4e9..50c2d17592 100644
--- a/tensorflow/contrib/data/python/ops/batching.py
+++ b/tensorflow/contrib/data/python/ops/batching.py
@@ -19,6 +19,7 @@ from __future__ import print_function
 
 from tensorflow.contrib.framework import with_shape
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.util import convert
 from tensorflow.python.data.util import nest
 from tensorflow.python.data.util import sparse
 from tensorflow.python.framework import dtypes
@@ -309,7 +310,7 @@ class DenseToSparseBatchDataset(dataset_ops.Dataset):
     return gen_dataset_ops.dense_to_sparse_batch_dataset(
         self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
         self._batch_size,
-        row_shape=dataset_ops._partial_shape_to_tensor(self._row_shape),  # pylint: disable=protected-access
+        row_shape=convert.partial_shape_to_tensor(self._row_shape),
         output_shapes=nest.flatten(
             sparse.as_dense_shapes(self.output_shapes, self.output_classes)),
         output_types=nest.flatten(
diff --git a/tensorflow/contrib/training/python/training/tensor_queue_dataset.py b/tensorflow/contrib/training/python/training/tensor_queue_dataset.py
index 409aba817c..a2444934bc 100644
--- a/tensorflow/contrib/training/python/training/tensor_queue_dataset.py
+++ b/tensorflow/contrib/training/python/training/tensor_queue_dataset.py
@@ -18,6 +18,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.util import convert
 from tensorflow.python.data.util import nest
 from tensorflow.python.data.util import sparse
 from tensorflow.python.framework import dtypes
@@ -45,14 +46,14 @@ class _PrependFromQueueAndPaddedBatchDataset(dataset_ops.Dataset):
     self._input_dataset = input_dataset
     self._batch_size = ops.convert_to_tensor(
         batch_size, dtype=dtypes.int64, name="batch_size")
-    # pylint: disable=protected-access
     if padded_shapes is None:
       self._padded_shapes = nest.map_structure(
-          dataset_ops._partial_shape_to_tensor, input_dataset.output_shapes)
+          convert.partial_shape_to_tensor, input_dataset.output_shapes)
     else:
       self._padded_shapes = nest.map_structure_up_to(
-          input_dataset.output_shapes, dataset_ops._partial_shape_to_tensor,
+          input_dataset.output_shapes, convert.partial_shape_to_tensor,
           padded_shapes)
+    # pylint: disable=protected-access
     padding_values = (
         padding_values if padding_values is not None else
         dataset_ops._default_padding(input_dataset))
diff --git a/tensorflow/python/data/kernel_tests/batch_dataset_op_test.py b/tensorflow/python/data/kernel_tests/batch_dataset_op_test.py
index bd80b9dbf5..dba108a531 100644
--- a/tensorflow/python/data/kernel_tests/batch_dataset_op_test.py
+++ b/tensorflow/python/data/kernel_tests/batch_dataset_op_test.py
@@ -371,6 +371,44 @@ class BatchDatasetTest(test.TestCase):
     with self.assertRaises(TypeError):
       _ = dataset_ops.Dataset.range(10).map(_map_fn).padded_batch(10)
 
+  def testPaddedBatchShapeError(self):
+    with self.assertRaisesRegexp(
+        ValueError, r'The padded shape \(1,\) is not compatible with the '
+        r'corresponding input component shape \(\).'):
+      _ = dataset_ops.Dataset.range(10).padded_batch(5, padded_shapes=[1])
+
+    with self.assertRaisesRegexp(
+        ValueError, r'The padded shape \(1,\) is not compatible with the '
+        r'corresponding input component shape \(3,\).'):
+      _ = dataset_ops.Dataset.from_tensors([1, 2, 3]).padded_batch(
+          5, padded_shapes=[1])
+
+    with self.assertRaisesRegexp(
+        ValueError, r'Padded shape .* must be a 1-D tensor '
+        r'of tf.int64 values, but its shape was \(2, 2\).'):
+      _ = dataset_ops.Dataset.from_tensors([1, 2, 3]).padded_batch(
+          5, padded_shapes=[[1, 1], [1, 1]])
+
+    with self.assertRaisesRegexp(
+        TypeError, r'Padded shape .* must be a 1-D tensor '
+        r'of tf.int64 values, but its element type was float32.'):
+      _ = dataset_ops.Dataset.from_tensors([1, 2, 3]).padded_batch(
+          5, padded_shapes=constant_op.constant([1., 2., 3.]))
+
+    with self.assertRaisesRegexp(
+        ValueError, r'The padded shape \(1,\) is not compatible with the '
+        r'corresponding input component shape \(\).'):
+      shape_as_tensor = constant_op.constant([1], dtype=dtypes.int64)
+      _ = dataset_ops.Dataset.range(10).padded_batch(
+          5, padded_shapes=shape_as_tensor)
+
+    with self.assertRaisesRegexp(
+        ValueError, r'The padded shape \(\?, \?\) is not compatible with the '
+        r'corresponding input component shape \(\).'):
+      shape_as_tensor = array_ops.placeholder(dtypes.int64, shape=[2])
+      _ = dataset_ops.Dataset.range(10).padded_batch(
+          5, padded_shapes=shape_as_tensor)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/data/ops/dataset_ops.py b/tensorflow/python/data/ops/dataset_ops.py
index 5f17444797..8b2a2e0a32 100644
--- a/tensorflow/python/data/ops/dataset_ops.py
+++ b/tensorflow/python/data/ops/dataset_ops.py
@@ -1687,20 +1687,77 @@ class BatchDataset(Dataset):
     return self._input_dataset.output_types
 
 
-def _partial_shape_to_tensor(shape_like):
+def _is_padded_shape_compatible_with(padded_shape, input_component_shape):
+  """Returns `True` if `input_component_shape` can be padded to `padded_shape`.
+
+  Args:
+    padded_shape: A `tf.TensorShape`.
+    input_component_shape: A `tf.TensorShape`.
+
+  Returns:
+    `True` if `input_component_shape` can be padded to `padded_shape`, otherwise
+    `False`.
+  """
+
+  if padded_shape.dims is None or input_component_shape.dims is None:
+    return True
+  if len(padded_shape.dims) != len(input_component_shape.dims):
+    return False
+  for padded_dim, input_dim in zip(
+      padded_shape.dims, input_component_shape.dims):
+    if (padded_dim.value is not None and input_dim.value is not None
+        and padded_dim.value < input_dim.value):
+      return False
+  return True
+
+
+def _padded_shape_to_tensor(padded_shape, input_component_shape):
+  """Converts `padded_shape` to a `tf.Tensor` representing that shape.
+
+  Args:
+    padded_shape: A shape-like object, which may be a `tf.TensorShape`, a Python
+      sequence, or a 1-D `tf.Tensor` of `tf.int64` elements.
+    input_component_shape: A `tf.TensorShape`, with which `padded_shape` must
+      be compatible.
+
+  Returns:
+    A 1-D `tf.Tensor` of `tf.int64` elements, representing `padded_shape`.
+
+  Raises:
+    ValueError: If `padded_shape` is not a shape or not compatible with
+      `input_component_shape`.
+    TypeError: If `padded_shape` is not convertible to a `tf.int64` tensor.
+  """
   try:
-    # First attempt to convert the input to a shape, and return the
-    # "canonical" tensor representation, which uses `-1` in place of
-    # `None`.
-    shape_like = tensor_shape.as_shape(shape_like)
-    return ops.convert_to_tensor(
-        [dim if dim is not None else -1 for dim in shape_like.as_list()],
-        dtype=dtypes.int64)
+    # Try to convert the `padded_shape` to a `tf.TensorShape`
+    padded_shape_as_shape = tensor_shape.as_shape(padded_shape)
+    # We will return the "canonical" tensor representation, which uses
+    # `-1` in place of `None`.
+    ret = ops.convert_to_tensor(
+        [dim if dim is not None else -1
+         for dim in padded_shape_as_shape.as_list()], dtype=dtypes.int64)
   except (TypeError, ValueError):
     # The argument was not trivially convertible to a
     # `tf.TensorShape`, so fall back on the conversion to tensor
     # machinery.
-    return ops.convert_to_tensor(shape_like, dtype=dtypes.int64)
+    ret = ops.convert_to_tensor(padded_shape, preferred_dtype=dtypes.int64)
+    if ret.shape.dims is not None and len(ret.shape.dims) != 1:
+      raise ValueError(
+          "Padded shape %s must be a 1-D tensor of tf.int64 values, but its "
+          "shape was %s." % (padded_shape, ret.shape))
+    if ret.dtype != dtypes.int64:
+      raise TypeError(
+          "Padded shape %s must be a 1-D tensor of tf.int64 values, but its "
+          "element type was %s." % (padded_shape, ret.dtype.name))
+    padded_shape_as_shape = tensor_util.constant_value_as_shape(ret)
+
+  if not _is_padded_shape_compatible_with(padded_shape_as_shape,
+                                          input_component_shape):
+    raise ValueError("The padded shape %s is not compatible with the "
+                     "corresponding input component shape %s."
+                     % (padded_shape_as_shape, input_component_shape))
+
+  return ret
 
 
 def _padding_value_to_tensor(value, output_type):
@@ -1755,8 +1812,20 @@ class PaddedBatchDataset(Dataset):
     padding_values = (
         padding_values
         if padding_values is not None else _default_padding(input_dataset))
-    self._padded_shapes = nest.map_structure_up_to(
-        input_dataset.output_shapes, _partial_shape_to_tensor, padded_shapes)
+
+    flat_padded_shapes = nest.flatten_up_to(input_dataset.output_shapes,
+                                            padded_shapes)
+
+    flat_padded_shapes_as_tensors = []
+
+    for input_component_shape, padded_shape in zip(
+        nest.flatten(input_dataset.output_shapes), flat_padded_shapes):
+      flat_padded_shapes_as_tensors.append(
+          _padded_shape_to_tensor(padded_shape, input_component_shape))
+
+    self._padded_shapes = nest.pack_sequence_as(input_dataset.output_shapes,
+                                                flat_padded_shapes_as_tensors)
+
     self._padding_values = nest.map_structure_up_to(
         input_dataset.output_shapes, _padding_value_to_tensor, padding_values,
         input_dataset.output_types)
diff --git a/tensorflow/python/data/util/BUILD b/tensorflow/python/data/util/BUILD
index 0fc32d51b9..5fcc62b60b 100644
--- a/tensorflow/python/data/util/BUILD
+++ b/tensorflow/python/data/util/BUILD
@@ -70,6 +70,7 @@ py_library(
         "//tensorflow/python:constant_op",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
+        "//tensorflow/python:tensor_shape",
     ],
 )
 
diff --git a/tensorflow/python/data/util/convert.py b/tensorflow/python/data/util/convert.py
index eeb1d700f3..99b3300900 100644
--- a/tensorflow/python/data/util/convert.py
+++ b/tensorflow/python/data/util/convert.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
 
 
 def optional_param_to_tensor(argument_name,
@@ -32,3 +33,39 @@ def optional_param_to_tensor(argument_name,
   else:
     return constant_op.constant(
         argument_default, dtype=argument_dtype, name=argument_name)
+
+
+def partial_shape_to_tensor(shape_like):
+  """Returns a @{tf.Tensor} that represents the given shape.
+
+  Args:
+    shape_like: A value that can be converted to a @{tf.TensorShape} or a
+      @{tf.Tensor}.
+
+  Returns:
+    A 1-D `tf.Tensor` of `tf.int64` elements representing the given shape, where
+    `-1` is substituted for any unknown dimensions.
+  """
+  try:
+    # First attempt to convert the input to a shape, and return the
+    # "canonical" tensor representation, which uses `-1` in place of
+    # `None`.
+    shape_like = tensor_shape.as_shape(shape_like)
+    return ops.convert_to_tensor(
+        [dim if dim is not None else -1 for dim in shape_like.as_list()],
+        dtype=dtypes.int64)
+  except (TypeError, ValueError):
+    # The argument was not trivially convertible to a
+    # `tf.TensorShape`, so fall back on the conversion to tensor
+    # machinery.
+    ret = ops.convert_to_tensor(shape_like, preferred_dtype=dtypes.int64)
+    if ret.shape.dims is not None and len(ret.shape.dims) != 1:
+      raise ValueError("The given shape %s must be a 1-D tensor of tf.int64 "
+                       "values, but the shape was %s."
+                       % (shape_like, ret.shape))
+    if ret.dtype != dtypes.int64:
+      raise TypeError("The given shape %s must be a 1-D tensor of tf.int64 "
+                      "values, but the element type was %s."
+                      % (shape_like, ret.dtype.name))
+
+    return ret
diff --git a/tensorflow/python/data/util/convert_test.py b/tensorflow/python/data/util/convert_test.py
index 2cb6488070..6a67093e48 100644
--- a/tensorflow/python/data/util/convert_test.py
+++ b/tensorflow/python/data/util/convert_test.py
@@ -19,7 +19,9 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.data.util import convert
+from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.platform import test
 from tensorflow.python.util import compat
 
@@ -48,6 +50,77 @@ class ConvertTest(test.TestCase):
     with self.test_session() as sess:
       self.assertEqual(compat.as_bytes("value"), sess.run(resp))
 
+  def testPartialShapeToTensorKnownDimension(self):
+    with self.test_session() as sess:
+      self.assertAllEqual([1], sess.run(convert.partial_shape_to_tensor(
+          tensor_shape.TensorShape([1]))))
+      self.assertAllEqual([1], sess.run(convert.partial_shape_to_tensor((1,))))
+      self.assertAllEqual([1], sess.run(convert.partial_shape_to_tensor([1])))
+      self.assertAllEqual([1], sess.run(convert.partial_shape_to_tensor(
+          constant_op.constant([1], dtype=dtypes.int64))))
+
+  def testPartialShapeToTensorUnknownDimension(self):
+    with self.test_session() as sess:
+      self.assertAllEqual([-1], sess.run(convert.partial_shape_to_tensor(
+          tensor_shape.TensorShape([None]))))
+      self.assertAllEqual([-1], sess.run(convert.partial_shape_to_tensor(
+          (None,))))
+      self.assertAllEqual([-1], sess.run(convert.partial_shape_to_tensor(
+          [None])))
+      self.assertAllEqual([-1], sess.run(convert.partial_shape_to_tensor(
+          [-1])))
+      self.assertAllEqual([-1], sess.run(convert.partial_shape_to_tensor(
+          constant_op.constant([-1], dtype=dtypes.int64))))
+
+    with self.assertRaisesRegexp(
+        ValueError, r"The given shape .* must be a 1-D tensor of tf.int64 "
+        r"values, but the shape was \(2, 2\)."):
+      convert.partial_shape_to_tensor(constant_op.constant(
+          [[1, 1], [1, 1]], dtype=dtypes.int64))
+
+    with self.assertRaisesRegexp(
+        TypeError, r"The given shape .* must be a 1-D tensor of tf.int64 "
+        r"values, but the element type was float32."):
+      convert.partial_shape_to_tensor(constant_op.constant([1., 1.]))
+
+  def testPartialShapeToTensorMultipleDimensions(self):
+    with self.test_session() as sess:
+      self.assertAllEqual([3, 6], sess.run(convert.partial_shape_to_tensor(
+          tensor_shape.TensorShape([3, 6]))))
+      self.assertAllEqual([3, 6], sess.run(convert.partial_shape_to_tensor(
+          (3, 6))))
+      self.assertAllEqual([3, 6], sess.run(convert.partial_shape_to_tensor(
+          [3, 6])))
+      self.assertAllEqual([3, 6], sess.run(convert.partial_shape_to_tensor(
+          constant_op.constant([3, 6], dtype=dtypes.int64))))
+
+      self.assertAllEqual([3, -1], sess.run(convert.partial_shape_to_tensor(
+          tensor_shape.TensorShape([3, None]))))
+      self.assertAllEqual([3, -1], sess.run(convert.partial_shape_to_tensor(
+          (3, None))))
+      self.assertAllEqual([3, -1], sess.run(convert.partial_shape_to_tensor(
+          [3, None])))
+      self.assertAllEqual([3, -1], sess.run(convert.partial_shape_to_tensor(
+          constant_op.constant([3, -1], dtype=dtypes.int64))))
+
+      self.assertAllEqual([-1, -1], sess.run(convert.partial_shape_to_tensor(
+          tensor_shape.TensorShape([None, None]))))
+      self.assertAllEqual([-1, -1], sess.run(convert.partial_shape_to_tensor(
+          (None, None))))
+      self.assertAllEqual([-1, -1], sess.run(convert.partial_shape_to_tensor(
+          [None, None])))
+      self.assertAllEqual([-1, -1], sess.run(convert.partial_shape_to_tensor(
+          constant_op.constant([-1, -1], dtype=dtypes.int64))))
+
+  def testPartialShapeToTensorScalar(self):
+    with self.test_session() as sess:
+      self.assertAllEqual([], sess.run(convert.partial_shape_to_tensor(
+          tensor_shape.TensorShape([]))))
+      self.assertAllEqual([], sess.run(convert.partial_shape_to_tensor(())))
+      self.assertAllEqual([], sess.run(convert.partial_shape_to_tensor([])))
+      self.assertAllEqual([], sess.run(convert.partial_shape_to_tensor(
+          constant_op.constant([], dtype=dtypes.int64))))
+
 
 if __name__ == "__main__":
   test.main()
-- 
GitLab


From 1faacc23e3341645ce11a9720775cb27c0694f4d Mon Sep 17 00:00:00 2001
From: Rachel Lim <rachelim@google.com>
Date: Fri, 8 Jun 2018 09:48:26 -0700
Subject: [PATCH 484/610] [tf.data] tf.contrib.data.CsvDataset: Add recovery
 for errors with quoted fields

PiperOrigin-RevId: 199807061
---
 .../contrib/data/kernels/csv_dataset_op.cc    | 84 +++++++------------
 .../kernel_tests/csv_dataset_op_test.py       | 21 ++++-
 2 files changed, 50 insertions(+), 55 deletions(-)

diff --git a/tensorflow/contrib/data/kernels/csv_dataset_op.cc b/tensorflow/contrib/data/kernels/csv_dataset_op.cc
index e88ad3dc32..4657807785 100644
--- a/tensorflow/contrib/data/kernels/csv_dataset_op.cc
+++ b/tensorflow/contrib/data/kernels/csv_dataset_op.cc
@@ -236,7 +236,7 @@ class CSVDatasetOp : public DatasetOpKernel {
         size_t num_parsed = 0;
         size_t num_selected_parsed = 0;
 
-        Status result = Status::OK();
+        Status result;
 
         while (!end_of_record) {  // Read till we reach \n, \r or EOF
           bool include =
@@ -329,6 +329,7 @@ class CSVDatasetOp : public DatasetOpKernel {
         size_t start = pos_;
         pos_++;  // Starting quotation mark
 
+        Status parse_result;
         while (true) {  // Each iter reads 1 char, filling buffer if necessary
           if (pos_ >= buffer_.size()) {
             Status s = SaveAndFillBuffer(&earlier_pieces, &start, include);
@@ -351,8 +352,9 @@ class CSVDatasetOp : public DatasetOpKernel {
               if (errors::IsOutOfRange(s)) {
                 // This was the last field. We are done
                 *end_of_record = true;
-                return QuotedFieldToOutput(ctx, StringPiece(), out_tensors,
-                                           earlier_pieces, include);
+                parse_result.Update(QuotedFieldToOutput(
+                    ctx, StringPiece(), out_tensors, earlier_pieces, include));
+                return parse_result;
               } else if (!s.ok()) {
                 return s;
               }
@@ -361,20 +363,24 @@ class CSVDatasetOp : public DatasetOpKernel {
             char next = buffer_[pos_];
             pos_++;
             if (next == dataset()->delim_) {
-              return QuotedFieldToOutput(
+              parse_result.Update(QuotedFieldToOutput(
                   ctx, StringPiece(&buffer_[start], pos_ - 1 - start),
-                  out_tensors, earlier_pieces, include);
+                  out_tensors, earlier_pieces, include));
+              return parse_result;
 
             } else if (next == '\n' || next == '\r') {
               *end_of_record = true;
-              Status s = QuotedFieldToOutput(
+              parse_result.Update(QuotedFieldToOutput(
                   ctx, StringPiece(&buffer_[start], pos_ - 1 - start),
-                  out_tensors, earlier_pieces, include);
+                  out_tensors, earlier_pieces, include));
               if (next == '\r') SkipNewLineIfNecessary();
-              return s;
+              return parse_result;
             } else if (next != '"') {
-              return errors::InvalidArgument(
-                  "Quote inside a string has to be escaped by another quote");
+              // Take note of the error, but keep going to end of field.
+              include = false;  // So we don't get funky errors when trying to
+                                // unescape the quotes.
+              parse_result.Update(errors::InvalidArgument(
+                  "Quote inside a string has to be escaped by another quote"));
             }
 
           } else {
@@ -454,6 +460,8 @@ class CSVDatasetOp : public DatasetOpKernel {
           EXCLUSIVE_LOCKS_REQUIRED(mu_) {
         std::vector<Piece> earlier_pieces;
         size_t start = pos_;
+        Status parse_result;
+
         while (true) {  // Each iter reads 1 char, filling buffer if necessary
           if (pos_ >= buffer_.size()) {
             Status s = SaveAndFillBuffer(&earlier_pieces, &start, include);
@@ -461,9 +469,10 @@ class CSVDatasetOp : public DatasetOpKernel {
             if (errors::IsOutOfRange(s)) {
               // Whatever we have is the last field of the last record
               *end_of_record = true;
-              return UnquotedFieldToOutput(
+              parse_result.Update(UnquotedFieldToOutput(
                   ctx, StringPiece(&buffer_[start], pos_ - start), out_tensors,
-                  earlier_pieces, include);
+                  earlier_pieces, include));
+              return parse_result;
             } else if (!s.ok()) {
               return s;  // Surface all other errors to caller
             }
@@ -472,66 +481,33 @@ class CSVDatasetOp : public DatasetOpKernel {
           char ch = buffer_[pos_];
 
           if (ch == dataset()->delim_) {
-            Status s = UnquotedFieldToOutput(
+            parse_result.Update(UnquotedFieldToOutput(
                 ctx, StringPiece(&buffer_[start], pos_ - start), out_tensors,
-                earlier_pieces, include);
+                earlier_pieces, include));
             pos_++;
-            return s;
+            return parse_result;
           }
           if (ch == '\n' || ch == '\r') {
             // need special case to skip over first \n of record if the line
             // breaks are \r\n
-            Status s = UnquotedFieldToOutput(
+            parse_result.Update(UnquotedFieldToOutput(
                 ctx, StringPiece(&buffer_[start], pos_ - start), out_tensors,
-                earlier_pieces, include);
+                earlier_pieces, include));
             *end_of_record = true;
             pos_++;
             if (ch == '\r') SkipNewLineIfNecessary();
-            return s;
+            return parse_result;
           }
           if (dataset()->use_quote_delim_ && ch == '"') {
-            // Advance pos_ to the next field anyway so that we can ignore
-            // errors gracefully if required. The caller of this will be able to
-            // call ParseOneField and continue with the rest of the record.
-            AdvanceToNextField(end_of_record);
-            return errors::InvalidArgument(
-                "Unquoted fields cannot have quotes inside");
+            // Take note of the error, but keep going to end of field.
+            parse_result.Update(errors::InvalidArgument(
+                "Unquoted fields cannot have quotes inside"));
           }
           // Otherwise, go to next character
           pos_++;
         }
       }
 
-      // Advances pos_ to the start of the next field, as delimited by delim,
-      // CRLF, or EOF, ignoring errors, and not keeping track of characters in
-      // the current field.
-      void AdvanceToNextField(bool* end_of_record)
-          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-        while (true) {
-          if (pos_ >= buffer_.size()) {
-            Status s = FillBuffer(&buffer_);
-            pos_ = 0;
-            if (!s.ok()) {
-              *end_of_record = true;
-              return;
-            }
-          }
-
-          char ch = buffer_[pos_];
-          pos_++;
-
-          if (ch == dataset()->delim_) {
-            return;
-          }
-
-          if (ch == '\n' || ch == '\r') {
-            *end_of_record = true;
-            if (ch == '\r') SkipNewLineIfNecessary();
-            return;
-          }
-        }
-      }
-
       Status FillBuffer(string* result) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
         result->clear();
         Status s = input_stream_->ReadNBytes(dataset()->buffer_size_, result);
diff --git a/tensorflow/contrib/data/python/kernel_tests/csv_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/csv_dataset_op_test.py
index 74b90ec7d1..97b5e94165 100644
--- a/tensorflow/contrib/data/python/kernel_tests/csv_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/csv_dataset_op_test.py
@@ -162,9 +162,28 @@ class CsvDatasetOpTest(test.TestCase):
         expected_err_re='Unquoted fields cannot have quotes inside',
         record_defaults=record_defaults)
 
+  def testCsvDataset_errWithUnescapedQuotes(self):
+    record_defaults = [['']] * 3
+    inputs = [['"a"b","c","d"']]
+    self._test_dataset(
+        inputs,
+        expected_err_re=
+        'Quote inside a string has to be escaped by another quote',
+        record_defaults=record_defaults)
+
+  def testCsvDataset_ignoreErrWithUnescapedQuotes(self):
+    record_defaults = [['']] * 3
+    inputs = [['1,"2"3",4', '1,"2"3",4",5,5', 'a,b,"c"d"', 'e,f,g']]
+    filenames = self.setup_files(inputs)
+    with ops.Graph().as_default() as g:
+      with self.test_session(graph=g) as sess:
+        dataset = readers.CsvDataset(filenames, record_defaults=record_defaults)
+        dataset = dataset.apply(error_ops.ignore_errors())
+        self._verify_output_or_err(sess, dataset, [['e', 'f', 'g']])
+
   def testCsvDataset_ignoreErrWithUnquotedQuotes(self):
     record_defaults = [['']] * 3
-    inputs = [['1,2"3,4', 'a,b,c"d', 'e,f,g']]
+    inputs = [['1,2"3,4', 'a,b,c"d', '9,8"7,6,5', 'e,f,g']]
     filenames = self.setup_files(inputs)
     with ops.Graph().as_default() as g:
       with self.test_session(graph=g) as sess:
-- 
GitLab


From 8566ebe58ff5b08864ddef6fe743fdd80962465b Mon Sep 17 00:00:00 2001
From: Thomas Joerg <tjoerg@google.com>
Date: Fri, 8 Jun 2018 09:52:21 -0700
Subject: [PATCH 485/610] [XLA:GPU] Add a multi-output fusion pass to fuse
 sibling reduce instructions.

Stop creating pre-fused nodes in BatchNormExpander.

PiperOrigin-RevId: 199807585
---
 tensorflow/compiler/xla/service/gpu/BUILD     |  29 ++++
 .../compiler/xla/service/gpu/gpu_compiler.cc  |   6 +-
 .../xla/service/gpu/multi_output_fusion.cc    | 118 +++++++++++++++
 .../xla/service/gpu/multi_output_fusion.h     |  55 +++++++
 .../service/gpu/multi_output_fusion_test.cc   | 138 ++++++++++++++++++
 5 files changed, 343 insertions(+), 3 deletions(-)
 create mode 100644 tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc
 create mode 100644 tensorflow/compiler/xla/service/gpu/multi_output_fusion.h
 create mode 100644 tensorflow/compiler/xla/service/gpu/multi_output_fusion_test.cc

diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD
index 5e5ca7c72c..5e02631a58 100644
--- a/tensorflow/compiler/xla/service/gpu/BUILD
+++ b/tensorflow/compiler/xla/service/gpu/BUILD
@@ -423,6 +423,34 @@ tf_cc_test(
     ],
 )
 
+cc_library(
+    name = "multi_output_fusion",
+    srcs = ["multi_output_fusion.cc"],
+    hdrs = ["multi_output_fusion.h"],
+    deps = [
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xla/service:multi_output_fusion",
+        "//tensorflow/core:lib",
+    ],
+)
+
+tf_cc_test(
+    name = "multi_output_fusion_test",
+    srcs = ["multi_output_fusion_test.cc"],
+    deps = [
+        ":multi_output_fusion",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xla/service:hlo_matchers",
+        "//tensorflow/compiler/xla/service:hlo_parser",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:lib",
+    ],
+)
+
 cc_library(
     name = "gpu_copy_insertion",
     srcs = ["gpu_copy_insertion.cc"],
@@ -523,6 +551,7 @@ cc_library(
         ":instruction_fusion",
         ":ir_emission_utils",
         ":ir_emitter",
+        ":multi_output_fusion",
         ":pad_insertion",
         ":partition_assignment",
         ":stream_assignment",
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
index b857219807..c995736af9 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
@@ -52,6 +52,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/gpu/ir_emitter_context.h"
 #include "tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h"
 #include "tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/gpu_backend_lib.h"
+#include "tensorflow/compiler/xla/service/gpu/multi_output_fusion.h"
 #include "tensorflow/compiler/xla/service/gpu/pad_insertion.h"
 #include "tensorflow/compiler/xla/service/gpu/partition_assignment.h"
 #include "tensorflow/compiler/xla/service/gpu/stream_assignment.h"
@@ -159,13 +160,11 @@ Status OptimizeHloModule(HloModule* hlo_module, se::StreamExecutor* stream_exec,
       if (hlo_module->config().debug_options().xla_gpu_use_cudnn_batchnorm()) {
         pass.AddPass<CudnnBatchNormRewriter>();
       }
-      // TODO(kramerb): Remove use_fusion once instruction fusion can create
-      // multi-output fusions from the unfused expander output.
       pass.AddPass<BatchNormExpander>(
           /*rewrite_training_op=*/true,
           /*rewrite_inference_op=*/true,
           /*rewrite_grad_op=*/true,
-          /*use_fusion=*/true);
+          /*use_fusion=*/false);
 
       // Rewrite gather ops into smaller ones.
       pass.AddPass<GatherExpander>();
@@ -261,6 +260,7 @@ Status OptimizeHloModule(HloModule* hlo_module, se::StreamExecutor* stream_exec,
     fusion.AddPass<GpuInstructionFusion>(/*may_duplicate=*/false);
     fusion.AddPass<GpuInstructionFusion>(/*may_duplicate=*/true);
     fusion.AddPass<FusionMerger>();
+    fusion.AddPass<GpuMultiOutputFusion>();
     TF_RETURN_IF_ERROR(fusion.Run(hlo_module).status());
 
     HloPassPipeline reduce_pipeline("reduce-precision");
diff --git a/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc b/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc
new file mode 100644
index 0000000000..86c5c4fb6f
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc
@@ -0,0 +1,118 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/gpu/multi_output_fusion.h"
+
+#include <stdint.h>
+#include <algorithm>
+#include <iterator>
+#include <list>
+#include <memory>
+#include <string>
+#include <utility>
+
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace xla {
+namespace gpu {
+
+GpuMultiOutputFusion::GpuMultiOutputFusion() : MultiOutputFusion(INT64_MAX) {}
+
+bool GpuMultiOutputFusion::ShapesCompatibleForFusion(HloInstruction* instr1,
+                                                     HloInstruction* instr2) {
+  // Shape of one output of `instr`; for a fusion, taken from its fused root.
+  auto get_element_shape = [](HloInstruction* instr) {
+    const HloInstruction* element_instr = instr;
+    if (instr->opcode() == HloOpcode::kFusion) {
+      auto fused_expression_root = instr->fused_expression_root();
+      if (instr->IsMultiOutputFusion()) {
+        // All tuple operands should have the same shape; use the first one.
+        element_instr = fused_expression_root->operands()[0];
+      } else {
+        element_instr = fused_expression_root;
+      }
+    }
+    return element_instr->shape();
+  };
+
+  // The elementwise output shapes must be the same (including layout).
+  return ShapeUtil::Equal(get_element_shape(instr1),
+                          get_element_shape(instr2));
+}
+
+bool GpuMultiOutputFusion::IsProfitableOperand(HloInstruction* instr) {
+  // An effectively-scalar kConstant is not counted as a memory read, so
+  // sharing it yields no profit. Skip it.
+  if (instr->opcode() == HloOpcode::kConstant &&
+      ShapeUtil::IsEffectiveScalar(instr->shape())) {
+    return false;
+  }
+  // Fusing producer/consumer instructions is not the goal here -- that should
+  // be taken care of by the instruction_fusion pass. If instr has only
+  // one user, it will not have sibling instructions. We won't consider it.
+  if (instr->user_count() < 2) {
+    return false;
+  }
+  return true;
+}
+
+namespace {
+// Returns true if `instr` is a reduce, a fusion rooted at a reduce, or a
+// multi-output fusion with at least one reduce among its root operands.
+bool IsReduction(HloInstruction* instr) {
+  const auto is_reduce = [](const HloInstruction* hlo) {
+    return hlo->opcode() == HloOpcode::kReduce;
+  };
+  if (instr->IsMultiOutputFusion()) {
+    const auto& outputs = instr->fused_expression_root()->operands();
+    return std::any_of(outputs.begin(), outputs.end(), is_reduce);
+  }
+  if (instr->opcode() == HloOpcode::kFusion) {
+    return is_reduce(instr->fused_expression_root());
+  }
+  return is_reduce(instr);
+}
+}  // namespace
+
+bool GpuMultiOutputFusion::IsFusible(HloInstruction* instr) {
+  return IsReduction(instr);
+}
+
+int64 GpuMultiOutputFusion::GetProfit(HloInstruction* instr1,
+                                      HloInstruction* instr2) {
+  // Collect the operands of instr1 that are worth sharing.
+  tensorflow::gtl::FlatSet<HloInstruction*> candidates;
+  for (HloInstruction* operand : instr1->operands()) {
+    if (IsProfitableOperand(operand)) {
+      candidates.insert(operand);
+    }
+  }
+  // Sum the byte sizes of instr2's profitable operands that instr1 also uses.
+  int64 saved_bytes = 0;
+  for (HloInstruction* operand : instr2->operands()) {
+    if (IsProfitableOperand(operand) && candidates.count(operand) != 0) {
+      saved_bytes += ShapeUtil::ByteSizeOf(operand->shape());
+    }
+  }
+  VLOG(2) << "Fusing instr1=" << instr1->name() << " instr2=" << instr2->name()
+          << ", the profit is =" << saved_bytes;
+  return saved_bytes;
+}
+
+}  // namespace gpu
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/multi_output_fusion.h b/tensorflow/compiler/xla/service/gpu/multi_output_fusion.h
new file mode 100644
index 0000000000..5451a93cec
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/multi_output_fusion.h
@@ -0,0 +1,55 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_GPU_MULTI_OUTPUT_FUSION_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_GPU_MULTI_OUTPUT_FUSION_H_
+
+#include "tensorflow/compiler/xla/service/multi_output_fusion.h"
+
+namespace xla {
+namespace gpu {
+
+// Multi-output fusion of sibling and producer-consumer instructions for the
+// GPU backend.
+class GpuMultiOutputFusion : public MultiOutputFusion {
+ public:
+  GpuMultiOutputFusion();
+
+ protected:
+  // Returns true if instr1 and instr2 have compatible shapes, so that they
+  // can legally be fused.
+  bool ShapesCompatibleForFusion(HloInstruction* instr1,
+                                 HloInstruction* instr2) override;
+
+  // We currently only consider reduce and reduce fusion nodes as candidates.
+  bool IsFusible(HloInstruction* instr) override;
+
+  // This function estimates the amount of memory reads saved by merging
+  // instr1 and instr2 into one multi-output fusion instruction. For a fusion
+  // instruction, all the operands need to be loaded from memory. If we merge
+  // instr1 and instr2, common operands will not be loaded twice. The profit is
+  // estimated as the size of the operands common to instr1 and instr2.
+  int64 GetProfit(HloInstruction* instr1, HloInstruction* instr2) override;
+
+  // Whether fusing the instruction can reduce memory reads.
+  //
+  // TODO(tjoerg): Move this method up into the MultiOutputFusion base class.
+  bool IsProfitableOperand(HloInstruction* instr) override;
+};
+
+}  // namespace gpu
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_GPU_MULTI_OUTPUT_FUSION_H_
diff --git a/tensorflow/compiler/xla/service/gpu/multi_output_fusion_test.cc b/tensorflow/compiler/xla/service/gpu/multi_output_fusion_test.cc
new file mode 100644
index 0000000000..d0b4c88487
--- /dev/null
+++ b/tensorflow/compiler/xla/service/gpu/multi_output_fusion_test.cc
@@ -0,0 +1,138 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/gpu/multi_output_fusion.h"
+
+#include "tensorflow/compiler/xla/service/hlo_matchers.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+
+namespace op = xla::testing::opcode_matchers;
+
+namespace xla {
+namespace gpu {
+
+using InstructionFusionTest = HloTestBase;
+
+const char kModulePrefix[] = R"(
+    HloModule test_module
+
+    scalar_add_computation {
+      scalar_lhs = f32[] parameter(0)
+      scalar_rhs = f32[] parameter(1)
+      ROOT add = f32[] add(scalar_lhs, scalar_rhs)
+    })";
+
+TEST_F(InstructionFusionTest, MultiOutputFusionSiblingReduceAndReduceFusion) {
+  // Fusion with reduce instruction root and a sibling reduce instruction
+  // sharing the same input param.
+  auto module = ParseHloString(tensorflow::strings::StrCat(kModulePrefix, R"(
+    fused_computation {
+      p1.1 = f32[128,512,28,28]{3,2,1,0} parameter(1)
+      mul = f32[128,512,28,28]{3,2,1,0} multiply(p1.1, p1.1)
+      const.1 = f32[] parameter(0)
+      ROOT reduce.1 = f32[512]{0} reduce(mul, const.1), dimensions={0,2,3}, to_apply=scalar_add_computation
+    }
+
+    ENTRY entry {
+      p0 = f32[] parameter(0)
+      p1 = f32[128,512,28,28]{3,2,1,0} parameter(1)
+      const.2 = f32[] constant(1)
+      fusion = f32[512] fusion(p0, p1), kind=kInput, calls=fused_computation
+      reduce.2 = f32[512]{0} reduce(p1, const.2), dimensions={0,2,3}, to_apply=scalar_add_computation
+      ROOT root = (f32[512]{0}, f32[512]{0}) tuple(fusion, reduce.2)
+    })"))
+                    .ValueOrDie();
+  ASSERT_TRUE(GpuMultiOutputFusion().Run(module.get()).ValueOrDie());
+  SCOPED_TRACE(module->ToString());
+  const HloInstruction* fusion =
+      module->entry_computation()->root_instruction()->operand(0)->operand(0);
+  ASSERT_TRUE(fusion->IsMultiOutputFusion());
+  EXPECT_THAT(fusion->fused_expression_root(),
+              op::Tuple(op::Reduce(), op::Reduce()));
+}
+
+TEST_F(InstructionFusionTest, MultiOutputFusionSiblingReduceFusions) {
+  // Two sibling fusions with reduce instruction roots sharing the same input
+  // param.
+  auto module = ParseHloString(tensorflow::strings::StrCat(kModulePrefix, R"(
+    fused_computation_1 {
+      p1.1 = f32[128,512,28,28]{3,2,1,0} parameter(1)
+      mul = f32[128,512,28,28]{3,2,1,0} multiply(p1.1, p1.1)
+      const.1 = f32[] parameter(0)
+      ROOT reduce.1 = f32[512]{0} reduce(mul, const.1), dimensions={0,2,3}, to_apply=scalar_add_computation
+    }
+
+    fused_computation_2 {
+      p1.2 = f32[128,512,28,28]{3,2,1,0} parameter(1)
+      const.2 = f32[] parameter(0)
+      ROOT reduce.2 = f32[512]{0} reduce(p1.2, const.2), dimensions={0,2,3}, to_apply=scalar_add_computation
+    }
+
+    ENTRY entry {
+      p0 = f32[] parameter(0)
+      p1 = f32[128,512,28,28]{3,2,1,0} parameter(1)
+      fusion.1 = f32[512] fusion(p0, p1), kind=kInput, calls=fused_computation_1
+      fusion.2 = f32[512] fusion(p0, p1), kind=kInput, calls=fused_computation_2
+      ROOT root = (f32[512]{0}, f32[512]{0}) tuple(fusion.1, fusion.2)
+    })"))
+                    .ValueOrDie();
+  ASSERT_TRUE(GpuMultiOutputFusion().Run(module.get()).ValueOrDie());
+  SCOPED_TRACE(module->ToString());
+  const HloInstruction* fusion =
+      module->entry_computation()->root_instruction()->operand(0)->operand(0);
+  ASSERT_TRUE(fusion->IsMultiOutputFusion());
+  EXPECT_THAT(fusion->fused_expression_root(),
+              op::Tuple(op::Reduce(), op::Reduce()));
+}
+
+TEST_F(InstructionFusionTest,
+       MultiOutputFusionSiblingReduceAndReduceMultiOutputFusion) {
+  // Multi-output fusion with two reduce instructions root and a sibling reduce
+  // instruction sharing the same input param.
+  auto module = ParseHloString(tensorflow::strings::StrCat(kModulePrefix, R"(
+    fused_computation (p0: f32[128,512,28,28]) -> (f32[512], f32[512]) {
+      const.1 = f32[] constant(1)
+      p0.1 = f32[128,512,28,28]{3,2,1,0} parameter(0)
+      mul = f32[128,512,28,28]{3,2,1,0} multiply(f32[128,512,28,28]{3,2,1,0} p0.1, f32[128,512,28,28]{3,2,1,0} p0.1)
+      reduce.1 = f32[512]{0} reduce(f32[128,512,28,28]{3,2,1,0} mul, f32[] const.1), dimensions={0,2,3}, to_apply=scalar_add_computation
+      reduce.2 = f32[512]{0} reduce(f32[128,512,28,28]{3,2,1,0} p0.1, f32[] const.1), dimensions={0,2,3}, to_apply=scalar_add_computation
+      ROOT tuple = (f32[512]{0}, f32[512]{0}) tuple(f32[512]{0} reduce.1, f32[512]{0} reduce.2)
+    }
+
+    ENTRY entry (p0: f32[128,512,28,28]) -> (f32[512], f32[512], f32[512]) {
+      p0 = f32[128,512,28,28]{3,2,1,0} parameter(0)
+      const = f32[] constant(1)
+      fusion = (f32[512]{0}, f32[512]{0}) fusion(f32[128,512,28,28]{3,2,1,0} p0), kind=kInput, calls=fused_computation
+      get-tuple-element = f32[512]{0} get-tuple-element((f32[512]{0}, f32[512]{0}) fusion), index=0
+      get-tuple-element.1 = f32[512]{0} get-tuple-element((f32[512]{0}, f32[512]{0}) fusion), index=1
+      reduce.3 = f32[512]{0} reduce(p0, const), dimensions={0,2,3}, to_apply=scalar_add_computation
+      ROOT root = (f32[512]{0}, f32[512]{0}, f32[512]{0}) tuple(f32[512]{0} get-tuple-element, f32[512]{0} get-tuple-element.1, f32[512]{0} reduce.3)
+    })"))
+                    .ValueOrDie();
+  ASSERT_TRUE(GpuMultiOutputFusion().Run(module.get()).ValueOrDie());
+  SCOPED_TRACE(module->ToString());
+  const HloInstruction* fusion =
+      module->entry_computation()->root_instruction()->operand(0)->operand(0);
+  ASSERT_TRUE(fusion->IsMultiOutputFusion());
+  EXPECT_THAT(fusion->fused_expression_root(),
+              op::Tuple(op::Reduce(), op::Reduce(), op::Reduce()));
+}
+
+}  // namespace gpu
+}  // namespace xla
-- 
GitLab


From 0ef76693fdab2a4d1a4923444a2593f79a6b7873 Mon Sep 17 00:00:00 2001
From: Dimitris Vardoulakis <dimvar@google.com>
Date: Fri, 8 Jun 2018 10:02:44 -0700
Subject: [PATCH 486/610] Automated g4 rollback of changelist 199308328

PiperOrigin-RevId: 199809082
---
 .../xla/service/algebraic_simplifier_test.cc  | 47 +++++++++----------
 tensorflow/compiler/xla/tests/hlo_test_base.h | 17 +++----
 .../xla/tests/hlo_verified_test_base.cc       | 20 +++++---
 .../xla/tests/hlo_verified_test_base.h        | 16 ++++++-
 4 files changed, 60 insertions(+), 40 deletions(-)

diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
index cda157f9fa..27eb48181e 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
@@ -1714,7 +1714,7 @@ TEST_F(AlgebraicSimplifierTest, RemoveNoopPad) {
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(module).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(), param);
 }
@@ -1759,7 +1759,7 @@ TEST_F(AlgebraicSimplifierTest, NegativePadding) {
   EXPECT_THAT(computation->root_instruction(), op::Pad(param, zero));
   EXPECT_TRUE(has_negative_padding(pad));
 
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(module).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(), op::Slice(op::Pad(param, zero)));
   EXPECT_FALSE(
@@ -1781,7 +1781,7 @@ TEST_F(AlgebraicSimplifierTest, RemoveNoopReshape) {
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(module).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(), param);
 }
@@ -1804,7 +1804,7 @@ TEST_F(AlgebraicSimplifierTest, RemoveNoopSlice) {
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(module).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(), param);
 }
@@ -1932,7 +1932,8 @@ TEST_F(AlgebraicSimplifierTest, ConvertConvToMatmul) {
     b.AddInstruction(HloInstruction::CreateConvolve(out_shape, input, filter,
                                                     window, dnums));
 
-    auto module = CreateNewModule();
+    // TODO(b/80488902): verify this module.
+    auto module = HloTestBase::CreateNewModule();
     auto* computation = module->AddEntryComputation(b.Build());
 
     AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/true,
@@ -2060,7 +2061,7 @@ TEST_F(AlgebraicSimplifierTest, MaxMinToClamp) {
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(module).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Clamp(max_value, param0, min_value));
@@ -2090,7 +2091,7 @@ TEST_F(AlgebraicSimplifierTest, MinMaxToClamp) {
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(module).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Clamp(max_value, param0, min_value));
@@ -2121,7 +2122,7 @@ TEST_F(AlgebraicSimplifierTest, MinMaxWithBroadcastToClamp) {
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(module).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Clamp(max_value, param0, min_value));
@@ -2151,7 +2152,7 @@ TEST_F(AlgebraicSimplifierTest, MinMaxNotToClamp) {
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  EXPECT_FALSE(simplifier.Run(module.get()).ValueOrDie());
+  EXPECT_FALSE(simplifier.Run(module).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Minimum(op::Maximum(param0, max_value), min_value));
@@ -2184,7 +2185,7 @@ TEST_F(AlgebraicSimplifierTest, MinEquationWithMaxNotToClamp) {
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  EXPECT_FALSE(simplifier.Run(module.get()).ValueOrDie());
+  EXPECT_FALSE(simplifier.Run(module).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
               op::Minimum(op::Add(op::Maximum(param0, max_value), max_value),
@@ -2200,10 +2201,8 @@ TEST_F(AlgebraicSimplifierTest, ScalarBroadcastToSlice) {
       HloInstruction::CreateParameter(0, r0f32, "scalar_param"));
 
   Shape broadcast_shape = ShapeUtil::MakeShape(F32, {4, 5, 6, 7});
-  HloInstruction* broadcast =
-      builder.AddInstruction(HloInstruction::CreateBroadcast(
-          broadcast_shape, scalar_param,
-          AsInt64Slice(broadcast_shape.dimensions())));
+  HloInstruction* broadcast = builder.AddInstruction(
+      HloInstruction::CreateBroadcast(broadcast_shape, scalar_param, {}));
 
   Shape slice_shape = ShapeUtil::MakeShape(F32, {2, 2, 3, 3});
   HloInstruction* slice = builder.AddInstruction(HloInstruction::CreateSlice(
@@ -2219,10 +2218,10 @@ TEST_F(AlgebraicSimplifierTest, ScalarBroadcastToSlice) {
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
 
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(module).ValueOrDie());
 
   // Running simplification again should not result in any further changes.
-  ASSERT_FALSE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_FALSE(simplifier.Run(module).ValueOrDie());
 
   root = computation->root_instruction();
   EXPECT_THAT(root, op::Broadcast(scalar_param));
@@ -2237,10 +2236,8 @@ TEST_F(AlgebraicSimplifierTest, ScalarBroadcastToTransposeReshape) {
       HloInstruction::CreateConstant(Literal::CreateR0<float>(42.0f)));
 
   Shape broadcast_shape = ShapeUtil::MakeShape(F32, {4, 5, 6});
-  HloInstruction* broadcast =
-      builder.AddInstruction(HloInstruction::CreateBroadcast(
-          broadcast_shape, forty_two,
-          AsInt64Slice(broadcast_shape.dimensions())));
+  HloInstruction* broadcast = builder.AddInstruction(
+      HloInstruction::CreateBroadcast(broadcast_shape, forty_two, {}));
 
   HloInstruction* transpose =
       builder.AddInstruction(HloInstruction::CreateTranspose(
@@ -2259,7 +2256,7 @@ TEST_F(AlgebraicSimplifierTest, ScalarBroadcastToTransposeReshape) {
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(module).ValueOrDie());
 
   root = computation->root_instruction();
   EXPECT_THAT(root, op::Broadcast(forty_two));
@@ -2268,7 +2265,8 @@ TEST_F(AlgebraicSimplifierTest, ScalarBroadcastToTransposeReshape) {
 
 // Test that ReduceWindow(Pad(op, x), y) can simplify to ReduceWindow(op, x).
 TEST_F(AlgebraicSimplifierTest, FoldPadIntoReduceWindow) {
-  auto module = CreateNewModule();
+  // TODO(b/80488902): verify this module.
+  auto module = HloTestBase::CreateNewModule();
   HloComputation::Builder builder(TestName());
 
   // Create operand to the pad.
@@ -2349,7 +2347,8 @@ TEST_F(AlgebraicSimplifierTest, FoldPadIntoReduceWindow) {
 // Test that ReduceWindow(Convert(Pad(op, x)), y) can simplify to
 // ReduceWindow(Convert(op), x).
 TEST_F(AlgebraicSimplifierTest, FoldConvertedPadIntoReduceWindow) {
-  auto module = CreateNewModule();
+  // TODO(b/80488902): verify this module.
+  auto module = HloTestBase::CreateNewModule();
   HloComputation::Builder builder(TestName());
 
   // Create operand to the pad.
@@ -2444,7 +2443,7 @@ TEST_F(AlgebraicSimplifierTest, ReversalOfTrivialDimensionsToBitcast) {
 
   AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
                                  non_bitcasting_callback());
-  ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  ASSERT_TRUE(simplifier.Run(module).ValueOrDie());
 
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(a, root);
diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.h b/tensorflow/compiler/xla/tests/hlo_test_base.h
index eb3a2ea76a..249da87f48 100644
--- a/tensorflow/compiler/xla/tests/hlo_test_base.h
+++ b/tensorflow/compiler/xla/tests/hlo_test_base.h
@@ -66,6 +66,15 @@ namespace xla {
 //
 // For a more detailed example, see "../tests/sample_text_test.cc".
 class HloTestBase : public ::testing::Test {
+ public:
+  // Creates a new HLO module for a test. The module created will have
+  // TestName() for its name; it will also automatically populate its debug
+  // options from command-line flags. If you want a fresh HloModule object and
+  // then add HloComputations to it, it's recommended to use this method in your
+  // tests.
+  static std::unique_ptr<HloModule> CreateNewModule(
+      const string& name = TestName());
+
  protected:
   // This uses the interpreter backend as the reference backend and
   // automatically finds another supported backend as the test backend. If the
@@ -80,14 +89,6 @@ class HloTestBase : public ::testing::Test {
 
   ~HloTestBase() override {}
 
-  // Creates a new HLO module for a test. The module created will have
-  // TestName() for its name; it will also automatically populate its debug
-  // options from command-line flags. If you want a fresh HloModule object and
-  // then add HloComputations to it, it's recommended to use this method in your
-  // tests.
-  static std::unique_ptr<HloModule> CreateNewModule(
-      const string& name = TestName());
-
   // Populates debug options from command-line flags and adjusts the options for
   // testing. It is recommended to use this when you need to pass in
   // DebugOptions, e.g. when creating a module from a string or a file.
diff --git a/tensorflow/compiler/xla/tests/hlo_verified_test_base.cc b/tensorflow/compiler/xla/tests/hlo_verified_test_base.cc
index c8a05c2e9e..22c664d142 100644
--- a/tensorflow/compiler/xla/tests/hlo_verified_test_base.cc
+++ b/tensorflow/compiler/xla/tests/hlo_verified_test_base.cc
@@ -41,14 +41,17 @@ void HloVerifiedTestBase::TearDown() {
       << "TearDown called more than once; it should be called exactly once.";
   tear_down_called_ = true;
   if (module_) {
-    VerifyModule();
+    VerifyModule(module_.get());
+  }
+  for (int i = 0; i < modules_.size(); ++i) {
+    VerifyModule(modules_.at(i).get());
   }
   HloTestBase::TearDown();
 }
 
-void HloVerifiedTestBase::VerifyModule() {
-  HloVerifier verifier;
-  xla::StatusOr<bool> mutated = verifier.Run(module_.get());
+void HloVerifiedTestBase::VerifyModule(HloModule* module) {
+  HloVerifier verifier(/*allow_mixed_precision=*/true);
+  xla::StatusOr<bool> mutated = verifier.Run(module);
   if (!mutated.ok()) {
     ADD_FAILURE() << "HloVerifier failed: " << mutated.status();
   } else {
@@ -59,15 +62,20 @@ void HloVerifiedTestBase::VerifyModule() {
 
 HloModule& HloVerifiedTestBase::module() {
   if (!module_) {
-    module_ = CreateNewModule();
+    module_ = HloTestBase::CreateNewModule();
   }
   return *module_;
 }
 
+HloModule* HloVerifiedTestBase::CreateNewModule(const string& name) {
+  modules_.emplace_back(HloTestBase::CreateNewModule(name));
+  return modules_.back().get();
+}
+
 void HloVerifiedTestBase::ParseAndVerifyModule(
     tensorflow::StringPiece hlo_text) {
   CHECK(!module_) << "Called ParseModule when test already has a module.";
   TF_ASSERT_OK_AND_ASSIGN(module_, ParseHloString(hlo_text));
-  VerifyModule();
+  VerifyModule(module_.get());
 }
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/hlo_verified_test_base.h b/tensorflow/compiler/xla/tests/hlo_verified_test_base.h
index e5bb14a883..5b59cc77f6 100644
--- a/tensorflow/compiler/xla/tests/hlo_verified_test_base.h
+++ b/tensorflow/compiler/xla/tests/hlo_verified_test_base.h
@@ -52,11 +52,23 @@ class HloVerifiedTestBase : public HloTestBase {
     shape_verifier_ = std::move(shape_verifier);
   }
 
+  // Creates a new module for a test, and stores it in modules_ so it can be
+  // verified. Intentionally hides HloTestBase::CreateNewModule, to prevent
+  // creation of unverified modules.
+  HloModule* CreateNewModule(const string& name = TestName());
+
+  // It is confusing to store modules created by module() and CreateNewModule()
+  // in different fields, but it allows us to migrate tests to
+  // HloVerifiedTestBase more easily, so it's a win because we can verify more
+  // modules. See b/80488902.
  private:
-  std::unique_ptr<HloModule> module_;  // Lazily populated. Access via module().
+  // Lazily populated. Access via module().
+  std::unique_ptr<HloModule> module_;
+  // Populated by calls to CreateNewModule.
+  std::vector<std::unique_ptr<HloModule>> modules_;
   std::unique_ptr<ShapeVerifier> shape_verifier_;
   bool tear_down_called_ = false;
-  void VerifyModule();
+  static void VerifyModule(HloModule* module);
 };
 
 }  // namespace xla
-- 
GitLab


From da68f5f45b6b568fecffd53cba0ce382f0d034f9 Mon Sep 17 00:00:00 2001
From: Hsien-Yang Li <seanli9jan@gmail.com>
Date: Sat, 9 Jun 2018 01:35:48 +0800
Subject: [PATCH 487/610] Add decode uint16 PNG images support for
 tf.image.decode_image. (#18628)

* Add decode uint16 images support for tf.image.decode_image.

* Decode to a tensor with dtype.

* Add testcase for decode_image.

* Add float32 testcase for decode_image.

* Fix build error

* Regenerate the tensorflow.image.pbtxt
---
 tensorflow/python/ops/image_ops_impl.py       | 22 +++--
 tensorflow/python/ops/image_ops_test.py       | 83 +++++++++++++++++++
 .../tools/api/golden/tensorflow.image.pbtxt   |  2 +-
 3 files changed, 98 insertions(+), 9 deletions(-)

diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py
index 4a32f2351b..95d05cd4d1 100644
--- a/tensorflow/python/ops/image_ops_impl.py
+++ b/tensorflow/python/ops/image_ops_impl.py
@@ -1556,13 +1556,13 @@ def is_jpeg(contents, name=None):
 
 
 @tf_export('image.decode_image')
-def decode_image(contents, channels=None, name=None):
+def decode_image(contents, channels=None, dtype=dtypes.uint8, name=None):
   """Convenience function for `decode_bmp`, `decode_gif`, `decode_jpeg`,
   and `decode_png`.
 
   Detects whether an image is a BMP, GIF, JPEG, or PNG, and performs the
-  appropriate operation to convert the input bytes `string` into a `Tensor` of
-  type `uint8`.
+  appropriate operation to convert the input bytes `string` into a `Tensor`
+  of type `dtype`.
 
   Note: `decode_gif` returns a 4-D array `[num_frames, height, width, 3]`, as
   opposed to `decode_bmp`, `decode_jpeg` and `decode_png`, which return 3-D
@@ -1574,10 +1574,11 @@ def decode_image(contents, channels=None, name=None):
     contents: 0-D `string`. The encoded image bytes.
     channels: An optional `int`. Defaults to `0`. Number of color channels for
       the decoded image.
+    dtype: The desired DType of the returned `Tensor`.
     name: A name for the operation (optional)
 
   Returns:
-    `Tensor` with type `uint8` with shape `[height, width, num_channels]` for
+    `Tensor` with type `dtype` and shape `[height, width, num_channels]` for
       BMP, JPEG, and PNG images and shape `[num_frames, height, width, 3]` for
       GIF images.
 
@@ -1601,7 +1602,7 @@ def decode_image(contents, channels=None, name=None):
       channels_msg = 'Channels must be in (None, 0, 3) when decoding BMP images'
       assert_channels = control_flow_ops.Assert(good_channels, [channels_msg])
       with ops.control_dependencies([assert_decode, assert_channels]):
-        return gen_image_ops.decode_bmp(contents)
+        return convert_image_dtype(gen_image_ops.decode_bmp(contents), dtype)
 
     def _gif():
       # Create assert to make sure that channels is not set to 1
@@ -1614,7 +1615,7 @@ def decode_image(contents, channels=None, name=None):
       channels_msg = 'Channels must be in (None, 0, 3) when decoding GIF images'
       assert_channels = control_flow_ops.Assert(good_channels, [channels_msg])
       with ops.control_dependencies([assert_channels]):
-        return gen_image_ops.decode_gif(contents)
+        return convert_image_dtype(gen_image_ops.decode_gif(contents), dtype)
 
     def check_gif():
       # Create assert op to check that bytes are GIF decodable
@@ -1623,7 +1624,11 @@ def decode_image(contents, channels=None, name=None):
 
     def _png():
       """Decodes a PNG image."""
-      return gen_image_ops.decode_png(contents, channels)
+      # Decode as uint8 only when uint8 is requested; any other `dtype` is
+      # decoded as uint16 and then converted by convert_image_dtype.
+      png_dtype = dtypes.uint8 if dtype == dtypes.uint8 else dtypes.uint16
+      return convert_image_dtype(
+          gen_image_ops.decode_png(contents, channels, dtype=png_dtype), dtype)
 
     def check_png():
       """Checks if an image is PNG."""
@@ -1639,7 +1644,8 @@ def decode_image(contents, channels=None, name=None):
                       'images')
       assert_channels = control_flow_ops.Assert(good_channels, [channels_msg])
       with ops.control_dependencies([assert_channels]):
-        return gen_image_ops.decode_jpeg(contents, channels)
+        return convert_image_dtype(
+            gen_image_ops.decode_jpeg(contents, channels), dtype)
 
     # Decode normal JPEG images (start with \xff\xd8\xff\xe0)
     # as well as JPEG images with EXIF data (start with \xff\xd8\xff\xe1).
diff --git a/tensorflow/python/ops/image_ops_test.py b/tensorflow/python/ops/image_ops_test.py
index d50ff3fb60..ae45037c17 100644
--- a/tensorflow/python/ops/image_ops_test.py
+++ b/tensorflow/python/ops/image_ops_test.py
@@ -3888,5 +3888,88 @@ class SobelEdgesTest(test_util.TensorFlowTestCase):
       self.assertAllClose(expected_batch, actual_sobel)
 
 
+class DecodeImageTest(test_util.TensorFlowTestCase):
+  """Tests decode_image's `dtype` argument for JPEG, PNG, GIF and BMP."""
+  def testJpegUint16(self):
+    with self.test_session(use_gpu=True) as sess:
+      base = "tensorflow/core/lib/jpeg/testdata"
+      jpeg0 = io_ops.read_file(os.path.join(base, "jpeg_merge_test1.jpg"))
+      image0 = image_ops.decode_image(jpeg0, dtype=dtypes.uint16)
+      image1 = image_ops.convert_image_dtype(image_ops.decode_jpeg(jpeg0),
+                                             dtypes.uint16)
+      image0, image1 = sess.run([image0, image1])
+      self.assertAllEqual(image0, image1)
+
+  def testPngUint16(self):
+    with self.test_session(use_gpu=True) as sess:
+      base = "tensorflow/core/lib/png/testdata"
+      png0 = io_ops.read_file(os.path.join(base, "lena_rgba.png"))
+      image0 = image_ops.decode_image(png0, dtype=dtypes.uint16)
+      image1 = image_ops.convert_image_dtype(
+          image_ops.decode_png(png0, dtype=dtypes.uint16), dtypes.uint16)
+      image0, image1 = sess.run([image0, image1])
+      self.assertAllEqual(image0, image1)
+
+  def testGifUint16(self):
+    with self.test_session(use_gpu=True) as sess:
+      base = "tensorflow/core/lib/gif/testdata"
+      gif0 = io_ops.read_file(os.path.join(base, "scan.gif"))
+      image0 = image_ops.decode_image(gif0, dtype=dtypes.uint16)
+      image1 = image_ops.convert_image_dtype(image_ops.decode_gif(gif0),
+                                             dtypes.uint16)
+      image0, image1 = sess.run([image0, image1])
+      self.assertAllEqual(image0, image1)
+
+  def testBmpUint16(self):
+    with self.test_session(use_gpu=True) as sess:
+      base = "tensorflow/core/lib/bmp/testdata"
+      bmp0 = io_ops.read_file(os.path.join(base, "lena.bmp"))
+      image0 = image_ops.decode_image(bmp0, dtype=dtypes.uint16)
+      image1 = image_ops.convert_image_dtype(image_ops.decode_bmp(bmp0),
+                                             dtypes.uint16)
+      image0, image1 = sess.run([image0, image1])
+      self.assertAllEqual(image0, image1)
+
+  def testJpegFloat32(self):
+    with self.test_session(use_gpu=True) as sess:
+      base = "tensorflow/core/lib/jpeg/testdata"
+      jpeg0 = io_ops.read_file(os.path.join(base, "jpeg_merge_test1.jpg"))
+      image0 = image_ops.decode_image(jpeg0, dtype=dtypes.float32)
+      image1 = image_ops.convert_image_dtype(image_ops.decode_jpeg(jpeg0),
+                                             dtypes.float32)
+      image0, image1 = sess.run([image0, image1])
+      self.assertAllEqual(image0, image1)
+
+  def testPngFloat32(self):
+    with self.test_session(use_gpu=True) as sess:
+      base = "tensorflow/core/lib/png/testdata"
+      png0 = io_ops.read_file(os.path.join(base, "lena_rgba.png"))
+      image0 = image_ops.decode_image(png0, dtype=dtypes.float32)
+      image1 = image_ops.convert_image_dtype(
+          image_ops.decode_png(png0, dtype=dtypes.uint16), dtypes.float32)
+      image0, image1 = sess.run([image0, image1])
+      self.assertAllEqual(image0, image1)
+
+  def testGifFloat32(self):
+    with self.test_session(use_gpu=True) as sess:
+      base = "tensorflow/core/lib/gif/testdata"
+      gif0 = io_ops.read_file(os.path.join(base, "scan.gif"))
+      image0 = image_ops.decode_image(gif0, dtype=dtypes.float32)
+      image1 = image_ops.convert_image_dtype(image_ops.decode_gif(gif0),
+                                             dtypes.float32)
+      image0, image1 = sess.run([image0, image1])
+      self.assertAllEqual(image0, image1)
+
+  def testBmpFloat32(self):
+    with self.test_session(use_gpu=True) as sess:
+      base = "tensorflow/core/lib/bmp/testdata"
+      bmp0 = io_ops.read_file(os.path.join(base, "lena.bmp"))
+      image0 = image_ops.decode_image(bmp0, dtype=dtypes.float32)
+      image1 = image_ops.convert_image_dtype(image_ops.decode_bmp(bmp0),
+                                             dtypes.float32)
+      image0, image1 = sess.run([image0, image1])
+      self.assertAllEqual(image0, image1)
+
+
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/tools/api/golden/tensorflow.image.pbtxt b/tensorflow/tools/api/golden/tensorflow.image.pbtxt
index 87543e374b..32fb9183e6 100644
--- a/tensorflow/tools/api/golden/tensorflow.image.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.image.pbtxt
@@ -54,7 +54,7 @@ tf_module {
   }
   member_method {
     name: "decode_image"
-    argspec: "args=[\'contents\', \'channels\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'contents\', \'channels\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'uint8\'>\", \'None\'], "
   }
   member_method {
     name: "decode_jpeg"
-- 
GitLab


From 46147d8ca303e29fd15612afdb906b5220af5d3f Mon Sep 17 00:00:00 2001
From: Mark Heffernan <meheff@google.com>
Date: Fri, 8 Jun 2018 10:33:48 -0700
Subject: [PATCH 488/610] Increase relative error to 1e-4 on convolution_test.
 convolution_test had a zero relative error bound which made it overly
 sensitive to changes to the underlying computation.

PiperOrigin-RevId: 199814523
---
 tensorflow/compiler/xla/tests/convolution_test.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/compiler/xla/tests/convolution_test.cc b/tensorflow/compiler/xla/tests/convolution_test.cc
index 947959beb1..346bb3a399 100644
--- a/tensorflow/compiler/xla/tests/convolution_test.cc
+++ b/tensorflow/compiler/xla/tests/convolution_test.cc
@@ -47,9 +47,9 @@ class ConvolutionTest : public ClientLibraryTestBase {
 #if XLA_TEST_BACKEND_GPU
   // XLA:GPU sometimes uses FFT convolution which isn't as precise as spatial
   // convolution. So relax the absolute error threshold.
-  ErrorSpec error_spec_ = ErrorSpec(1e-2);
+  ErrorSpec error_spec_ = ErrorSpec(1e-2, 1e-4);
 #else
-  ErrorSpec error_spec_ = ErrorSpec(1e-4);
+  ErrorSpec error_spec_ = ErrorSpec(1e-4, 1e-4);
 #endif
 };
 
-- 
GitLab


From 255a1c4e5d345710a8d734c0a0dfbbf728675b95 Mon Sep 17 00:00:00 2001
From: Francois Chollet <fchollet@google.com>
Date: Fri, 8 Jun 2018 10:52:33 -0700
Subject: [PATCH 489/610] Preserve input shape information when serializing
 deferred-build Sequential models.

PiperOrigin-RevId: 199817660
---
 tensorflow/python/keras/engine/sequential.py  |  7 +++++-
 .../python/keras/engine/sequential_test.py    | 24 +++++++++++++++++++
 2 files changed, 30 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/keras/engine/sequential.py b/tensorflow/python/keras/engine/sequential.py
index 52e29b0ffa..3ca8fdd326 100644
--- a/tensorflow/python/keras/engine/sequential.py
+++ b/tensorflow/python/keras/engine/sequential.py
@@ -222,11 +222,16 @@ class Sequential(Model):
       for layer in self._layers:
         x = layer(x)
       self.outputs = [x]
+      # Make sure that the model's input shape will be preserved during
+      # serialization.
+      if self._layers:
+        self._layers[0]._batch_input_shape = batch_shape
 
     if self.inputs:
       self._init_graph_network(self.inputs, self.outputs, name=self.name)
       self.built = True
-    self._track_layers(self._layers)
+    if self._layers:
+      self._track_layers(self._layers)
 
   def predict_proba(self, x, batch_size=32, verbose=0):
     """Generates class probability predictions for the input samples.
diff --git a/tensorflow/python/keras/engine/sequential_test.py b/tensorflow/python/keras/engine/sequential_test.py
index 69a288e69b..cdaf9162de 100644
--- a/tensorflow/python/keras/engine/sequential_test.py
+++ b/tensorflow/python/keras/engine/sequential_test.py
@@ -209,6 +209,30 @@ class TestSequential(test.TestCase):
       x2 = model.predict(val_a)
       assert np.abs(np.sum(x1 - x2)) > 1e-5
 
+  def test_sequential_deferred_build_serialization(self):
+    """Deferred-build Sequential models survive a config round trip."""
+    num_hidden = 5
+    input_dim = 3
+    batch_size = 5
+    num_classes = 2
+    model = keras.models.Sequential()
+    # We don't specify the input shape, so the model is not built yet.
+    model.add(keras.layers.Dense(num_hidden))
+    model.add(keras.layers.Dense(num_classes))
+    model.compile(loss='mse', optimizer=rmsprop.RMSPropOptimizer(1e-3))
+    self.assertFalse(model.built)
+    # Training on a batch triggers the deferred build.
+    x = np.random.random((batch_size, input_dim))
+    y = np.random.random((batch_size, num_classes))
+    model.train_on_batch(x, y)
+    self.assertTrue(model.built)
+    # Check the deserialized model, not the one its config was taken from.
+    config = model.get_config()
+    new_model = keras.models.Sequential.from_config(config)
+    self.assertTrue(new_model.built)
+    self.assertEqual(len(new_model.layers), 2)
+    self.assertEqual(len(new_model.weights), 4)
+
 
 if __name__ == '__main__':
   test.main()
-- 
GitLab


From d33c12188f09d49c2bf0c912702836071ffcc5ae Mon Sep 17 00:00:00 2001
From: Shanqing Cai <cais@google.com>
Date: Fri, 8 Jun 2018 13:59:39 -0400
Subject: [PATCH 490/610] Update RELEASE.md for tfdbg bug fix in 1.9.0 (#19846)

---
 RELEASE.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/RELEASE.md b/RELEASE.md
index 18e5dfb16e..e09e9c6190 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -22,7 +22,7 @@
   * `tf.keras.Model.save_weights` now saves in TensorFlow format by default.
   * Enable dataset iterators to be passed to `tf.keras.Model` training/eval methods.
 * Accelerated Linear Algebra (XLA):
-* TensorFlow Debugger (tfdbg) CLI:
+* TensorFlow Debugger (tfdbg): Fix an issue in which the TensorBoard Debugger Plugin could not handle a total source file size exceeding the gRPC message size limit (4 MB).
 * `tf.contrib`:
   * Add `tf.contrib.data.choose_from_datasets()`.
   * `tf.contrib.data.make_csv_dataset()` now supports line breaks in quoted strings. Two arguments were removed from `make_csv_dataset`.
-- 
GitLab


From e8ca21f1533361aaad5acf1738239266b95dae12 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 8 Jun 2018 11:15:20 -0700
Subject: [PATCH 491/610] Split out opcodes using dimensions_ as subclasses
 from HloInstruction.

PiperOrigin-RevId: 199821675
---
 .../compiler/xla/service/hlo_instruction.cc   | 237 ++++++---------
 .../compiler/xla/service/hlo_instruction.h    |  42 ++-
 .../compiler/xla/service/hlo_instructions.cc  | 272 ++++++++++++++++++
 .../compiler/xla/service/hlo_instructions.h   | 170 +++++++++++
 4 files changed, 553 insertions(+), 168 deletions(-)

diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc
index b6e2056600..ae230d2740 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction.cc
@@ -66,6 +66,9 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
   const auto operands = [&instruction_map, &proto](int index) {
     return instruction_map.at(proto.operand_ids(index));
   };
+  const auto computations = [&computation_map, &proto](int index) {
+    return computation_map.at(proto.called_computation_ids(index));
+  };
   switch (opcode) {
     // Ops migrated to subclasses.
     case HloOpcode::kBatchNormTraining:
@@ -111,6 +114,57 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
       CHECK_EQ(proto.operand_ids_size(), 1);
       instruction = CreateRecvDone(operands(0));
       break;
+    case HloOpcode::kReverse:
+      CHECK_EQ(proto.operand_ids_size(), 1);
+      instruction = CreateReverse(proto.shape(), operands(0),
+                                  std::vector<int64>(proto.dimensions().begin(),
+                                                     proto.dimensions().end()));
+      break;
+    case HloOpcode::kConcatenate: {
+      CHECK_EQ(proto.dimensions_size(), 1);
+      std::vector<HloInstruction*> concat_operands(proto.operand_ids_size());
+      std::transform(proto.operand_ids().begin(), proto.operand_ids().end(),
+                     concat_operands.begin(),
+                     [&instruction_map](int64 operand_id) {
+                       return instruction_map.at(operand_id);
+                     });
+      instruction = CreateConcatenate(proto.shape(), concat_operands,
+                                      proto.dimensions(0));
+      break;
+    }
+    case HloOpcode::kReduce:
+      CHECK_EQ(proto.operand_ids_size(), 2);
+      CHECK_EQ(proto.called_computation_ids_size(), 1);
+      instruction = CreateReduce(proto.shape(), operands(0), operands(1),
+                                 std::vector<int64>(proto.dimensions().begin(),
+                                                    proto.dimensions().end()),
+                                 computations(0));
+      break;
+    case HloOpcode::kTranspose:
+      CHECK_EQ(proto.operand_ids_size(), 1);
+      instruction =
+          CreateTranspose(proto.shape(), operands(0),
+                          std::vector<int64>(proto.dimensions().begin(),
+                                             proto.dimensions().end()));
+      break;
+    case HloOpcode::kBroadcast:
+      CHECK_EQ(proto.operand_ids_size(), 1);
+      instruction =
+          CreateBroadcast(proto.shape(), operands(0),
+                          std::vector<int64>(proto.dimensions().begin(),
+                                             proto.dimensions().end()));
+      break;
+    case HloOpcode::kMap: {
+      CHECK_EQ(proto.called_computation_ids_size(), 1);
+      std::vector<HloInstruction*> map_operands(proto.operand_ids_size());
+      std::transform(proto.operand_ids().begin(), proto.operand_ids().end(),
+                     map_operands.begin(),
+                     [&instruction_map](int64 operand_id) {
+                       return instruction_map.at(operand_id);
+                     });
+      instruction = CreateMap(proto.shape(), map_operands, computations(0));
+      break;
+    }
     default: {
       instruction = WrapUnique(new HloInstruction(opcode, proto.shape()));
       for (const int64 operand_id : proto.operand_ids()) {
@@ -124,6 +178,14 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
         TF_RETURN_IF_ERROR(instruction_map.at(predecessor_id)
                                ->AddControlDependencyTo(instruction.get()));
       }
+      if (instruction->opcode() != HloOpcode::kFusion) {
+        for (const int64 computation_id : proto.called_computation_ids()) {
+          TF_RET_CHECK(ContainsKey(computation_map, computation_id))
+              << "No computation with id " << computation_id;
+          instruction->called_computations_.push_back(
+              computation_map.at(computation_id));
+        }
+      }
       break;
     }
   }
@@ -146,13 +208,6 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
         << "No fusion computation with id " << fusion_id;
     fused_computation->SetFusionInstruction(instruction.get());
     instruction->called_computations_.push_back(fused_computation);
-  } else {
-    for (const int64 computation_id : proto.called_computation_ids()) {
-      TF_RET_CHECK(ContainsKey(computation_map, computation_id))
-          << "No computation with id " << computation_id;
-      instruction->called_computations_.push_back(
-          computation_map.at(computation_id));
-    }
   }
 
   if (instruction->opcode() == HloOpcode::kTrace) {
@@ -174,9 +229,6 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
   instruction->parameter_number_ = proto.parameter_number();
 
   instruction->tuple_index_ = proto.tuple_index();
-  for (int64 dimension : proto.dimensions()) {
-    instruction->dimensions_.push_back(dimension);
-  }
   if (proto.has_window()) {
     instruction->window_ = MakeUnique<Window>(proto.window());
   }
@@ -392,18 +444,8 @@ HloInstruction::CreateGetTupleElement(const Shape& shape,
     const Shape& shape, tensorflow::gtl::ArraySlice<HloInstruction*> operands,
     HloComputation* map_computation,
     tensorflow::gtl::ArraySlice<HloInstruction*> static_operands) {
-  CHECK(static_operands.empty()) << "static_operands not yet supported";
-  auto instruction = WrapUnique(new HloInstruction(HloOpcode::kMap, shape));
-  for (auto operand : operands) {
-    instruction->AppendOperand(operand);
-  }
-  instruction->called_computations_.push_back(map_computation);
-  // TODO(b/65689298) Remove code below once Map is generalized to accept
-  // arbitrary map dimensions.
-  instruction->dimensions_.resize(ShapeUtil::Rank(shape));
-  std::iota(instruction->dimensions_.begin(), instruction->dimensions_.end(),
-            0);
-  return instruction;
+  return MakeUnique<HloMapInstruction>(shape, operands, map_computation,
+                                       static_operands);
 }
 
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateConvolve(
@@ -538,10 +580,7 @@ HloInstruction::CreateCrossReplicaSum(
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateReverse(
     const Shape& shape, HloInstruction* operand,
     tensorflow::gtl::ArraySlice<int64> dimensions) {
-  auto instruction = WrapUnique(new HloInstruction(HloOpcode::kReverse, shape));
-  instruction->AppendOperand(operand);
-  instruction->dimensions_.assign(dimensions.begin(), dimensions.end());
-  return instruction;
+  return MakeUnique<HloReverseInstruction>(shape, operand, dimensions);
 }
 
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateWhile(
@@ -619,13 +658,7 @@ HloInstruction::CreateDynamicUpdateSlice(const Shape& shape,
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateConcatenate(
     const Shape& shape, tensorflow::gtl::ArraySlice<HloInstruction*> operands,
     int64 dimension) {
-  auto instruction =
-      WrapUnique(new HloInstruction(HloOpcode::kConcatenate, shape));
-  for (auto operand : operands) {
-    instruction->AppendOperand(operand);
-  }
-  instruction->dimensions_.push_back(dimension);
-  return instruction;
+  return MakeUnique<HloConcatenateInstruction>(shape, operands, dimension);
 }
 
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateConvert(
@@ -648,13 +681,8 @@ HloInstruction::CreateBitcastConvert(const Shape& shape,
     const Shape& shape, HloInstruction* arg, HloInstruction* init_value,
     tensorflow::gtl::ArraySlice<int64> dimensions_to_reduce,
     HloComputation* reduce_computation) {
-  auto instruction = WrapUnique(new HloInstruction(HloOpcode::kReduce, shape));
-  instruction->AppendOperand(arg);
-  instruction->AppendOperand(init_value);
-  instruction->dimensions_.assign(dimensions_to_reduce.begin(),
-                                  dimensions_to_reduce.end());
-  instruction->called_computations_.push_back(reduce_computation);
-  return instruction;
+  return MakeUnique<HloReduceInstruction>(
+      shape, arg, init_value, dimensions_to_reduce, reduce_computation);
 }
 
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateReduceWindow(
@@ -719,12 +747,8 @@ HloInstruction::CreateSelectAndScatter(
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateBroadcast(
     const Shape& shape, HloInstruction* operand,
     tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
-  auto instruction =
-      WrapUnique(new HloInstruction(HloOpcode::kBroadcast, shape));
-  instruction->AppendOperand(operand);
-  instruction->dimensions_.assign(broadcast_dimensions.begin(),
-                                  broadcast_dimensions.end());
-  return instruction;
+  return MakeUnique<HloBroadcastInstruction>(shape, operand,
+                                             broadcast_dimensions);
 }
 
 /* static */ std::unique_ptr<HloInstruction>
@@ -803,19 +827,7 @@ HloInstruction::CreateBroadcastSequence(
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateTranspose(
     const Shape& shape, HloInstruction* operand,
     tensorflow::gtl::ArraySlice<int64> dimensions) {
-  CHECK_EQ(shape.dimensions().size(), dimensions.size());
-  CHECK_EQ(shape.dimensions().size(), operand->shape().dimensions().size());
-  CHECK(std::equal(operand->shape().dimensions().begin(),
-                   operand->shape().dimensions().end(),
-                   Permute(dimensions, shape.dimensions()).begin()))
-      << "shape: " << ShapeUtil::HumanString(shape)
-      << ", operand->shape(): " << ShapeUtil::HumanString(shape)
-      << ", dimensions: {" << Join(dimensions, ", ") << "}";
-  auto instruction =
-      WrapUnique(new HloInstruction(HloOpcode::kTranspose, shape));
-  instruction->AppendOperand(operand);
-  instruction->dimensions_.assign(dimensions.begin(), dimensions.end());
-  return instruction;
+  return MakeUnique<HloTransposeInstruction>(shape, operand, dimensions);
 }
 
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateFusion(
@@ -1293,6 +1305,12 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
     case HloOpcode::kSendDone:
     case HloOpcode::kRecv:
     case HloOpcode::kRecvDone:
+    case HloOpcode::kReverse:
+    case HloOpcode::kConcatenate:
+    case HloOpcode::kReduce:
+    case HloOpcode::kTranspose:
+    case HloOpcode::kBroadcast:
+    case HloOpcode::kMap:
       clone = CloneWithNewOperandsImpl(shape, new_operands, context);
       break;
     // Unary ops.
@@ -1353,10 +1371,6 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
                             new_operands[2]);
       break;
     // Other supported ops.
-    case HloOpcode::kBroadcast:
-      CHECK_EQ(new_operands.size(), 1);
-      clone = CreateBroadcast(shape, new_operands[0], dimensions_);
-      break;
     case HloOpcode::kCall:
       clone = CreateCall(shape, new_operands, to_apply());
       break;
@@ -1375,9 +1389,6 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
       clone = CreateHostCompute(shape, new_operands, channel_name_,
                                 cost_estimate_ns_);
       break;
-    case HloOpcode::kConcatenate:
-      clone = CreateConcatenate(shape, new_operands, dimensions(0));
-      break;
     case HloOpcode::kConvert:
       CHECK_EQ(new_operands.size(), 1);
       clone = CreateConvert(shape, new_operands[0]);
@@ -1408,19 +1419,11 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
       CHECK_EQ(new_operands.size(), 1);
       clone = CreateGetTupleElement(shape, new_operands[0], tuple_index());
       break;
-    case HloOpcode::kMap:
-      clone = CreateMap(shape, new_operands, to_apply());
-      break;
     case HloOpcode::kPad:
       CHECK_EQ(new_operands.size(), 2);
       clone =
           CreatePad(shape, new_operands[0], new_operands[1], *padding_config_);
       break;
-    case HloOpcode::kReduce:
-      CHECK_EQ(new_operands.size(), 2);
-      clone = CreateReduce(shape, new_operands[0], new_operands[1], dimensions_,
-                           to_apply());
-      break;
     case HloOpcode::kReduceWindow:
       CHECK_EQ(new_operands.size(), 2);
       clone = CreateReduceWindow(shape, new_operands[0], new_operands[1],
@@ -1432,10 +1435,6 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
           CreateSelectAndScatter(shape, new_operands[0], select(), *window_,
                                  new_operands[1], new_operands[2], scatter());
       break;
-    case HloOpcode::kReverse:
-      CHECK_EQ(new_operands.size(), 1);
-      clone = CreateReverse(shape, new_operands[0], dimensions_);
-      break;
     case HloOpcode::kRng:
       clone = CreateRng(shape, distribution_, new_operands);
       break;
@@ -1457,10 +1456,6 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
       clone = CreateDynamicUpdateSlice(shape, new_operands[0], new_operands[1],
                                        new_operands[2]);
       break;
-    case HloOpcode::kTranspose:
-      CHECK_EQ(new_operands.size(), 1);
-      clone = CreateTranspose(shape, new_operands[0], dimensions_);
-      break;
     case HloOpcode::kTuple:
       clone = CreateTuple(new_operands);
       *clone->mutable_shape() = shape;
@@ -1606,28 +1601,6 @@ const Literal& HloInstruction::literal() const {
 
 bool HloInstruction::HasLiteral() const { return literal_ != nullptr; }
 
-bool HloInstruction::CanHaveDimensionsField() const {
-  return (opcode() == HloOpcode::kReverse ||
-          opcode() == HloOpcode::kConcatenate || opcode() == HloOpcode::kMap ||
-          opcode() == HloOpcode::kReduce || opcode() == HloOpcode::kBroadcast ||
-          opcode() == HloOpcode::kTranspose);
-}
-
-const std::vector<int64>& HloInstruction::dimensions() const {
-  CHECK(CanHaveDimensionsField());
-  return dimensions_;
-}
-
-int64 HloInstruction::dimensions(int64 index) const {
-  return dimensions()[index];
-}
-
-int64 HloInstruction::concatenate_dimension() const {
-  CHECK(opcode() == HloOpcode::kConcatenate);
-  CHECK_EQ(1, dimensions_.size());
-  return dimensions(0);
-}
-
 int64 HloInstruction::tuple_index() const {
   CHECK_EQ(HloOpcode::kGetTupleElement, opcode_);
   return tuple_index_;
@@ -1793,12 +1766,6 @@ bool HloInstruction::IdenticalSlowPath(
     case HloOpcode::kTuple:
       return true;
 
-    // Broadcast, Concatenate, and Transpose need the same dimensions field.
-    case HloOpcode::kBroadcast:
-    case HloOpcode::kConcatenate:
-    case HloOpcode::kTranspose:
-      return dimensions() == other.dimensions();
-
     case HloOpcode::kFusion:
       return fusion_kind() == other.fusion_kind() &&
              eq_computations(fused_instructions_computation(),
@@ -1839,11 +1806,6 @@ bool HloInstruction::IdenticalSlowPath(
                                            other.gather_dimension_numbers()) &&
              gather_window_bounds() == other.gather_window_bounds();
 
-    // Reduction results are determined by the reduction dimension and the
-    // reduction computation.
-    case HloOpcode::kReduce:
-      return dimensions() == other.dimensions() &&
-             eq_computations(to_apply(), other.to_apply());
     case HloOpcode::kReduceWindow:
       return eq_computations(to_apply(), other.to_apply()) &&
              protobuf_util::ProtobufEquals(window(), other.window());
@@ -1867,7 +1829,6 @@ bool HloInstruction::IdenticalSlowPath(
              slice_strides_ == other.slice_strides_;
     case HloOpcode::kCall:
     case HloOpcode::kCrossReplicaSum:
-    case HloOpcode::kMap:
       return eq_computations(to_apply(), other.to_apply());
     case HloOpcode::kCustomCall:
       if ((window_ == nullptr) != (other.window_ == nullptr) ||
@@ -1884,8 +1845,6 @@ bool HloInstruction::IdenticalSlowPath(
         return false;
       }
       return custom_call_target_ == other.custom_call_target_;
-    case HloOpcode::kReverse:
-      return dimensions() == other.dimensions();
     case HloOpcode::kConditional:
       return eq_computations(true_computation(), other.true_computation()) &&
              eq_computations(false_computation(), other.false_computation());
@@ -1907,19 +1866,17 @@ bool HloInstruction::IdenticalSlowPath(
     case HloOpcode::kSendDone:
     case HloOpcode::kRecv:
     case HloOpcode::kRecvDone:
+    case HloOpcode::kReverse:
+    case HloOpcode::kConcatenate:
+    case HloOpcode::kReduce:
+    case HloOpcode::kTranspose:
+    case HloOpcode::kBroadcast:
+    case HloOpcode::kMap:
       LOG(FATAL) << "Base class impl called for opcode with subclass: "
                  << opcode();
   }
 }
 
-bool HloInstruction::IsRank2Transpose() const {
-  return (opcode_ == HloOpcode::kTranspose) &&
-         dimensions_ == std::vector<int64>({1, 0}) &&
-         shape_.dimensions_size() == 2 &&
-         std::equal(shape_.dimensions().begin(), shape_.dimensions().end(),
-                    operands_[0]->shape_.dimensions().rbegin());
-}
-
 void HloInstruction::RemoveUser(HloInstruction* user) {
   auto set_it = user_set_.find(user);
   CHECK(set_it != user_set_.end());
@@ -2277,9 +2234,6 @@ std::vector<string> HloInstruction::ExtraAttributesToString(
   if (opcode() == HloOpcode::kFusion) {
     extra.push_back(StrCat("kind=", xla::ToString(fusion_kind())));
   }
-  if (CanHaveDimensionsField()) {
-    extra.push_back(StrCat("dimensions={", Join(dimensions(), ","), "}"));
-  }
   if (window_ != nullptr && window_->dimensions_size() != 0) {
     extra.push_back(StrCat("window={", window_util::ToString(*window_), "}"));
   }
@@ -2477,9 +2431,6 @@ HloInstructionProto HloInstruction::ToProto() const {
   }
 
   proto.set_tuple_index(tuple_index_);
-  for (int64 dimension : dimensions_) {
-    proto.add_dimensions(dimension);
-  }
   if (window_ != nullptr) {
     *proto.mutable_window() = *window_;
   }
@@ -3157,19 +3108,6 @@ bool HloInstruction::IsElementwise() const {
     // Other operations.
     case HloOpcode::kRng:
       return true;
-    case HloOpcode::kMap:
-      if (!dimensions().empty()) {
-        // Check that the map is executed in elementwise compatible dimensions.
-        if (dimensions().size() != operand(0)->shape().dimensions_size()) {
-          return false;
-        }
-        for (int i = 0; i < dimensions().size(); ++i) {
-          if (dimensions()[i] != i) {
-            return false;
-          }
-        }
-      }
-      return true;
     case HloOpcode::kFusion:
       if (fusion_kind() != FusionKind::kLoop) {
         return false;
@@ -3608,4 +3546,13 @@ const std::vector<int64>& HloInstruction::fft_length() const {
 int64 HloInstruction::channel_id() const {
   return Cast<HloSendRecvInstruction>(this)->channel_id();
 }
+
+int64 HloInstruction::concatenate_dimension() const {
+  return Cast<HloConcatenateInstruction>(this)->concatenate_dimension();
+}
+
+bool HloInstruction::IsRank2Transpose() const {
+  auto transpose = DynCast<HloTransposeInstruction>(this);
+  return transpose != nullptr && transpose->IsRank2Transpose();
+}
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h
index c08806b33b..cc4a8b8252 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.h
+++ b/tensorflow/compiler/xla/service/hlo_instruction.h
@@ -802,9 +802,6 @@ class HloInstruction {
   // Returns whether the instruction has a constant operand.
   bool HasConstantOperand() const;
 
-  // Returns whether this instruction does a rank-2 transposition.
-  bool IsRank2Transpose() const;
-
   // Replaces the use of this instruction in "user" with "new_producer". Note
   // that there might be multiple uses of this instruction in "user"; all will
   // be replaced.
@@ -889,17 +886,6 @@ class HloInstruction {
     return parameter_number_;
   }
 
-  // Returns the dimension sizes or numbers associated with this instruction.
-  //
-  // Precondition: opcode() is one of: concatenate, reduce, broadcast, reshape,
-  // and reverse.
-  const std::vector<int64>& dimensions() const;
-  int64 dimensions(int64 index) const;
-
-  // Accessor for the dimension in which a concatenate HLO should occur.
-  // Precondition: opcode() == HloOpcode::kConcatenate
-  int64 concatenate_dimension() const;
-
   // Returns the tuple index associated with this instruction.
   //
   // Precondition: opcode() == HloOpcode::kGetTupleElement
@@ -1385,7 +1371,7 @@ class HloInstruction {
   bool IsElementwiseOnOperand(int64 operand_idx) const;
 
   // Returns true if this instruction is elementwise on all its operands.
-  bool IsElementwise() const;
+  virtual bool IsElementwise() const;
 
   // Returns true if this elementwise instruction implicitly broadcasts operand
   // `operand_idx`.
@@ -1521,6 +1507,20 @@ class HloInstruction {
 
   // Delegates to HloSendRecvInstruction::channel_id.
   int64 channel_id() const;
+
+  // Returns the instruction's dimension sizes or numbers; base LOG(FATAL)s.
+  virtual const std::vector<int64>& dimensions() const {
+    LOG(FATAL) << "Unimplemented method.";
+  }
+  virtual int64 dimensions(int64 index) const {
+    LOG(FATAL) << "Unimplemented method.";
+  }
+
+  // Delegates to HloConcatenateInstruction::concatenate_dimension.
+  int64 concatenate_dimension() const;
+
+  // Returns whether this instruction does a rank-2 transposition.
+  bool IsRank2Transpose() const;
   // Old methods kept for smooth subclassing transition END.
 
  protected:
@@ -1532,6 +1532,10 @@ class HloInstruction {
   // of the operand.
   void AppendOperand(HloInstruction* operand);
 
+  void AppendComputation(HloComputation* computation) {
+    called_computations_.push_back(computation);
+  }
+
  private:
   // Implementation for non-common logic of CloneWithNewOperands.
   virtual std::unique_ptr<HloInstruction> CloneWithNewOperandsImpl(
@@ -1615,10 +1619,6 @@ class HloInstruction {
       const Shape& shape, tensorflow::gtl::ArraySlice<HloInstruction*> operands,
       HloCloneContext* context = nullptr) const;
 
-  // Returns true if this instruction can legally have the dimensions field
-  // set. Used for checking precondition of dimensions field accessors.
-  bool CanHaveDimensionsField() const;
-
   // Returns how this instruction uses elements of its `i`th operand.
   UseKind OperandElementUse(int64 i) const;
 
@@ -1662,10 +1662,6 @@ class HloInstruction {
   // Constant index, only present for kGetTupleElement.
   int64 tuple_index_ = -1;
 
-  // Dimensions present for some operations that require reshaping or
-  // broadcasting, including Reshape, Reduce, ReduceWindow, and Reverse.
-  std::vector<int64> dimensions_;
-
   // Describes the window in a windowed operation such as convolution.
   std::unique_ptr<Window> window_;
 
diff --git a/tensorflow/compiler/xla/service/hlo_instructions.cc b/tensorflow/compiler/xla/service/hlo_instructions.cc
index 109bf1a9bd..e987bd6d86 100644
--- a/tensorflow/compiler/xla/service/hlo_instructions.cc
+++ b/tensorflow/compiler/xla/service/hlo_instructions.cc
@@ -251,4 +251,276 @@ HloRecvDoneInstruction::CloneWithNewOperandsImpl(
       Cast<HloRecvInstruction>(new_operands[0]));
 }
 
+HloReverseInstruction::HloReverseInstruction(
+    const Shape& shape, HloInstruction* operand,
+    tensorflow::gtl::ArraySlice<int64> dimensions)
+    : HloInstruction(HloOpcode::kReverse, shape),
+      dimensions_(dimensions.begin(), dimensions.end()) {
+  AppendOperand(operand);
+}
+
+HloInstructionProto HloReverseInstruction::ToProto() const {
+  HloInstructionProto proto = HloInstruction::ToProto();
+  for (int64 dimension : dimensions_) {
+    proto.add_dimensions(dimension);
+  }
+  return proto;
+}
+
+std::vector<string> HloReverseInstruction::ExtraAttributesToStringImpl(
+    const HloPrintOptions& options) const {
+  return {StrCat("dimensions={", Join(dimensions(), ","), "}")};
+}
+
+bool HloReverseInstruction::IdenticalSlowPath(
+    const HloInstruction& other,
+    const std::function<bool(const HloComputation*, const HloComputation*)>&
+        eq_computations) const {
+  const auto& casted_other = static_cast<const HloReverseInstruction&>(other);
+  return dimensions() == casted_other.dimensions();
+}
+
+std::unique_ptr<HloInstruction> HloReverseInstruction::CloneWithNewOperandsImpl(
+    const Shape& shape,
+    tensorflow::gtl::ArraySlice<HloInstruction*> new_operands,
+    HloCloneContext* context) const {
+  CHECK_EQ(new_operands.size(), 1);
+  return MakeUnique<HloReverseInstruction>(shape, new_operands[0],
+                                           dimensions());
+}
+
+HloConcatenateInstruction::HloConcatenateInstruction(
+    const Shape& shape, tensorflow::gtl::ArraySlice<HloInstruction*> operands,
+    int64 dimension)
+    : HloInstruction(HloOpcode::kConcatenate, shape), dimensions_({dimension}) {
+  for (auto operand : operands) {
+    AppendOperand(operand);
+  }
+}
+
+HloInstructionProto HloConcatenateInstruction::ToProto() const {
+  HloInstructionProto proto = HloInstruction::ToProto();
+  for (int64 dimension : dimensions_) {
+    proto.add_dimensions(dimension);
+  }
+  return proto;
+}
+
+std::vector<string> HloConcatenateInstruction::ExtraAttributesToStringImpl(
+    const HloPrintOptions& options) const {
+  return {StrCat("dimensions={", Join(dimensions(), ","), "}")};
+}
+
+bool HloConcatenateInstruction::IdenticalSlowPath(
+    const HloInstruction& other,
+    const std::function<bool(const HloComputation*, const HloComputation*)>&
+        eq_computations) const {
+  const auto& casted_other =
+      static_cast<const HloConcatenateInstruction&>(other);
+  return dimensions() == casted_other.dimensions();
+}
+
+std::unique_ptr<HloInstruction>
+HloConcatenateInstruction::CloneWithNewOperandsImpl(
+    const Shape& shape,
+    tensorflow::gtl::ArraySlice<HloInstruction*> new_operands,
+    HloCloneContext* context) const {
+  return MakeUnique<HloConcatenateInstruction>(shape, new_operands,
+                                               dimensions(0));
+}
+
+HloReduceInstruction::HloReduceInstruction(
+    const Shape& shape, HloInstruction* arg, HloInstruction* init_value,
+    tensorflow::gtl::ArraySlice<int64> dimensions_to_reduce,
+    HloComputation* reduce_computation)
+    : HloInstruction(HloOpcode::kReduce, shape),
+      dimensions_(dimensions_to_reduce.begin(), dimensions_to_reduce.end()) {
+  AppendOperand(arg);
+  AppendOperand(init_value);
+  AppendComputation(reduce_computation);
+}
+
+HloInstructionProto HloReduceInstruction::ToProto() const {
+  HloInstructionProto proto = HloInstruction::ToProto();
+  for (int64 dimension : dimensions_) {
+    proto.add_dimensions(dimension);
+  }
+  return proto;
+}
+
+std::vector<string> HloReduceInstruction::ExtraAttributesToStringImpl(
+    const HloPrintOptions& options) const {
+  return {StrCat("dimensions={", Join(dimensions(), ","), "}")};
+}
+
+bool HloReduceInstruction::IdenticalSlowPath(
+    const HloInstruction& other,
+    const std::function<bool(const HloComputation*, const HloComputation*)>&
+        eq_computations) const {
+  const auto& casted_other = static_cast<const HloReduceInstruction&>(other);
+  // Reduction results are determined by the reduction dimension and the
+  // reduction computation.
+  return dimensions() == casted_other.dimensions() &&
+         eq_computations(to_apply(), casted_other.to_apply());
+}
+
+std::unique_ptr<HloInstruction> HloReduceInstruction::CloneWithNewOperandsImpl(
+    const Shape& shape,
+    tensorflow::gtl::ArraySlice<HloInstruction*> new_operands,
+    HloCloneContext* context) const {
+  CHECK_EQ(new_operands.size(), 2);
+  return MakeUnique<HloReduceInstruction>(
+      shape, new_operands[0], new_operands[1], dimensions(), to_apply());
+}
+
+HloTransposeInstruction::HloTransposeInstruction(
+    const Shape& shape, HloInstruction* operand,
+    tensorflow::gtl::ArraySlice<int64> dimensions)
+    : HloInstruction(HloOpcode::kTranspose, shape),
+      dimensions_(dimensions.begin(), dimensions.end()) {
+  CHECK_EQ(shape.dimensions().size(), dimensions.size());
+  CHECK_EQ(shape.dimensions().size(), operand->shape().dimensions().size());
+  CHECK(std::equal(operand->shape().dimensions().begin(),
+                   operand->shape().dimensions().end(),
+                   Permute(dimensions, shape.dimensions()).begin()))
+      << "shape: " << ShapeUtil::HumanString(shape)
+      << ", operand->shape(): " << ShapeUtil::HumanString(operand->shape())
+      << ", dimensions: {" << Join(dimensions, ", ") << "}";
+  AppendOperand(operand);
+}
+
+bool HloTransposeInstruction::IsRank2Transpose() const {
+  return dimensions() == std::vector<int64>({1, 0}) &&
+         shape().dimensions_size() == 2 &&
+         std::equal(shape().dimensions().begin(), shape().dimensions().end(),
+                    operand(0)->shape().dimensions().rbegin());
+}
+
+HloInstructionProto HloTransposeInstruction::ToProto() const {
+  HloInstructionProto proto = HloInstruction::ToProto();
+  for (int64 dimension : dimensions_) {
+    proto.add_dimensions(dimension);
+  }
+  return proto;
+}
+
+std::vector<string> HloTransposeInstruction::ExtraAttributesToStringImpl(
+    const HloPrintOptions& options) const {
+  return {StrCat("dimensions={", Join(dimensions(), ","), "}")};
+}
+
+bool HloTransposeInstruction::IdenticalSlowPath(
+    const HloInstruction& other,
+    const std::function<bool(const HloComputation*, const HloComputation*)>&
+        eq_computations) const {
+  const auto& casted_other = static_cast<const HloTransposeInstruction&>(other);
+  return dimensions() == casted_other.dimensions();
+}
+
+std::unique_ptr<HloInstruction>
+HloTransposeInstruction::CloneWithNewOperandsImpl(
+    const Shape& shape,
+    tensorflow::gtl::ArraySlice<HloInstruction*> new_operands,
+    HloCloneContext* context) const {
+  CHECK_EQ(new_operands.size(), 1);
+  return MakeUnique<HloTransposeInstruction>(shape, new_operands[0],
+                                             dimensions());
+}
+
+HloBroadcastInstruction::HloBroadcastInstruction(
+    const Shape& shape, HloInstruction* operand,
+    tensorflow::gtl::ArraySlice<int64> broadcast_dimension)
+    : HloInstruction(HloOpcode::kBroadcast, shape),
+      dimensions_(broadcast_dimension.begin(), broadcast_dimension.end()) {
+  AppendOperand(operand);
+}
+
+HloInstructionProto HloBroadcastInstruction::ToProto() const {
+  HloInstructionProto proto = HloInstruction::ToProto();
+  for (int64 dimension : dimensions_) {
+    proto.add_dimensions(dimension);
+  }
+  return proto;
+}
+
+std::vector<string> HloBroadcastInstruction::ExtraAttributesToStringImpl(
+    const HloPrintOptions& options) const {
+  return {StrCat("dimensions={", Join(dimensions(), ","), "}")};
+}
+
+bool HloBroadcastInstruction::IdenticalSlowPath(
+    const HloInstruction& other,
+    const std::function<bool(const HloComputation*, const HloComputation*)>&
+        eq_computations) const {
+  const auto& casted_other = static_cast<const HloBroadcastInstruction&>(other);
+  return dimensions() == casted_other.dimensions();
+}
+
+std::unique_ptr<HloInstruction>
+HloBroadcastInstruction::CloneWithNewOperandsImpl(
+    const Shape& shape,
+    tensorflow::gtl::ArraySlice<HloInstruction*> new_operands,
+    HloCloneContext* context) const {
+  CHECK_EQ(new_operands.size(), 1);
+  return MakeUnique<HloBroadcastInstruction>(shape, new_operands[0],
+                                             dimensions());
+}
+
+HloMapInstruction::HloMapInstruction(
+    const Shape& shape, tensorflow::gtl::ArraySlice<HloInstruction*> operands,
+    HloComputation* map_computation,
+    tensorflow::gtl::ArraySlice<HloInstruction*> static_operands)
+    : HloInstruction(HloOpcode::kMap, shape) {
+  CHECK(static_operands.empty()) << "static_operands not yet supported";
+  for (auto operand : operands) {
+    AppendOperand(operand);
+  }
+  AppendComputation(map_computation);
+  // TODO(b/65689298) Remove code below once Map is generalized to accept
+  // arbitrary map dimensions.
+  dimensions_.resize(ShapeUtil::Rank(shape));
+  std::iota(dimensions_.begin(), dimensions_.end(), 0);
+}
+
+HloInstructionProto HloMapInstruction::ToProto() const {
+  HloInstructionProto proto = HloInstruction::ToProto();
+  for (int64 dimension : dimensions_) {
+    proto.add_dimensions(dimension);
+  }
+  return proto;
+}
+
+bool HloMapInstruction::IsElementwise() const {
+  if (!dimensions().empty()) {
+    // Check that the map is executed in elementwise compatible dimensions.
+    if (dimensions().size() != shape().dimensions_size()) {
+      return false;
+    }
+    for (int i = 0; i < dimensions().size(); ++i) {
+      if (dimensions()[i] != i) {
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
+std::vector<string> HloMapInstruction::ExtraAttributesToStringImpl(
+    const HloPrintOptions& options) const {
+  return {StrCat("dimensions={", Join(dimensions(), ","), "}")};
+}
+
+bool HloMapInstruction::IdenticalSlowPath(
+    const HloInstruction& other,
+    const std::function<bool(const HloComputation*, const HloComputation*)>&
+        eq_computations) const {
+  return eq_computations(to_apply(), other.to_apply());
+}
+
+std::unique_ptr<HloInstruction> HloMapInstruction::CloneWithNewOperandsImpl(
+    const Shape& shape,
+    tensorflow::gtl::ArraySlice<HloInstruction*> new_operands,
+    HloCloneContext* context) const {
+  return MakeUnique<HloMapInstruction>(shape, new_operands, to_apply());
+}
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_instructions.h b/tensorflow/compiler/xla/service/hlo_instructions.h
index 22d2fe6b27..c8c34f3406 100644
--- a/tensorflow/compiler/xla/service/hlo_instructions.h
+++ b/tensorflow/compiler/xla/service/hlo_instructions.h
@@ -207,6 +207,176 @@ class HloRecvDoneInstruction : public HloSendRecvInstruction {
       HloCloneContext* context) const override;
 };
 
+class HloReverseInstruction : public HloInstruction {
+ public:
+  explicit HloReverseInstruction(const Shape& shape, HloInstruction* operand,
+                                 tensorflow::gtl::ArraySlice<int64> dimensions);
+  // Returns the dimension sizes or numbers associated with this instruction.
+  const std::vector<int64>& dimensions() const override { return dimensions_; }
+  int64 dimensions(int64 index) const override { return dimensions()[index]; }
+  // Returns a serialized representation of this instruction.
+  HloInstructionProto ToProto() const override;
+
+ private:
+  std::vector<string> ExtraAttributesToStringImpl(
+      const HloPrintOptions& options) const override;
+  bool IdenticalSlowPath(
+      const HloInstruction& other,
+      const std::function<bool(const HloComputation*, const HloComputation*)>&
+          eq_computations) const override;
+  // Implementation for non-common logic of CloneWithNewOperands.
+  std::unique_ptr<HloInstruction> CloneWithNewOperandsImpl(
+      const Shape& shape,
+      tensorflow::gtl::ArraySlice<HloInstruction*> new_operands,
+      HloCloneContext* context) const override;
+
+  std::vector<int64> dimensions_;
+};
+
+class HloConcatenateInstruction : public HloInstruction {
+ public:
+  explicit HloConcatenateInstruction(
+      const Shape& shape, tensorflow::gtl::ArraySlice<HloInstruction*> operands,
+      int64 dimension);
+  // Returns the dimension sizes or numbers associated with this instruction.
+  const std::vector<int64>& dimensions() const override { return dimensions_; }
+  int64 dimensions(int64 index) const override { return dimensions()[index]; }
+  // Accessor for the dimension in which a concatenate HLO should occur.
+  int64 concatenate_dimension() const { return dimensions(0); }
+  // Returns a serialized representation of this instruction.
+  HloInstructionProto ToProto() const override;
+
+ private:
+  std::vector<string> ExtraAttributesToStringImpl(
+      const HloPrintOptions& options) const override;
+  bool IdenticalSlowPath(
+      const HloInstruction& other,
+      const std::function<bool(const HloComputation*, const HloComputation*)>&
+          eq_computations) const override;
+  // Implementation for non-common logic of CloneWithNewOperands.
+  std::unique_ptr<HloInstruction> CloneWithNewOperandsImpl(
+      const Shape& shape,
+      tensorflow::gtl::ArraySlice<HloInstruction*> new_operands,
+      HloCloneContext* context) const override;
+
+  std::vector<int64> dimensions_;
+};
+
+class HloReduceInstruction : public HloInstruction {
+ public:
+  explicit HloReduceInstruction(
+      const Shape& shape, HloInstruction* arg, HloInstruction* init_value,
+      tensorflow::gtl::ArraySlice<int64> dimensions_to_reduce,
+      HloComputation* reduce_computation);
+  // Returns the dimension sizes or numbers associated with this instruction.
+  const std::vector<int64>& dimensions() const override { return dimensions_; }
+  int64 dimensions(int64 index) const override { return dimensions()[index]; }
+  // Returns a serialized representation of this instruction.
+  HloInstructionProto ToProto() const override;
+
+ private:
+  std::vector<string> ExtraAttributesToStringImpl(
+      const HloPrintOptions& options) const override;
+  bool IdenticalSlowPath(
+      const HloInstruction& other,
+      const std::function<bool(const HloComputation*, const HloComputation*)>&
+          eq_computations) const override;
+  // Implementation for non-common logic of CloneWithNewOperands.
+  std::unique_ptr<HloInstruction> CloneWithNewOperandsImpl(
+      const Shape& shape,
+      tensorflow::gtl::ArraySlice<HloInstruction*> new_operands,
+      HloCloneContext* context) const override;
+
+  std::vector<int64> dimensions_;
+};
+
+class HloTransposeInstruction : public HloInstruction {
+ public:
+  explicit HloTransposeInstruction(
+      const Shape& shape, HloInstruction* operand,
+      tensorflow::gtl::ArraySlice<int64> dimensions);
+  // Returns the dimension sizes or numbers associated with this instruction.
+  const std::vector<int64>& dimensions() const override { return dimensions_; }
+  int64 dimensions(int64 index) const override { return dimensions()[index]; }
+  // Returns whether this instruction does a rank-2 transposition.
+  bool IsRank2Transpose() const;
+  // Returns a serialized representation of this instruction.
+  HloInstructionProto ToProto() const override;
+
+ private:
+  std::vector<string> ExtraAttributesToStringImpl(
+      const HloPrintOptions& options) const override;
+  bool IdenticalSlowPath(
+      const HloInstruction& other,
+      const std::function<bool(const HloComputation*, const HloComputation*)>&
+          eq_computations) const override;
+  // Implementation for non-common logic of CloneWithNewOperands.
+  std::unique_ptr<HloInstruction> CloneWithNewOperandsImpl(
+      const Shape& shape,
+      tensorflow::gtl::ArraySlice<HloInstruction*> new_operands,
+      HloCloneContext* context) const override;
+
+  std::vector<int64> dimensions_;
+};
+
+class HloBroadcastInstruction : public HloInstruction {
+ public:
+  explicit HloBroadcastInstruction(
+      const Shape& shape, HloInstruction* operand,
+      tensorflow::gtl::ArraySlice<int64> broadcast_dimension);
+  // Returns the dimension sizes or numbers associated with this instruction.
+  const std::vector<int64>& dimensions() const override { return dimensions_; }
+  int64 dimensions(int64 index) const override { return dimensions()[index]; }
+  // Returns a serialized representation of this instruction.
+  HloInstructionProto ToProto() const override;
+
+ private:
+  std::vector<string> ExtraAttributesToStringImpl(
+      const HloPrintOptions& options) const override;
+  bool IdenticalSlowPath(
+      const HloInstruction& other,
+      const std::function<bool(const HloComputation*, const HloComputation*)>&
+          eq_computations) const override;
+  // Implementation for non-common logic of CloneWithNewOperands.
+  std::unique_ptr<HloInstruction> CloneWithNewOperandsImpl(
+      const Shape& shape,
+      tensorflow::gtl::ArraySlice<HloInstruction*> new_operands,
+      HloCloneContext* context) const override;
+
+  std::vector<int64> dimensions_;
+};
+
+class HloMapInstruction : public HloInstruction {
+ public:
+  explicit HloMapInstruction(
+      const Shape& shape, tensorflow::gtl::ArraySlice<HloInstruction*> operands,
+      HloComputation* map_computation,
+      tensorflow::gtl::ArraySlice<HloInstruction*> static_operands = {});
+  // Returns the dimension sizes or numbers associated with this instruction.
+  const std::vector<int64>& dimensions() const override { return dimensions_; }
+  int64 dimensions(int64 index) const override { return dimensions()[index]; }
+  // Returns a serialized representation of this instruction.
+  HloInstructionProto ToProto() const override;
+
+  // Returns true if the map is applied elementwise over all output dimensions.
+  bool IsElementwise() const override;
+
+ private:
+  std::vector<string> ExtraAttributesToStringImpl(
+      const HloPrintOptions& options) const override;
+  bool IdenticalSlowPath(
+      const HloInstruction& other,
+      const std::function<bool(const HloComputation*, const HloComputation*)>&
+          eq_computations) const override;
+  // Implementation for non-common logic of CloneWithNewOperands.
+  std::unique_ptr<HloInstruction> CloneWithNewOperandsImpl(
+      const Shape& shape,
+      tensorflow::gtl::ArraySlice<HloInstruction*> new_operands,
+      HloCloneContext* context) const override;
+
+  std::vector<int64> dimensions_;
+};
+
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_INSTRUCTIONS_H_
-- 
GitLab


From ebb67e0d7da53b3b848630e63aaa80f1283d83bd Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 8 Jun 2018 11:18:23 -0700
Subject: [PATCH 492/610] Delete deprecated protos.

PiperOrigin-RevId: 199822232
---
 tensorflow/compiler/xla/rpc/xla_service.proto |  16 -
 tensorflow/compiler/xla/xla.proto             |  94 +----
 tensorflow/compiler/xla/xla_data.proto        | 390 ------------------
 3 files changed, 1 insertion(+), 499 deletions(-)

diff --git a/tensorflow/compiler/xla/rpc/xla_service.proto b/tensorflow/compiler/xla/rpc/xla_service.proto
index 92eb19ec0f..551ae895e0 100644
--- a/tensorflow/compiler/xla/rpc/xla_service.proto
+++ b/tensorflow/compiler/xla/rpc/xla_service.proto
@@ -115,10 +115,6 @@ service XlaService {
       returns (ComputeConstantResponse) {
   }
 
-  // Retrieves the inferred shape for a value within a computation.
-  rpc GetLocalShape(GetLocalShapeRequest) returns (GetLocalShapeResponse) {
-  }
-
   // Requests one or more device handles from the target. The returned device
   // handles can be used to specify the device on which to execute computations
   // or transfer data.
@@ -132,18 +128,6 @@ service XlaService {
       returns (CreateChannelHandleResponse) {
   }
 
-  // Requests that the referenced computation be specialized for the provided
-  // arguments for subsequent execution. This permits things such as value
-  // specialization.
-  rpc Specialize(SpecializeRequest) returns (SpecializeResponse) {
-  }
-
-  // Modifies the provided computation so that subsequent executions
-  // will compute the provided ComputationDataHandle, rather than the
-  // last expression enqueued on that Computation.
-  rpc SetReturnValue(SetReturnValueRequest) returns (SetReturnValueResponse) {
-  }
-
   // Invokes the provided computation with the provided global data passed as
   // immutable arguments. The request contains the whole computation graph.
   // Returns global data output and execution timing.
diff --git a/tensorflow/compiler/xla/xla.proto b/tensorflow/compiler/xla/xla.proto
index 53ba120d21..6f07e4606b 100644
--- a/tensorflow/compiler/xla/xla.proto
+++ b/tensorflow/compiler/xla/xla.proto
@@ -225,14 +225,6 @@ message ExecutionOptions {
   repeated DeviceHandle device_handles = 5;
 }
 
-message SnapshotComputationRequest {
-  ComputationHandle computation = 1;
-}
-
-message LoadComputationSnapshotResponse {
-  ComputationHandle computation = 1;
-}
-
 message GetDeviceHandlesRequest {
   int64 device_count = 1;
 }
@@ -291,11 +283,6 @@ message ResetDeviceRequest {
 message ResetDeviceResponse {
 }
 
-message ComputationStatsRequest {
-  ComputationHandle computation = 1;
-  DebugOptions debug_options = 2;
-}
-
 message ComputationGraphStatsRequest {
   HloModuleProto computation = 1;
   DebugOptions debug_options = 2;
@@ -305,14 +292,6 @@ message ComputationStatsResponse {
   ComputationStats stats = 1;
 }
 
-message ComputationRequest {
-  string name = 1;
-}
-
-message ComputationResponse {
-  ComputationHandle computation = 1;
-}
-
 message CreateChannelHandleRequest {
 }
 
@@ -327,24 +306,6 @@ message UnregisterRequest {
 message UnregisterResponse {
 }
 
-message SetReturnValueRequest {
-  ComputationHandle computation = 1;
-  ComputationDataHandle operand = 2;
-}
-
-message SetReturnValueResponse {
-}
-
-message ExecuteRequest {
-  reserved 3, 4;
-
-  ComputationHandle computation = 1;
-  repeated GlobalDataHandle arguments = 2;
-
-  // Options that affect how XLA compiles and runs code to service this request.
-  ExecutionOptions execution_options = 5;
-}
-
 message ExecuteGraphRequest {
   HloModuleProto computation = 1;
   repeated GlobalDataHandle arguments = 2;
@@ -353,10 +314,6 @@ message ExecuteGraphRequest {
   ExecutionOptions execution_options = 3;
 }
 
-message ExecuteParallelRequest {
-  repeated ExecuteRequest requests = 1;
-}
-
 message ExecuteGraphParallelRequest {
   repeated ExecuteGraphRequest requests = 1;
 }
@@ -370,21 +327,6 @@ message ExecuteParallelResponse {
   repeated ExecuteResponse responses = 1;
 }
 
-message ExecuteAsyncRequest {
-  reserved 3, 4;
-
-  ComputationHandle computation = 1;
-  repeated GlobalDataHandle arguments = 2;
-
-  // Options that affect how XLA compiles and runs code to service this request.
-  ExecutionOptions execution_options = 6;
-}
-
-message ExecuteAsyncResponse {
-  // A handle to the execution launched asynchronously.
-  ExecutionHandle execution = 1;
-}
-
 message WaitForExecutionRequest {
   ExecutionHandle execution = 1;
 }
@@ -394,31 +336,13 @@ message WaitForExecutionResponse {
   ExecutionProfile profile = 2;
 }
 
-message IsConstantRequest {
-  ComputationHandle computation = 1;
-  ComputationDataHandle operand = 2;
-  int64 num_parameters = 3;
-}
-
-message IsConstantResponse {
-  bool is_constant = 1;
-}
-
-message ComputeConstantRequest {
-  ComputationHandle computation = 1;
-  ComputationDataHandle operand = 2;
-  Layout output_layout = 3;
-  repeated LiteralProto parameters = 4;
-}
-
 message ComputeConstantGraphRequest {
   HloModuleProto computation = 1;
   Layout output_layout = 2;
 }
 
 message ComputeConstantResponse {
-  // A LiteralProto is returned directly for this request, instead of a
-  // ComputationDataHandle.
+  // A LiteralProto is returned directly for this request.
   LiteralProto literal = 1;
 }
 
@@ -460,14 +384,6 @@ message LoadDataResponse {
   int64 nanoseconds = 5;
 }
 
-message SpecializeRequest {
-  ComputationHandle computation = 1;
-  repeated GlobalDataHandle arguments = 2;
-}
-
-message SpecializeResponse {
-}
-
 message GetShapeRequest {
   GlobalDataHandle data = 1;
 }
@@ -476,14 +392,6 @@ message GetShapeResponse {
   Shape shape = 1;
 }
 
-message GetComputationShapeRequest {
-  ComputationHandle computation = 1;
-}
-
-message GetComputationShapeResponse {
-  ProgramShape program_shape = 1;
-}
-
 message UnpackRequest {
   GlobalDataHandle data = 1;
 }
diff --git a/tensorflow/compiler/xla/xla_data.proto b/tensorflow/compiler/xla/xla_data.proto
index 6bdfb0179c..963d3836ed 100644
--- a/tensorflow/compiler/xla/xla_data.proto
+++ b/tensorflow/compiler/xla/xla_data.proto
@@ -276,12 +276,6 @@ message ExecutionProfile {
   int64 compute_and_transfer_time_ns = 5;
 }
 
-// Handle given to a user that represents a computation that the user builds up
-// before execution.
-message ComputationHandle {
-  int64 handle = 1;
-}
-
 // Handle given to a user that represents an execution that the user launched
 // asynchronously on the device.
 message ExecutionHandle {
@@ -295,13 +289,6 @@ message GlobalDataHandle {
   int64 handle = 1;
 }
 
-// Handle given to a user that represents a data result in a computation.
-// This is used to pass to subsequent computations that depends upon the data as
-// an operand.
-message ComputationDataHandle {
-  int64 handle = 1;
-}
-
 // Handle given to a user that represents a replicated virtual device. Each
 // replicated device represents N physical devices for execution where N is the
 // number of replicas.
@@ -441,44 +428,6 @@ message GatherDimensionNumbers {
   int64 index_vector_dim = 4;
 }
 
-// Operation requests that are all collected as a tagged union with a oneof
-// field in OpRequest.
-
-message ConstantRequest {
-  LiteralProto literal = 2;
-}
-
-message GetTupleElementRequest {
-  ComputationDataHandle operand = 2;
-  int64 index = 3;
-}
-
-message SliceRequest {
-  ComputationDataHandle operand = 2;
-  repeated int64 start_indices = 3;
-  repeated int64 limit_indices = 4;
-  repeated int64 strides = 5;
-}
-
-message DynamicSliceRequest {
-  // Operand from which to slice at dynamic 'start_indices'.
-  ComputationDataHandle operand = 2;
-  // Dynamically computed 'start_indices' for slice operation.
-  ComputationDataHandle start_indices = 3;
-  // Slice sizes for each dimension (note that indices calculations are computed
-  // modulo dimension sizes to avoid out-of-bound array accesses).
-  repeated int64 slice_sizes = 4;
-}
-
-message DynamicUpdateSliceRequest {
-  // Operand on which slice 'update' is to be applied.
-  ComputationDataHandle operand = 2;
-  // The slice update to apply to 'operand'.
-  ComputationDataHandle update = 3;
-  // Dynamically computed start indices for the update slice operation.
-  ComputationDataHandle start_indices = 4;
-}
-
 message ConvolutionDimensionNumbers {
   // The number of the dimension that represents batch in the input.
   int64 input_batch_dimension = 7;
@@ -516,13 +465,6 @@ message ConvolutionDimensionNumbers {
   // Next = 13
 };
 
-message ConvolveRequest {
-  ComputationDataHandle lhs = 2;
-  ComputationDataHandle rhs = 3;  // This is the filter/kernel.
-  Window window = 4;              // Describes the filter/kernel.
-  ConvolutionDimensionNumbers dimension_numbers = 5;
-}
-
 enum FftType {
   FFT = 0;    // Forward FFT; complex in, complex out.
   IFFT = 1;   // Inverse FFT; complex in, complex out.
@@ -531,56 +473,6 @@ enum FftType {
               //                   fft_length real out
 }
 
-message FftRequest {
-  FftType fft_type = 1;
-  repeated int64 fft_length = 2;  // Multivalent for higher-order FFT.
-  ComputationDataHandle operand = 3;
-}
-
-message InfeedRequest {
-  // The shape of the data returned by reading the device's infeed buffer.
-  Shape shape = 2;
-
-  // Additional infeed configuration for the backend.
-  bytes config = 3;
-}
-
-message OutfeedRequest {
-  // The shape of the data returned by reading the device's outfeed buffer.
-  Shape shape = 1;
-
-  // Operand to the Outfeed. Supports tuple.
-  ComputationDataHandle operand = 2;
-
-  // Backend-specific information for how to perform the outfeed.
-  bytes outfeed_config = 3;
-}
-
-message CallRequest {
-  ComputationHandle to_apply = 2;
-  repeated ComputationDataHandle operands = 3;
-}
-
-message CustomCallRequest {
-  string call_target_name = 2;
-  repeated ComputationDataHandle operands = 3;
-  Shape shape = 4;
-}
-
-message HostComputeRequest {
-  // Operand to the HostCompute. Supports tuple.
-  repeated ComputationDataHandle operands = 1;
-
-  // Name used to identify HostSend/Recv channels.
-  string channel_name = 2;
-
-  // Cost estimate in nanoseconds.
-  int64 cost_estimate_ns = 3;
-
-  // The shape of any data returned by host.
-  Shape shape = 4;
-}
-
 message DotDimensionNumbers {
   // The dimension numbers that represent the 'lhs' contracting dimensions.
   repeated int64 lhs_contracting_dimensions = 1;
@@ -592,179 +484,6 @@ message DotDimensionNumbers {
   repeated int64 rhs_batch_dimensions = 4;
 };
 
-message DotRequest {
-  ComputationDataHandle lhs = 2;
-  ComputationDataHandle rhs = 3;
-  DotDimensionNumbers dimension_numbers = 4;
-}
-
-message MapRequest {
-  repeated ComputationDataHandle operands = 2;
-  ComputationHandle to_apply = 3;
-  repeated ComputationDataHandle static_operands = 4;
-  // The dimensions over which to map.
-  // Example mapping a Dot operation along the batch dimension 0:
-  //   operand0.shape = [2, 2, 2], operand1.shape = [2,2,3]
-  //   Map({operand0, operand1}, Dot, {0})
-  repeated int64 dimensions = 5;
-}
-
-message ReduceRequest {
-  // Operand to the reduction.
-  ComputationDataHandle operand = 2;
-
-  // Initial value for the reduction. This must be consistent with the result
-  // shape of to_apply.
-  ComputationDataHandle init_value = 3;
-
-  // The dimensions to reduce over.
-  repeated int64 dimensions = 4;
-
-  // The computation to apply in the reduction.
-  ComputationHandle to_apply = 5;
-}
-
-message ReduceWindowRequest {
-  ComputationDataHandle operand = 2;
-  ComputationDataHandle init_value = 3;
-  Window window = 4;
-  ComputationHandle to_apply = 5;
-}
-
-message BatchNormTrainingRequest {
-  ComputationDataHandle operand = 1;
-  ComputationDataHandle scale = 2;
-  ComputationDataHandle offset = 3;
-  float epsilon = 4;
-  int64 feature_index = 5;
-}
-
-message BatchNormInferenceRequest {
-  ComputationDataHandle operand = 1;
-  ComputationDataHandle scale = 2;
-  ComputationDataHandle offset = 3;
-  ComputationDataHandle mean = 4;
-  ComputationDataHandle variance = 5;
-  float epsilon = 6;
-  int64 feature_index = 7;
-}
-
-message BatchNormGradRequest {
-  ComputationDataHandle operand = 1;
-  ComputationDataHandle scale = 2;
-  ComputationDataHandle mean = 3;
-  ComputationDataHandle variance = 4;
-  ComputationDataHandle grad_output = 5;
-  float epsilon = 6;
-  int64 feature_index = 7;
-}
-
-message CrossReplicaSumRequest {
-  ComputationDataHandle operand = 2;
-}
-
-message SelectAndScatterRequest {
-  // Operand array on which the windows slide.
-  ComputationDataHandle operand = 2;
-
-  // Source array for the data to scatter.
-  ComputationDataHandle source = 3;
-
-  // Initial scalar value for each element in the output.
-  ComputationDataHandle init_value = 4;
-
-  // Window configuration.
-  Window window = 5;
-
-  // Binary function used to select an element from each window.
-  ComputationHandle select = 6;
-
-  // Binary function used to combine each scattered value from source with the
-  // current output value at the selected location.
-  ComputationHandle scatter = 7;
-}
-
-message ReverseRequest {
-  ComputationDataHandle operand = 2;
-  repeated int64 dimensions = 3;
-}
-
-message BroadcastRequest {
-  ComputationDataHandle operand = 2;
-  repeated int64 broadcast_sizes = 3;
-}
-
-message PadRequest {
-  ComputationDataHandle operand = 2;
-  ComputationDataHandle padding_value = 3;
-  PaddingConfig padding_config = 4;
-}
-
-message ReshapeRequest {
-  ComputationDataHandle operand = 2;
-
-  // The dimension order for collapse (from fastest-changing to slowest).
-  repeated int64 dimensions = 3;
-
-  // The new dimension sizes (from dimension 0 to n-1).
-  repeated int64 new_sizes = 4;
-}
-
-message TransposeRequest {
-  ComputationDataHandle operand = 2;
-
-  // The permutation of the operand's dimensions (in the range 0 to n-1).
-  repeated int64 dimensions = 3;
-}
-
-message ParameterRequest {
-  Shape shape = 2;
-  int64 parameter = 3;
-  string name = 4;
-}
-
-message GetLocalShapeRequest {
-  ComputationHandle computation = 1;
-  ComputationDataHandle operand = 2;
-}
-
-message GetLocalShapeResponse {
-  Shape shape = 1;
-}
-
-message TraceRequest {
-  string tag = 2;
-  ComputationDataHandle operand = 3;
-}
-
-message ConvertRequest {
-  ComputationDataHandle operand = 2;
-  PrimitiveType new_element_type = 3;
-}
-
-message ConcatenateRequest {
-  repeated ComputationDataHandle operands = 2;
-  // The dimension in which we concatenate; e.g. if you had dimension arrays of
-  // [4, 1] and [5, 1], you'd concatenate in dimension 0 to produce a [9, 1].
-  // Attempting to concatenate those in dimension 1 would produce an error, as
-  // 4 != 5 (and there is no ragged array support).
-  int64 dimension = 3;
-}
-
-message ConditionalRequest {
-  ComputationDataHandle predicate = 2;
-  ComputationDataHandle true_operand = 3;
-  ComputationHandle true_computation = 4;
-  ComputationDataHandle false_operand = 5;
-  ComputationHandle false_computation = 6;
-}
-
-message WhileRequest {
-  ComputationHandle condition = 2;
-  ComputationHandle body = 3;
-  ComputationDataHandle init = 4;
-}
-
 enum UnaryOperation {
   UNOP_INVALID = 0;
 
@@ -827,11 +546,6 @@ enum UnaryOperation {
   UNOP_LOG1P = 19;
 }
 
-message UnaryOpRequest {
-  UnaryOperation unop = 2;
-  ComputationDataHandle operand = 3;
-}
-
 enum BinaryOperation {
   BINOP_INVALID = 0;
 
@@ -876,13 +590,6 @@ enum BinaryOperation {
   BINOP_ATAN2 = 24;
 }
 
-message BinaryOpRequest {
-  BinaryOperation binop = 2;
-  ComputationDataHandle lhs = 3;
-  ComputationDataHandle rhs = 4;
-  repeated int64 broadcast_dimensions = 5;
-}
-
 enum RandomDistribution {
   RNG_INVALID = 0;
 
@@ -897,12 +604,6 @@ enum RandomDistribution {
   // Next: 4
 }
 
-message RngRequest {
-  RandomDistribution distribution = 2;
-  repeated ComputationDataHandle parameter = 3;
-  Shape shape = 4;
-}
-
 enum TernaryOperation {
   TRIOP_INVALID = 0;
 
@@ -916,13 +617,6 @@ enum TernaryOperation {
   TRIOP_CLAMP = 3;
 }
 
-message TernaryOpRequest {
-  TernaryOperation triop = 2;
-  ComputationDataHandle lhs = 3;
-  ComputationDataHandle rhs = 4;
-  ComputationDataHandle ehs = 5;
-}
-
 enum VariadicOperation {
   VAROP_INVALID = 0;
 
@@ -930,34 +624,6 @@ enum VariadicOperation {
   VAROP_TUPLE = 1;
 }
 
-message VariadicOpRequest {
-  VariadicOperation varop = 2;
-  repeated ComputationDataHandle operands = 3;
-}
-
-message ReducePrecisionRequest {
-  ComputationDataHandle operand = 1;
-  int32 exponent_bits = 2;
-  int32 mantissa_bits = 3;
-}
-
-message SendRequest {
-  ComputationDataHandle operand = 1;
-  ChannelHandle channel_handle = 2;
-}
-
-message RecvRequest {
-  Shape shape = 1;
-  ChannelHandle channel_handle = 2;
-}
-
-message GatherRequest {
-  ComputationDataHandle input = 1;
-  ComputationDataHandle gather_indices = 2;
-  GatherDimensionNumbers dimension_numbers = 3;
-  repeated int64 window_bounds = 4;
-}
-
 message OpSharding {
   enum Type {
     // This sharding is replicated across all devices (implies maximal,
@@ -988,59 +654,3 @@ message OpSharding {
   // to.
   repeated OpSharding tuple_shardings = 5;
 }
-
-message OpRequest {
-  ComputationHandle computation = 1;
-  OpMetadata metadata = 33;
-  OpSharding sharding = 40;
-
-  oneof op {
-    BinaryOpRequest binary_op_request = 2;
-    BroadcastRequest broadcast_request = 3;
-    CallRequest call_request = 4;
-    ConcatenateRequest concatenate_request = 5;
-    ConstantRequest constant_request = 6;
-    ConvertRequest convert_request = 7;
-    ConvolveRequest convolve_request = 8;
-    CrossReplicaSumRequest cross_replica_sum_request = 9;
-    CustomCallRequest custom_call_request = 10;
-    DotRequest dot_request = 43;
-    DynamicSliceRequest dynamic_slice_request = 11;
-    DynamicUpdateSliceRequest dynamic_update_slice_request = 12;
-    GetTupleElementRequest get_tuple_element_request = 13;
-    InfeedRequest infeed_request = 14;
-    MapRequest map_request = 15;
-    PadRequest pad_request = 16;
-    ParameterRequest parameter_request = 17;
-    ReducePrecisionRequest reduce_precision_request = 36;
-    ReduceRequest reduce_request = 18;
-    ReduceWindowRequest reduce_window_request = 19;
-    ReshapeRequest reshape_request = 20;
-    ReverseRequest reverse_request = 21;
-    RngRequest rng_request = 22;
-    SelectAndScatterRequest select_and_scatter_request = 23;
-    SliceRequest slice_request = 24;
-    TernaryOpRequest ternary_op_request = 25;
-    TraceRequest trace_request = 26;
-    TransposeRequest transpose_request = 34;
-    UnaryOpRequest unary_op_request = 27;
-    VariadicOpRequest variadic_op_request = 28;
-    WhileRequest while_request = 29;
-    SendRequest send_request = 30;
-    RecvRequest recv_request = 31;
-    OutfeedRequest outfeed_request = 32;
-    BatchNormTrainingRequest batch_norm_training_request = 35;
-    BatchNormGradRequest batch_norm_grad_request = 37;
-    BatchNormInferenceRequest batch_norm_inference_request = 38;
-    FftRequest fft_request = 41;
-    ConvertRequest bitcast_convert_request = 42;
-    ConditionalRequest conditional_request = 44;
-    HostComputeRequest host_compute_request = 45;
-    GatherRequest gather_request = 46;
-    // Next: 47
-  }
-}
-
-message OpResponse {
-  ComputationDataHandle output = 1;
-}
-- 
GitLab


From 7eaf8941930c8b1a099b7ec626134b67179c07e3 Mon Sep 17 00:00:00 2001
From: Dan Moldovan <mdan@google.com>
Date: Fri, 8 Jun 2018 11:20:56 -0700
Subject: [PATCH 493/610] Use the new operators for list conversion. Includes
 list creation, append, pop, stack. Simplify the type annotation mechanism by
 having it literally copy its arguments, instead of attempting to resolve
 them.

PiperOrigin-RevId: 199822771
---
 .../contrib/autograph/converters/lists.py     | 233 +++++++++++++-----
 .../autograph/converters/lists_test.py        | 130 +++++++---
 .../pyct/static_analysis/type_info.py         |  40 ++-
 .../pyct/static_analysis/type_info_test.py    |  18 +-
 4 files changed, 291 insertions(+), 130 deletions(-)

diff --git a/tensorflow/contrib/autograph/converters/lists.py b/tensorflow/contrib/autograph/converters/lists.py
index b49521b2c3..c15dfff9e8 100644
--- a/tensorflow/contrib/autograph/converters/lists.py
+++ b/tensorflow/contrib/autograph/converters/lists.py
@@ -33,82 +33,193 @@ from __future__ import print_function
 import gast
 
 from tensorflow.contrib.autograph.pyct import anno
+from tensorflow.contrib.autograph.pyct import parser
 from tensorflow.contrib.autograph.pyct import templates
 from tensorflow.contrib.autograph.pyct import transformer
-from tensorflow.python.framework import dtypes
+from tensorflow.contrib.autograph.pyct.static_analysis.annos import NodeAnno
+
+
+# Tags for local state.
+POP_USES = 'pop_uses'
 
 
 class ListTransformer(transformer.Base):
   """Converts lists and related operations to their TF counterpart."""
 
-  def _empty_list(self, node):
-    if not anno.hasanno(node, 'element_type'):
-      raise NotImplementedError(
-          'type inference for empty lists is not yet supported; '
-          'use set_element_type(<list>, <dtype>) to continue')
-    dtype = anno.getanno(node, 'element_type')
-    if not isinstance(dtype, dtypes.DType):
-      # TODO(mdan): Allow non-TF dtypes?
-      # That would be consistent with the dynamic dispatch pattern, but
-      # we must make sure that doesn't become confusing.
-      raise NotImplementedError('element type "%s" not yet supported' % dtype)
-
-    dtype_name = dtype.name
-    # TODO(mdan): Does it ever make sense not to use tensor lists?
+  def visit_List(self, node):
+    node = self.generic_visit(node)
     template = """
-      tf.TensorArray(tf.dtype_name, size=0, dynamic_size=True)
+      ag__.new_list(elements)
     """
-    return templates.replace_as_expression(template, dtype_name=dtype_name)
+    return templates.replace_as_expression(template, elements=node)
 
-  def _pre_populated_list(self, node):
-    raise NotImplementedError('pre-populated lists')
+  def _replace_append_call(self, node):
+    assert len(node.args) == 1
+    assert isinstance(node.func, gast.Attribute)
+    template = """
+      target = ag__.list_append(target, element)
+    """
+    return templates.replace(
+        template,
+        target=node.func.value,
+        element=node.args[0])
+
+  def _replace_pop_call(self, node):
+    # Expressions that use pop() are converted to a statement + expression.
+    #
+    # For example:
+    #
+    #   print(target.pop())
+    #
+    # ... is converted to:
+    #
+    #   target, target_pop = ag__.list_pop(target)
+    #   print(target_pop)
+    #
+    # Here, we just generate the variable name and swap it in,
+    # and _generate_pop_operation will handle the rest.
+    #
+    # Multiple uses of pop() are allowed:
+    #
+    #   print(target.pop(), target.pop())
+    #   print(target.pop().pop())
+    #
+    assert isinstance(node.func, gast.Attribute)
+    scope = anno.getanno(node, NodeAnno.ARGS_SCOPE)
+    target_node = node.func.value
+
+    # Attempt to use a related name if we can get one. Otherwise use something
+    # generic.
+    if anno.hasanno(target_node, anno.Basic.QN):
+      target_name = anno.getanno(target_node, anno.Basic.QN).ssf()
+    else:
+      target_name = 'list'
+    pop_var_name = self.context.namer.new_symbol(target_name, scope.referenced)
+
+    pop_uses = self.get_local(POP_USES, [])
+    pop_uses.append((node, pop_var_name))
+    self.set_local(POP_USES, pop_uses)
+
+    return templates.replace_as_expression('var_name', var_name=pop_var_name)
+
+  def _replace_stack_call(self, node):
+    assert len(node.args) == 1
+    dtype = anno.getanno(
+        node.args[0],
+        'element_type',
+        default=templates.replace_as_expression('None'))
+    template = """
+      ag__.list_stack(
+          target,
+          opts=ag__.ListStackOpts(
+              element_dtype=dtype,
+              original_call=orig_call))
+    """
+    return templates.replace_as_expression(
+        template,
+        dtype=dtype,
+        target=node.args[0],
+        orig_call=node.func)
 
-  def visit_Expr(self, node):
+  def visit_Call(self, node):
     node = self.generic_visit(node)
-    if isinstance(node.value, gast.Call):
-      call_node = node.value
-
-      if not anno.hasanno(call_node.func, anno.Basic.QN):
-        return node
-      qn = anno.getanno(call_node.func, anno.Basic.QN)
-
-      if qn.qn[-1] == 'append' and (len(call_node.args) == 1):
-        template = """
-          target = ag__.utils.dynamic_list_append(target, element)
-        """
-        node = templates.replace(
-            template,
-            target=qn.parent.ast(),
-            element=call_node.args[0])
+
+    # TODO(mdan): This is insufficient if target is a function argument.
+    # In the case of function arguments, we need to add the list to the
+    # function's return value, because it is being modified.
+    # TODO(mdan): Checking just the name is brittle, can it be improved?
+    if isinstance(node.func, gast.Attribute):
+      func_name = node.func.attr
+      if func_name == 'append' and (len(node.args) == 1):
+        node = self._replace_append_call(node)
+      elif func_name == 'pop' and (len(node.args) <= 1):
+        node = self._replace_pop_call(node)
+      elif func_name == 'stack' and (len(node.args) == 1):
+        node = self._replace_stack_call(node)
+
     return node
 
-  def _replace_list_constructors(self, targets, values):
-    for target in targets:
-      if (isinstance(target, (gast.Tuple, gast.List)) and
-          isinstance(values, (gast.Tuple, gast.List))):
-        n_targets = len(target.elts)
-        for i in range(n_targets):
-          target_el, value_el = target.elts[i], values.elts[i]
-          values.elts[i] = self._replace_list_constructors(
-              (target_el,), value_el)
-        return values
-      if isinstance(values, gast.List):
-        if values.elts:
-          return self._pre_populated_list(values)
-        else:
-          return self._empty_list(values)
-    return values
-
-  def visit_Assign(self, node):
-    node = self.generic_visit(node)
+  def _generate_pop_operation(self, original_call_node, pop_var_name):
+    assert isinstance(original_call_node.func, gast.Attribute)
+
+    if original_call_node.args:
+      pop_element = original_call_node.args[0]
+    else:
+      pop_element = parser.parse_expression('None')
+    # The call will be something like "target.pop()", and the dtype is hooked to
+    # target, hence the func.value.
+    dtype = anno.getanno(
+        original_call_node.func.value,
+        'element_type',
+        default=templates.replace_as_expression('None'))
+    shape = anno.getanno(
+        original_call_node.func.value,
+        'element_shape',
+        default=templates.replace_as_expression('None'))
+
+    template = """
+      target, pop_var_name = ag__.list_pop(
+          target, element,
+          opts=ag__.ListPopOpts(element_dtype=dtype, element_shape=shape))
+    """
+    return templates.replace(
+        template,
+        target=original_call_node.func.value,
+        pop_var_name=pop_var_name,
+        element=pop_element,
+        dtype=dtype,
+        shape=shape)
+
+  def _postprocess_statement(self, node):
+    """Inserts any separate pop() calls that node may use."""
+    pop_uses = self.get_local(POP_USES, None)
+    if pop_uses:
+      replacements = []
+      for original_call_node, pop_var_name in pop_uses:
+        replacements.extend(
+            self._generate_pop_operation(original_call_node, pop_var_name))
+      replacements.append(node)
+      node = replacements
+    self.exit_local_scope()
+    return node, None
+
+  # TODO(mdan): Should we have a generic visit_block instead?
+  # Right now it feels that a visit_block would add too much magic that's
+  # hard to follow.
+
+  def _visit_and_process_block(self, block):
+    return self.visit_block(
+        block,
+        before_visit=self.enter_local_scope,
+        after_visit=self._postprocess_statement)
+
+  def visit_FunctionDef(self, node):
+    node.args = self.generic_visit(node.args)
+    node.decorator_list = self.visit_block(node.decorator_list)
+    node.body = self._visit_and_process_block(node.body)
+    return node
+
+  def visit_For(self, node):
+    node.target = self.visit(node.target)
+    node.body = self._visit_and_process_block(node.body)
+    node.orelse = self._visit_and_process_block(node.orelse)
+    return node
+
+  def visit_While(self, node):
+    node.test = self.visit(node.test)
+    node.body = self._visit_and_process_block(node.body)
+    node.orelse = self._visit_and_process_block(node.orelse)
+    return node
+
+  def visit_If(self, node):
+    node.test = self.visit(node.test)
+    node.body = self._visit_and_process_block(node.body)
+    node.orelse = self._visit_and_process_block(node.orelse)
+    return node
 
-    # Only convert lists when they are assigned to a variable, e.g.:
-    #   l = []
-    # TODO(mdan): A similar pattern exists in type_info.py
-    # We should add a generic "unpack_assignment" function to the base
-    # transformer, that has the same effect as applying some logic to the SSA
-    # form.
-    node.value = self._replace_list_constructors(node.targets, node.value)
+  def visit_With(self, node):
+    node.items = self.visit_block(node.items)
+    node.body = self._visit_and_process_block(node.body)
     return node
 
 
diff --git a/tensorflow/contrib/autograph/converters/lists_test.py b/tensorflow/contrib/autograph/converters/lists_test.py
index 74c6dc64f1..9f18ab9f44 100644
--- a/tensorflow/contrib/autograph/converters/lists_test.py
+++ b/tensorflow/contrib/autograph/converters/lists_test.py
@@ -22,74 +22,126 @@ from tensorflow.contrib.autograph import utils
 from tensorflow.contrib.autograph.converters import converter_test_base
 from tensorflow.contrib.autograph.converters import lists
 from tensorflow.python.framework import dtypes
-from tensorflow.python.ops import tensor_array_ops
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import list_ops
 from tensorflow.python.platform import test
 
 
 class ListTest(converter_test_base.TestCase):
 
-  def test_empty_annotated_list(self):
+  def test_empty_list(self):
 
     def test_fn():
-      l = []
-      utils.set_element_type(l, dtypes.int32)
-      l.append(1)
-      return l
+      return []
 
-    node = self.parse_and_analyze(test_fn, {'dtypes': dtypes, 'utils': utils})
+    node = self.parse_and_analyze(test_fn, {})
     node = lists.transform(node, self.ctx)
 
-    with self.compiled(node, tensor_array_ops.TensorArray,
-                       dtypes.int32) as result:
-      # TODO(mdan): Attach these additional modules automatically.
-      result.utils = utils
-      result.dtypes = dtypes
+    with self.compiled(node) as result:
+      tl = result.test_fn()
+      # Empty tensor lists cannot be evaluated or stacked.
+      self.assertTrue(isinstance(tl, ops.Tensor))
+      self.assertEqual(tl.dtype, dtypes.variant)
+
+  def test_initialized_list(self):
+
+    def test_fn():
+      return [1, 2, 3]
+
+    node = self.parse_and_analyze(test_fn, {})
+    node = lists.transform(node, self.ctx)
+
+    with self.compiled(node) as result:
       with self.test_session() as sess:
-        self.assertAllEqual([1], sess.run(result.test_fn().stack()))
+        tl = result.test_fn()
+        r = list_ops.tensor_list_stack(tl, dtypes.int32)
+        self.assertAllEqual(sess.run(r), [1, 2, 3])
 
-  def test_empty_annotated_lists_unpacked(self):
+  def test_list_append(self):
 
     def test_fn():
-      l, m = [], []
-      utils.set_element_type(l, dtypes.int32)
-      utils.set_element_type(m, dtypes.int32)
-      l.append(1)
-      m.append(2)
-      return l, m
+      l = [1]
+      l.append(2)
+      l.append(3)
+      return l
 
-    node = self.parse_and_analyze(test_fn, {'dtypes': dtypes, 'utils': utils})
+    node = self.parse_and_analyze(test_fn, {})
     node = lists.transform(node, self.ctx)
 
-    with self.compiled(node, tensor_array_ops.TensorArray,
-                       dtypes.int32) as result:
+    with self.compiled(node) as result:
+      with self.test_session() as sess:
+        tl = result.test_fn()
+        r = list_ops.tensor_list_stack(tl, dtypes.int32)
+        self.assertAllEqual(sess.run(r), [1, 2, 3])
+
+  def test_list_pop(self):
+
+    def test_fn():
+      l = [1, 2, 3]
+      utils.set_element_type(l, dtypes.int32, ())
+      s = l.pop()
+      return s, l
+
+    node = self.parse_and_analyze(
+        test_fn,
+        {
+            'utils': utils,
+            'dtypes': dtypes
+        },
+        include_type_analysis=True,
+    )
+    node = lists.transform(node, self.ctx)
+
+    with self.compiled(node) as result:
       result.utils = utils
       result.dtypes = dtypes
       with self.test_session() as sess:
-        res_l, res_m = result.test_fn()
-        self.assertEqual([1], sess.run(res_l.stack()))
-        self.assertEqual([2], sess.run(res_m.stack()))
+        ts, tl = result.test_fn()
+        r = list_ops.tensor_list_stack(tl, dtypes.int32)
+        self.assertAllEqual(sess.run(r), [1, 2])
+        self.assertAllEqual(sess.run(ts), 3)
+
+  def test_double_list_pop(self):
 
-  def test_empty_annotated_lists_list_unpacked(self):
+    def test_fn(l):
+      s = l.pop().pop()
+      return s
+
+    node = self.parse_and_analyze(test_fn, {})
+    node = lists.transform(node, self.ctx)
+
+    with self.compiled(node) as result:
+      test_input = [1, 2, [1, 2, 3]]
+      # TODO(mdan): Pass a list of lists of tensor when we fully support that.
+      # For now, we just pass a regular Python list of lists just to verify that
+      # the two pop calls are sequenced properly.
+      self.assertAllEqual(result.test_fn(test_input), 3)
+
+  def test_list_stack(self):
+
+    tf = None  # Will be replaced with a mock.
 
     def test_fn():
-      [l, m] = [], []
+      l = [1, 2, 3]
       utils.set_element_type(l, dtypes.int32)
-      utils.set_element_type(m, dtypes.int32)
-      l.append(1)
-      m.append(2)
-      return l, m
-
-    node = self.parse_and_analyze(test_fn, {'dtypes': dtypes, 'utils': utils})
+      return tf.stack(l)
+
+    node = self.parse_and_analyze(
+        test_fn,
+        {
+            'utils': utils,
+            'dtypes': dtypes
+        },
+        include_type_analysis=True,
+    )
     node = lists.transform(node, self.ctx)
 
-    with self.compiled(node, tensor_array_ops.TensorArray,
-                       dtypes.int32) as result:
+    with self.compiled(node, array_ops.stack, dtypes.int32) as result:
       result.utils = utils
       result.dtypes = dtypes
       with self.test_session() as sess:
-        res_l, res_m = result.test_fn()
-        self.assertEqual([1], sess.run(res_l.stack()))
-        self.assertEqual([2], sess.run(res_m.stack()))
+        self.assertAllEqual(sess.run(result.test_fn()), [1, 2, 3])
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/contrib/autograph/pyct/static_analysis/type_info.py b/tensorflow/contrib/autograph/pyct/static_analysis/type_info.py
index d6555dc7e0..7d1e65c958 100644
--- a/tensorflow/contrib/autograph/pyct/static_analysis/type_info.py
+++ b/tensorflow/contrib/autograph/pyct/static_analysis/type_info.py
@@ -17,8 +17,8 @@
 This analyzer uses known live values to further infer object types. This
 may include for instance constructed objects and object member functions.
 
-In addition, the analyzer will also process annotations for TF (staged) type
-annotations.
+In addition, the analyzer also handles user annotations made in the code (for
+example, the autograph.set_element_type function).
 
 Requires annotations generated by LiveValuesResolver.
 """
@@ -44,6 +44,7 @@ from __future__ import print_function
 import gast
 
 from tensorflow.contrib.autograph.pyct import anno
+from tensorflow.contrib.autograph.pyct import parser
 from tensorflow.contrib.autograph.pyct import transformer
 from tensorflow.python.util import tf_inspect
 
@@ -159,12 +160,10 @@ class TypeInfoResolver(transformer.Base):
       # a = b
       # then for future references to `a` we should have definition = `b`
       definition = self.scope.getval(qn)
-      if anno.hasanno(definition, 'type'):
-        anno.setanno(node, 'type', anno.getanno(definition, 'type'))
-        anno.setanno(node, 'type_fqn', anno.getanno(definition, 'type_fqn'))
-      if anno.hasanno(definition, 'element_type'):
-        anno.setanno(node, 'element_type',
-                     anno.getanno(definition, 'element_type'))
+      anno.copyanno(definition, node, 'type')
+      anno.copyanno(definition, node, 'type_fqn')
+      anno.copyanno(definition, node, 'element_type')
+      anno.copyanno(definition, node, 'element_shape')
     return node
 
   def _process_variable_assignment(self, target, value):
@@ -211,23 +210,20 @@ class TypeInfoResolver(transformer.Base):
       if (anno.getanno(node.func, 'live_val') is
           self.context.type_annotation_func):
 
-        if len(node.args) != 2:
-          raise ValueError('"%s" must have exactly two parameters'
+        if len(node.args) < 2 or len(node.args) > 3:
+          raise ValueError('"%s" must have either two or three parameters'
                            % self.context.type_annotation_func)
-        target_arg, type_arg = node.args
+        if len(node.args) == 2:
+          target_arg, type_arg = node.args
+          shape_arg = parser.parse_expression('None')
+        else:
+          target_arg, type_arg, shape_arg = node.args
         if not anno.hasanno(target_arg, anno.Basic.QN):
           raise ValueError('the first argument of "%s" must by a symbol'
                            % self.context.type_annotation_func)
-        if isinstance(type_arg, gast.Str):
-          element_type = type_arg.s
-        elif isinstance(type_arg, gast.Num):
-          element_type = type_arg.n
-        else:
-          if not anno.hasanno(type_arg, 'live_val'):
-            raise ValueError(
-                'the second argument of "%s" must be statically resolvable' %
-                self.context.type_annotation_func)
-          element_type = anno.getanno(type_arg, 'live_val')
+        # TODO(mdan): This is vulnerable to symbol renaming.
+        element_type = type_arg
+        element_shape = shape_arg
 
         target_symbol = anno.getanno(target_arg, anno.Basic.QN)
         # Find the definition of this symbol and annotate it with the given
@@ -235,7 +231,9 @@ class TypeInfoResolver(transformer.Base):
         # to receive the same type annotation.
         definition = self.scope.getval(target_symbol)
         anno.setanno(node, 'element_type', element_type)
+        anno.setanno(node, 'element_shape', element_shape)
         anno.setanno(definition, 'element_type', element_type)
+        anno.setanno(definition, 'element_shape', element_shape)
         # TODO(mdan): Should we update references between definition and here?
     return self.generic_visit(node)
 
diff --git a/tensorflow/contrib/autograph/pyct/static_analysis/type_info_test.py b/tensorflow/contrib/autograph/pyct/static_analysis/type_info_test.py
index 95cbf5ca79..484562f294 100644
--- a/tensorflow/contrib/autograph/pyct/static_analysis/type_info_test.py
+++ b/tensorflow/contrib/autograph/pyct/static_analysis/type_info_test.py
@@ -187,14 +187,14 @@ class TypeInfoResolverTest(test.TestCase):
 
     def test_fn():
       f = []
-      f = utils.set_element_type(f, Foo)
+      f = utils.set_element_type(f, Foo, (1, 2, 3))
       return f
 
     node = self._parse_and_analyze(test_fn, {'Foo': Foo, 'utils': utils})
     f_def = node.body[0].body[0].value
-    self.assertEqual(anno.getanno(f_def, 'element_type'), Foo)
+    self.assertEqual(anno.getanno(f_def, 'element_type').id, 'Foo')
     f_ref = node.body[0].body[1].value
-    self.assertEqual(anno.getanno(f_ref, 'element_type'), Foo)
+    self.assertEqual(anno.getanno(f_ref, 'element_type').id, 'Foo')
 
   def test_type_annotation_args(self):
 
@@ -207,7 +207,7 @@ class TypeInfoResolverTest(test.TestCase):
 
     node = self._parse_and_analyze(test_fn, {'Foo': Foo, 'utils': utils})
     f_ref = node.body[0].body[1].value
-    self.assertEqual(anno.getanno(f_ref, 'element_type'), Foo)
+    self.assertEqual(anno.getanno(f_ref, 'element_type').id, 'Foo')
 
   def test_nested_unpacking(self):
 
@@ -223,9 +223,9 @@ class TypeInfoResolverTest(test.TestCase):
 
     node = self._parse_and_analyze(test_fn, {'Foo': Foo, 'Bar': Bar})
     a, b, c = node.body[0].body[1].value.elts
-    self.assertEquals(Foo, anno.getanno(a, 'type'))
-    self.assertEquals(Bar, anno.getanno(b, 'type'))
-    self.assertEquals(Foo, anno.getanno(c, 'type'))
+    self.assertEquals(anno.getanno(a, 'type'), Foo)
+    self.assertEquals(anno.getanno(b, 'type'), Bar)
+    self.assertEquals(anno.getanno(c, 'type'), Foo)
     self.assertFalse(anno.hasanno(a, 'live_val'))
     self.assertFalse(anno.hasanno(b, 'live_val'))
     self.assertFalse(anno.hasanno(c, 'live_val'))
@@ -242,8 +242,8 @@ class TypeInfoResolverTest(test.TestCase):
 
     node = self._parse_and_analyze(test_fn, {'utils': utils})
     a, b = node.body[0].body[2].body[2].value.elts
-    self.assertEquals(1, anno.getanno(a, 'element_type'))
-    self.assertEquals(2, anno.getanno(b, 'element_type'))
+    self.assertEquals(anno.getanno(a, 'element_type').n, 1)
+    self.assertEquals(anno.getanno(b, 'element_type').n, 2)
     self.assertFalse(anno.hasanno(a, 'type'))
     self.assertFalse(anno.hasanno(b, 'type'))
     self.assertFalse(anno.hasanno(a, 'live_val'))
-- 
GitLab


From 0d4274943a6bf6d461f5468b05162118934df6b3 Mon Sep 17 00:00:00 2001
From: Sanjoy Das <sanjoy@google.com>
Date: Fri, 8 Jun 2018 11:44:17 -0700
Subject: [PATCH 494/610] [TF:XLA] Bump open source llvm revision to r334273

PiperOrigin-RevId: 199826723
---
 tensorflow/workspace.bzl | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index ce4a009974..4e2f26e097 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -451,11 +451,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   tf_http_archive(
       name = "llvm",
       urls = [
-          "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/7488dbc1218de926f3de0e9bb3d465f3bbe5b80e.tar.gz",
-          "https://github.com/llvm-mirror/llvm/archive/7488dbc1218de926f3de0e9bb3d465f3bbe5b80e.tar.gz",
+          "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/42f7ad099aa73695ea633c585da0a9848d6a730d.tar.gz",
+          "https://github.com/llvm-mirror/llvm/archive/42f7ad099aa73695ea633c585da0a9848d6a730d.tar.gz",
       ],
-      sha256 = "dd4a2e2a4f21ab69cf99534bcb2739c04fc12d12b63e5e3d8f2b85a2eb55d5d1",
-      strip_prefix = "llvm-7488dbc1218de926f3de0e9bb3d465f3bbe5b80e",
+      sha256 = "3a7f1f9c54b51640ba30e40e7e7698bca152e18510001b5a1ad70e8df45e1b05",
+      strip_prefix = "llvm-42f7ad099aa73695ea633c585da0a9848d6a730d",
       build_file = clean_dep("//third_party/llvm:llvm.BUILD"),
   )
 
-- 
GitLab


From f21129b8afc083afbd53b4392762ed7b83205b47 Mon Sep 17 00:00:00 2001
From: Shanqing Cai <cais@google.com>
Date: Fri, 8 Jun 2018 12:07:36 -0700
Subject: [PATCH 495/610] Improve tfdbg documentation regarding high-level APIs

* Mention both keras and tf.keras
* In one of the early paragraphs, list all three high-level APIs supported
  (tf.estimator, keras and tf.contrib.slim).

PiperOrigin-RevId: 199830255
---
 .../docs_src/programmers_guide/debugger.md    | 26 ++++++++++++++-----
 1 file changed, 19 insertions(+), 7 deletions(-)

diff --git a/tensorflow/docs_src/programmers_guide/debugger.md b/tensorflow/docs_src/programmers_guide/debugger.md
index 6bd941886d..fc845c68f4 100644
--- a/tensorflow/docs_src/programmers_guide/debugger.md
+++ b/tensorflow/docs_src/programmers_guide/debugger.md
@@ -33,8 +33,9 @@ and [`inf`s](https://en.wikipedia.org/wiki/Infinity), a frequently-encountered
 type of bug in TensorFlow model development.
 The following example is for users who use the low-level
 [`Session`](https://www.tensorflow.org/api_docs/python/tf/Session) API of
-TensorFlow. A later section of this document describes how to use **tfdbg**
-with a higher-level API, namely `Estimator`s.
+TensorFlow. Later sections of this document describe how to use **tfdbg**
+with higher-level APIs of TensorFlow, including `tf.estimator`,
+`tf.keras` / `keras` and `tf.contrib.slim`.
 To *observe* such an issue, run the following command without the debugger (the
 source code can be found
 [here](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/debug/examples/debug_mnist.py)):
@@ -477,20 +478,31 @@ for more details.
 
 ## Debugging Keras Models with TFDBG
 
-To use TFDBG with [Keras](https://keras.io/), let the Keras backend use
-a TFDBG-wrapped Session object. For example, to use the CLI wrapper:
+To use TFDBG with
+[tf.keras](https://www.tensorflow.org/api_docs/python/tf/keras),
+let the Keras backend use a TFDBG-wrapped Session object. For example, to use
+the CLI wrapper:
 
 ``` python
 import tensorflow as tf
-from keras import backend as keras_backend
 from tensorflow.python import debug as tf_debug
 
-keras_backend.set_session(tf_debug.LocalCLIDebugWrapperSession(tf.Session()))
+tf.keras.backend.set_session(tf_debug.LocalCLIDebugWrapperSession(tf.Session()))
 
 # Define your keras model, called "model".
-model.fit(...)  # This will break into the TFDBG CLI.
+
+# Calls to `fit()`, `evaluate()` and `predict()` methods will break into the
+# TFDBG CLI.
+model.fit(...)
+model.evaluate(...)
+model.predict(...)
 ```
 
+With minor modification, the preceding code example also works for the
+[non-TensorFlow version of Keras](https://keras.io/) running against a
+TensorFlow backend. You just need to replace `tf.keras.backend` with
+`keras.backend`.
+
 ## Debugging tf-slim with TFDBG
 
 TFDBG supports debugging of training and evaluation with
-- 
GitLab


From 9f29e81349e15118847cdaf4029bb76760cf3543 Mon Sep 17 00:00:00 2001
From: Pavithra Vijay <psv@google.com>
Date: Fri, 8 Jun 2018 12:31:49 -0700
Subject: [PATCH 496/610] Fix: Keras models using datasets in eager mode fail
 on float64 data

PiperOrigin-RevId: 199833632
---
 tensorflow/python/keras/engine/training.py    | 11 ++-
 .../python/keras/engine/training_eager.py     | 15 +++-
 .../python/keras/engine/training_test.py      | 70 +++++++++++--------
 .../python/keras/engine/training_utils.py     | 30 ++++++++
 4 files changed, 93 insertions(+), 33 deletions(-)

diff --git a/tensorflow/python/keras/engine/training.py b/tensorflow/python/keras/engine/training.py
index 04a2aa7664..89c1f1a40f 100644
--- a/tensorflow/python/keras/engine/training.py
+++ b/tensorflow/python/keras/engine/training.py
@@ -1008,14 +1008,16 @@ class Model(Network):
     # to keep track of number of inputs and outputs and their ndim.
     if isinstance(inputs, (list, tuple)):
       if tensor_util.is_tensor(inputs[0]):
-        dummy_output_values = self.call(inputs)
+        dummy_output_values = self.call(
+            training_utils.cast_if_floating_dtype(inputs))
       else:
         dummy_output_values = self.call(
             [ops.convert_to_tensor(v, dtype=K.floatx()) for v in inputs])
       dummy_input_values = list(inputs)
     else:
       if tensor_util.is_tensor(inputs):
-        dummy_output_values = self.call(inputs)
+        dummy_output_values = self.call(
+            training_utils.cast_if_floating_dtype(inputs))
       else:
         dummy_output_values = self.call(
             ops.convert_to_tensor(inputs, dtype=K.floatx()))
@@ -1616,7 +1618,10 @@ class Model(Network):
     # Validate and standardize user data.
     inputs, _, _ = self._standardize_user_data(x)
     if context.executing_eagerly():
-      if not isinstance(inputs, iterator_ops.EagerIterator):
+      if (isinstance(x, iterator_ops.EagerIterator) or
+          (isinstance(x, dataset_ops.Dataset) and context.executing_eagerly())):
+        inputs = training_utils.cast_if_floating_dtype(inputs)
+      else:
         inputs = [
             ops.convert_to_tensor(val, dtype=K.floatx()) for val in inputs
         ]
diff --git a/tensorflow/python/keras/engine/training_eager.py b/tensorflow/python/keras/engine/training_eager.py
index 15a7b0c0f2..2ecbff3a1c 100644
--- a/tensorflow/python/keras/engine/training_eager.py
+++ b/tensorflow/python/keras/engine/training_eager.py
@@ -255,6 +255,8 @@ def iterator_fit_loop(model,
     # Validate and standardize data.
     x, y, sample_weights = model._standardize_user_data(
         x, y, class_weight=class_weight)
+    x = training_utils.cast_if_floating_dtype(x)
+    y = training_utils.cast_if_floating_dtype(y)
     if sample_weights:
       sample_weights = [
           ops.convert_to_tensor(val, dtype=backend.floatx())
@@ -471,6 +473,8 @@ def iterator_test_loop(model, inputs, steps, verbose=0):
 
     # Validate and standardize data.
     x, y, sample_weights = model._standardize_user_data(x, y)
+    x = training_utils.cast_if_floating_dtype(x)
+    y = training_utils.cast_if_floating_dtype(y)
 
     # Calculate model output, loss values.
     loss_outs, loss, loss_metrics = _model_loss(
@@ -639,6 +643,7 @@ def iterator_predict_loop(model, inputs, steps, verbose=0):
 
     # Validate and standardize data.
     x, _, _ = model._standardize_user_data(x)
+    x = training_utils.cast_if_floating_dtype(x)
 
     if model._expects_training_arg:
       batch_outs = model.call(x[0] if len(x) == 1 else x, training=False)
@@ -814,7 +819,10 @@ def train_on_batch(model, inputs, targets, sample_weights=None):
   Returns:
       total loss and the loss associated with each output.
   """
-  if len(inputs) and not tensor_util.is_tensor(inputs[0]):
+  if len(inputs) and tensor_util.is_tensor(inputs[0]):
+    inputs = training_utils.cast_if_floating_dtype(inputs)
+    targets = training_utils.cast_if_floating_dtype(targets)
+  else:
     inputs = [
         ops.convert_to_tensor(val, dtype=backend.floatx()) for val in inputs
     ]
@@ -849,7 +857,10 @@ def test_on_batch(model, inputs, targets, sample_weights=None):
   Returns:
       total loss, loss and metrics associated with each output.
   """
-  if len(inputs) and not tensor_util.is_tensor(inputs[0]):
+  if len(inputs) and tensor_util.is_tensor(inputs[0]):
+    inputs = training_utils.cast_if_floating_dtype(inputs)
+    targets = training_utils.cast_if_floating_dtype(targets)
+  else:
     inputs = [
         ops.convert_to_tensor(val, dtype=backend.floatx()) for val in inputs
     ]
diff --git a/tensorflow/python/keras/engine/training_test.py b/tensorflow/python/keras/engine/training_test.py
index 5c02d36382..a1ab720189 100644
--- a/tensorflow/python/keras/engine/training_test.py
+++ b/tensorflow/python/keras/engine/training_test.py
@@ -129,8 +129,10 @@ class TrainingTest(test.TestCase):
           {
               'input_a': input_a_np,
               'input_b': input_b_np
-          }, {'dense': output_d_np,
-              'dropout': output_e_np},
+          }, {
+              'dense': output_d_np,
+              'dropout': output_e_np
+          },
           epochs=1,
           batch_size=5,
           verbose=0)
@@ -138,8 +140,10 @@ class TrainingTest(test.TestCase):
           {
               'input_a': input_a_np,
               'input_b': input_b_np
-          }, {'dense': output_d_np,
-              'dropout': output_e_np},
+          }, {
+              'dense': output_d_np,
+              'dropout': output_e_np
+          },
           epochs=1,
           batch_size=5,
           verbose=1)
@@ -147,8 +151,10 @@ class TrainingTest(test.TestCase):
           {
               'input_a': input_a_np,
               'input_b': input_b_np
-          }, {'dense': output_d_np,
-              'dropout': output_e_np},
+          }, {
+              'dense': output_d_np,
+              'dropout': output_e_np
+          },
           validation_data=({
               'input_a': input_a_np,
               'input_b': input_b_np
@@ -162,8 +168,10 @@ class TrainingTest(test.TestCase):
       model.train_on_batch({
           'input_a': input_a_np,
           'input_b': input_b_np
-      }, {'dense': output_d_np,
-          'dropout': output_e_np})
+      }, {
+          'dense': output_d_np,
+          'dropout': output_e_np
+      })
 
       # Test with lists for loss, metrics
       loss = ['mae', 'mse']
@@ -285,16 +293,20 @@ class TrainingTest(test.TestCase):
           {
               'input_a': input_a_np,
               'input_b': input_b_np
-          }, {'dense': output_d_np,
-              'dropout': output_e_np},
+          }, {
+              'dense': output_d_np,
+              'dropout': output_e_np
+          },
           batch_size=5,
           verbose=0)
       model.evaluate(
           {
               'input_a': input_a_np,
               'input_b': input_b_np
-          }, {'dense': output_d_np,
-              'dropout': output_e_np},
+          }, {
+              'dense': output_d_np,
+              'dropout': output_e_np
+          },
           batch_size=5,
           verbose=1)
 
@@ -349,9 +361,11 @@ class TrainingTest(test.TestCase):
 
     with self.test_session():
       test_inputs = [
-          scipy_sparse.random(6, 3, density=0.25).tocsr() for _ in range(2)]
+          scipy_sparse.random(6, 3, density=0.25).tocsr() for _ in range(2)
+      ]
       test_outputs = [
-          scipy_sparse.random(6, i, density=0.25).tocsr() for i in range(3, 5)]
+          scipy_sparse.random(6, i, density=0.25).tocsr() for i in range(3, 5)
+      ]
       in1 = keras.layers.Input(shape=(3,))
       in2 = keras.layers.Input(shape=(3,))
       out1 = keras.layers.Dropout(0.5, name='dropout')(in1)
@@ -1721,8 +1735,8 @@ class TestTrainingWithDatasetIterators(test.TestCase):
       metrics = ['mae']
       model.compile(optimizer, loss, metrics=metrics)
 
-      inputs = np.zeros((10, 3), dtype=np.float32)
-      targets = np.zeros((10, 4), dtype=np.float32)
+      inputs = np.zeros((10, 3))
+      targets = np.zeros((10, 4))
       dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
       dataset = dataset.repeat(100)
       dataset = dataset.batch(10)
@@ -1786,8 +1800,8 @@ class TestTrainingWithDatasetIterators(test.TestCase):
       metrics = ['mae']
       model.compile(optimizer, loss, metrics=metrics)
 
-      inputs = np.zeros((10, 3), dtype=np.float32)
-      targets = np.zeros((10, 4), dtype=np.float32)
+      inputs = np.zeros((10, 3))
+      targets = np.zeros((10, 4))
       dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
       dataset = dataset.repeat(100)
       dataset = dataset.batch(10)
@@ -1811,8 +1825,8 @@ class TestTrainingWithDatasetIterators(test.TestCase):
       metrics = ['mae']
       model.compile(optimizer, loss, metrics=metrics)
 
-      inputs = np.zeros((10, 3), dtype=np.float32)
-      targets = np.zeros((10, 4), dtype=np.float32)
+      inputs = np.zeros((10, 3))
+      targets = np.zeros((10, 4))
       dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
       dataset = dataset.repeat(2)
       dataset = dataset.batch(10)
@@ -1838,8 +1852,8 @@ class TestTrainingWithDataset(test.TestCase):
       metrics = ['mae']
       model.compile(optimizer, loss, metrics=metrics)
 
-      inputs = np.zeros((10, 3), dtype=np.float32)
-      targets = np.zeros((10, 4), dtype=np.float32)
+      inputs = np.zeros((10, 3))
+      targets = np.zeros((10, 4))
       dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
       dataset = dataset.repeat(100)
       dataset = dataset.batch(10)
@@ -1865,8 +1879,8 @@ class TestTrainingWithDataset(test.TestCase):
       metrics = ['mae']
       model.compile(optimizer, loss, metrics=metrics)
 
-      inputs = np.zeros((10, 3), dtype=np.float32)
-      targets = np.zeros((10, 4), dtype=np.float32)
+      inputs = np.zeros((10, 3))
+      targets = np.zeros((10, 4))
       dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
       dataset = dataset.repeat(100)
       dataset = dataset.batch(10)
@@ -1928,8 +1942,8 @@ class TestTrainingWithDataset(test.TestCase):
       model.compile(optimizer, loss)
 
       # User forgets to batch the dataset
-      inputs = np.zeros((10, 3), dtype=np.float32)
-      targets = np.zeros((10, 4), dtype=np.float32)
+      inputs = np.zeros((10, 3))
+      targets = np.zeros((10, 4))
       dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
       dataset = dataset.repeat(100)
 
@@ -1938,8 +1952,8 @@ class TestTrainingWithDataset(test.TestCase):
         model.train_on_batch(dataset)
 
       # Wrong input shape
-      inputs = np.zeros((10, 5), dtype=np.float32)
-      targets = np.zeros((10, 4), dtype=np.float32)
+      inputs = np.zeros((10, 5))
+      targets = np.zeros((10, 4))
       dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
       dataset = dataset.repeat(100)
       dataset = dataset.batch(10)
diff --git a/tensorflow/python/keras/engine/training_utils.py b/tensorflow/python/keras/engine/training_utils.py
index b93f999444..728a2b493b 100644
--- a/tensorflow/python/keras/engine/training_utils.py
+++ b/tensorflow/python/keras/engine/training_utils.py
@@ -553,6 +553,10 @@ def standardize_weights(y,
 def has_symbolic_tensors(ls):
   if context.executing_eagerly():
     return False
+  return has_tensors(ls)
+
+
+def has_tensors(ls):
   if isinstance(ls, (list, tuple)):
     return any(tensor_util.is_tensor(v) for v in ls)
   return tensor_util.is_tensor(ls)
@@ -692,3 +696,29 @@ def check_steps_argument(input_data, steps, steps_name):
                            input_type=input_type_str, steps_name=steps_name))
     return True
   return False
+
+
+def cast_if_floating_dtype(x):
+  """Casts the given data tensors to the default floating point type.
+
+  Casts only if the input is already a floating point type.
+  Args:
+    x: tensor or list/tuple of tensors.
+
+  Returns:
+    Converted input.
+
+  Raises:
+    RuntimeError: if data isn't tensors.
+  """
+  if not has_tensors(x):
+    raise RuntimeError(
+        'Please provide tensors for casting, got: {x}'.format(x=x))
+
+  if isinstance(x, (list, tuple)):
+    return [
+        math_ops.cast(val, dtype=K.floatx())
+        if tensor_util.is_tensor(val) and val.dtype.is_floating else val
+        for val in x
+    ]
+  return math_ops.cast(x, dtype=K.floatx()) if x.dtype.is_floating else x
-- 
GitLab


From 503b7c11b44ee8b238946b345efea503058652c0 Mon Sep 17 00:00:00 2001
From: Nishidha <nishidha@us.ibm.com>
Date: Sat, 9 Jun 2018 01:07:06 +0530
Subject: [PATCH 497/610] Skipped the check that fails due to overflow error as
 float128 datatype is same as float64 instead of longdouble on platforms like
 Power - Issue# 19694 (#19860)

---
 .../bijectors/sinh_arcsinh_bijector_test.py   | 28 ++++++++++++-------
 1 file changed, 18 insertions(+), 10 deletions(-)

diff --git a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sinh_arcsinh_bijector_test.py b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sinh_arcsinh_bijector_test.py
index 45760a29ee..795f1993ba 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sinh_arcsinh_bijector_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/bijectors/sinh_arcsinh_bijector_test.py
@@ -151,16 +151,24 @@ class SinhArcsinhBijectorTest(test.TestCase):
         self.assertAllClose(y, bijector.forward(x).eval(), rtol=1e-4, atol=0.)
         self.assertAllClose(x, bijector.inverse(y).eval(), rtol=1e-4, atol=0.)
 
-        # Do the numpy calculation in float128 to avoid inf/nan.
-        y_float128 = np.float128(y)
-        self.assertAllClose(
-            np.log(np.cosh(
-                np.arcsinh(y_float128) / tailweight - skewness) / np.sqrt(
-                    y_float128**2 + 1)) -
-            np.log(tailweight),
-            bijector.inverse_log_det_jacobian(y, event_ndims=0).eval(),
-            rtol=1e-4,
-            atol=0.)
+        # On IBM PPC systems, longdouble (np.float128) has the same range as double, only with more precision,
+        # so it cannot hold the square of values near the float64 maximum: the computation below would
+        # overflow to inf and the test would fail. The check below avoids that by skipping the square
+        # calculation and the corresponding assert when y is too large to be squared in np.float128.
+
+        if np.amax(y) <= np.sqrt(np.finfo(np.float128).max) and \
+           np.fabs(np.amin(y)) <= np.sqrt(np.fabs(np.finfo(np.float128).min)):
+
+          # Do the numpy calculation in float128 to avoid inf/nan.
+          y_float128 = np.float128(y)
+          self.assertAllClose(
+              np.log(np.cosh(
+                  np.arcsinh(y_float128) / tailweight - skewness) / np.sqrt(
+                      y_float128**2 + 1)) -
+              np.log(tailweight),
+              bijector.inverse_log_det_jacobian(y, event_ndims=0).eval(),
+              rtol=1e-4,
+              atol=0.)
         self.assertAllClose(
             -bijector.inverse_log_det_jacobian(y, event_ndims=0).eval(),
             bijector.forward_log_det_jacobian(x, event_ndims=0).eval(),
-- 
GitLab


From 055a0af39189924c52b12e875e7694e6c99a25d0 Mon Sep 17 00:00:00 2001
From: Pavithra Vijay <psv@google.com>
Date: Fri, 8 Jun 2018 12:34:43 -0700
Subject: [PATCH 498/610] Fix: Add back test case to test generator methods.

PiperOrigin-RevId: 199834091
---
 .../python/keras/engine/training_eager_test.py | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/tensorflow/python/keras/engine/training_eager_test.py b/tensorflow/python/keras/engine/training_eager_test.py
index 7906d208eb..1571a7782a 100644
--- a/tensorflow/python/keras/engine/training_eager_test.py
+++ b/tensorflow/python/keras/engine/training_eager_test.py
@@ -403,6 +403,24 @@ class TrainingTest(test.TestCase):
     model.train_on_batch(inputs, targets)
     model.test_on_batch(inputs, targets)
 
+  def test_generator_methods(self):
+    model = keras.Sequential()
+    model.add(keras.layers.Dense(4, input_shape=(3,)))
+    optimizer = RMSPropOptimizer(learning_rate=0.001)
+    model.compile(optimizer, 'mse', metrics=['mae'])
+
+    x = np.random.random((10, 3))
+    y = np.random.random((10, 4))
+
+    def iterator():
+      while True:
+        yield x, y
+
+    model.fit_generator(iterator(), steps_per_epoch=3, epochs=1)
+    model.evaluate_generator(iterator(), steps=3)
+    out = model.predict_generator(iterator(), steps=3)
+    self.assertEqual(out.shape, (30, 4))
+
 
 class LossWeightingTest(test.TestCase):
 
-- 
GitLab


From a6a265b61a9ad9510f45cf4c9032778bf2e042b9 Mon Sep 17 00:00:00 2001
From: SRIRAM VETURI <sriram.tutu@gmail.com>
Date: Fri, 8 Jun 2018 14:38:48 -0500
Subject: [PATCH 499/610] Added the tutorials link (#19844)

First-time users need a clear pointer to the tutorials, where they can find additional resources for learning how to do specific tasks in TensorFlow.
---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 6fb4486d0d..63853137cf 100644
--- a/README.md
+++ b/README.md
@@ -56,6 +56,7 @@ $ python
 42
 >>> sess.close()
 ```
+For more examples of how to do specific tasks in TensorFlow, see the [tutorials page of tensorflow.org](https://www.tensorflow.org/tutorials/).
 
 ## Contribution guidelines
 
-- 
GitLab


From 5b540fe049fbb675eb1b5ea7d03fb4cb96a642c4 Mon Sep 17 00:00:00 2001
From: Jiri Simsa <jsimsa@google.com>
Date: Fri, 8 Jun 2018 12:36:55 -0700
Subject: [PATCH 500/610] [tf.data] Adding optimization for rewriting
 `shuffle(...).repeat(...)` to `shuffle_and_repeat(...)`.

PiperOrigin-RevId: 199834400
---
 .../core/grappler/optimizers/data/BUILD       |  35 ++++
 .../grappler/optimizers/data/graph_utils.cc   |  17 +-
 .../grappler/optimizers/data/graph_utils.h    |   4 +
 .../optimizers/data/graph_utils_test.cc       |  15 ++
 .../optimizers/data/map_and_batch_fusion.cc   |  20 ++-
 .../optimizers/data/map_and_batch_fusion.h    |   8 +-
 .../data/map_and_batch_fusion_test.cc         |  23 ++-
 .../data/shuffle_and_repeat_fusion.cc         | 112 +++++++++++++
 .../data/shuffle_and_repeat_fusion.h          |  46 ++++++
 .../data/shuffle_and_repeat_fusion_test.cc    | 149 ++++++++++++++++++
 10 files changed, 410 insertions(+), 19 deletions(-)
 create mode 100644 tensorflow/core/grappler/optimizers/data/shuffle_and_repeat_fusion.cc
 create mode 100644 tensorflow/core/grappler/optimizers/data/shuffle_and_repeat_fusion.h
 create mode 100644 tensorflow/core/grappler/optimizers/data/shuffle_and_repeat_fusion_test.cc

diff --git a/tensorflow/core/grappler/optimizers/data/BUILD b/tensorflow/core/grappler/optimizers/data/BUILD
index 121de1e089..08fc9d84da 100644
--- a/tensorflow/core/grappler/optimizers/data/BUILD
+++ b/tensorflow/core/grappler/optimizers/data/BUILD
@@ -67,11 +67,46 @@ tf_cc_test(
     ],
 )
 
+cc_library(
+    name = "shuffle_and_repeat_fusion",
+    srcs = ["shuffle_and_repeat_fusion.cc"],
+    hdrs = [
+        "shuffle_and_repeat_fusion.h",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":graph_utils",
+        "//tensorflow/core:lib",
+        "//tensorflow/core/grappler:graph_view",
+        "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler:op_types",
+        "//tensorflow/core/grappler:utils",
+        "//tensorflow/core/grappler/clusters:cluster",
+        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer",
+        "//tensorflow/core/grappler/optimizers:custom_graph_optimizer_registry",
+    ] + tf_protos_all(),
+)
+
+tf_cc_test(
+    name = "shuffle_and_repeat_fusion_test",
+    srcs = ["shuffle_and_repeat_fusion_test.cc"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":graph_utils",
+        ":shuffle_and_repeat_fusion",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core/grappler:grappler_item",
+    ],
+)
+
 cc_library(
     name = "data",
     visibility = ["//visibility:public"],
     deps = [
         ":map_and_batch_fusion",
+        ":shuffle_and_repeat_fusion",
     ],
     alwayslink = 1,
 )
diff --git a/tensorflow/core/grappler/optimizers/data/graph_utils.cc b/tensorflow/core/grappler/optimizers/data/graph_utils.cc
index df12de37da..aece142f7a 100644
--- a/tensorflow/core/grappler/optimizers/data/graph_utils.cc
+++ b/tensorflow/core/grappler/optimizers/data/graph_utils.cc
@@ -28,6 +28,8 @@ namespace grappler {
 namespace graph_utils {
 namespace {
 
+constexpr char kConstOpName[] = "Const";
+
 int FindNodeWithPredicate(const std::function<bool(const NodeDef&)>& predicate,
                           const GraphDef& graph) {
   for (int i = 0; i < graph.node_size(); ++i) {
@@ -68,9 +70,8 @@ Status AddScalarConstNodeHelper(
     DataType dtype, const std::function<void(TensorProto*)>& add_value,
     GraphDef* graph, NodeDef** result) {
   NodeDef* node = graph->add_node();
-  const string& name = strings::StrCat("Const/_", graph->node_size());
-  node->set_name(name);
-  node->set_op("Const");
+  node->set_op(kConstOpName);
+  SetUniqueName(kConstOpName, graph, node);
   (*node->mutable_attr())["dtype"].set_type(dtype);
   std::unique_ptr<tensorflow::TensorProto> tensor =
       tensorflow::MakeUnique<tensorflow::TensorProto>();
@@ -94,7 +95,7 @@ Status AddNode(const string& name, const string& op,
   if (!name.empty()) {
     node->set_name(name);
   } else {
-    node->set_name(strings::StrCat(op, "/_", graph->node_size()));
+    SetUniqueName(op, graph, node);
   }
   node->set_op(op);
   for (const string& input : inputs) {
@@ -212,6 +213,14 @@ int FindNodeWithOp(const string& op, const GraphDef& graph) {
       [op](const NodeDef& node) { return node.op() == op; }, graph);
 }
 
+void SetUniqueName(const string& op, GraphDef* graph, NodeDef* node) {
+  int id = graph->node_size();
+  while (ContainsNodeWithName(strings::StrCat(op, "/_", id), *graph)) {
+    ++id;
+  }
+  node->set_name(strings::StrCat(op, "/_", id));
+}
+
 }  // end namespace graph_utils
 }  // end namespace grappler
 }  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/graph_utils.h b/tensorflow/core/grappler/optimizers/data/graph_utils.h
index b40ca44d78..3d2467031f 100644
--- a/tensorflow/core/grappler/optimizers/data/graph_utils.h
+++ b/tensorflow/core/grappler/optimizers/data/graph_utils.h
@@ -74,6 +74,10 @@ int FindNodeWithName(const string& name, const GraphDef& graph);
 // exists.
 int FindNodeWithOp(const string& op, const GraphDef& graph);
 
+// Sets the node name using the op name as a prefix while guaranteeing the name
+// is unique across the graph.
+void SetUniqueName(const string& op, GraphDef* graph, NodeDef* node);
+
 }  // end namespace graph_utils
 }  // end namespace grappler
 }  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/graph_utils_test.cc b/tensorflow/core/grappler/optimizers/data/graph_utils_test.cc
index b34726044e..00f66c9bc1 100644
--- a/tensorflow/core/grappler/optimizers/data/graph_utils_test.cc
+++ b/tensorflow/core/grappler/optimizers/data/graph_utils_test.cc
@@ -136,6 +136,21 @@ TEST_F(GraphUtilsTest, FindNodeWithOp) {
   EXPECT_EQ(FindNodeWithOp("OpA", graph), -1);
 }
 
+TEST_F(GraphUtilsTest, SetUniqueName) {
+  GraphDef graph;
+
+  NodeDef* node1;
+  TF_EXPECT_OK(AddNode("", "A", {}, {}, &graph, &node1));
+  NodeDef* node2;
+  TF_EXPECT_OK(AddNode("", "A", {}, {}, &graph, &node2));
+  EXPECT_NE(node1->name(), node2->name());
+
+  TF_EXPECT_OK(DeleteNodes({node1->name()}, &graph));
+  NodeDef* node3;
+  TF_EXPECT_OK(AddNode("", "A", {}, {}, &graph, &node3));
+  EXPECT_NE(node2->name(), node3->name());
+}
+
 }  // namespace
 }  // namespace graph_utils
 }  // namespace grappler
diff --git a/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion.cc b/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion.cc
index 290326ab75..a28b21224e 100644
--- a/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion.cc
+++ b/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion.cc
@@ -28,6 +28,11 @@ limitations under the License.
 
 namespace tensorflow {
 namespace grappler {
+namespace {
+
+constexpr char kFusedOpName[] = "MapAndBatchDatasetV2";
+
+}  // namespace
 
 Status MapAndBatchFusion::Optimize(Cluster* cluster, const GrapplerItem& item,
                                    GraphDef* output) {
@@ -39,21 +44,20 @@ Status MapAndBatchFusion::Optimize(Cluster* cluster, const GrapplerItem& item,
       continue;
     }
 
-    // Use a more descriptive variable name now that we now the node type.
-    NodeDef batch_node(node);
+    // Use a more descriptive variable name now that we know the node type.
+    const NodeDef batch_node(node);
     GraphView::InputPort input_port = graph.GetInputPort(batch_node.name(), 0);
     NodeDef* node2 = graph.GetRegularFanin(input_port).node;
     if (node2->op() != "MapDataset" && node2->op() != "ParallelMapDataset") {
       continue;
     }
 
-    // Use a more descriptive variable name now that we now the node type.
-    NodeDef* map_node = node2;
-    NodeDef* new_node = output->mutable_node()->Add();
-    new_node->set_op("MapAndBatchDatasetV2");
-    new_node->set_name(
-        strings::StrCat("MapAndBatchDatasetV2/_", output->node_size()));
+    NodeDef* new_node = output->add_node();
+    new_node->set_op(kFusedOpName);
+    graph_utils::SetUniqueName(kFusedOpName, output, new_node);
 
+    // Use a more descriptive variable name now that we know the node type.
+    NodeDef* map_node = node2;
     // Set the `input` input argument.
     new_node->add_input(map_node->input(0));
 
diff --git a/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion.h b/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion.h
index a5a4d91df6..2c64831105 100644
--- a/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion.h
+++ b/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion.h
@@ -23,13 +23,13 @@ namespace grappler {
 
 class MapAndBatchFusion : public CustomGraphOptimizer {
  public:
-  MapAndBatchFusion() {}
-  ~MapAndBatchFusion() override {}
+  MapAndBatchFusion() = default;
+  ~MapAndBatchFusion() override = default;
 
   string name() const override { return "map_and_batch_fusion"; };
 
-  Status Init(const tensorflow::RewriterConfig_CustomGraphOptimizer* config =
-                  nullptr) override {
+  Status Init(
+      const tensorflow::RewriterConfig_CustomGraphOptimizer* config) override {
     return Status::OK();
   }
 
diff --git a/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion_test.cc b/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion_test.cc
index 8c7498dc5d..76d2f5d537 100644
--- a/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion_test.cc
+++ b/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion_test.cc
@@ -204,10 +204,9 @@ TEST(MapAndBatchFusionTest, FuseParallelMapAndBatchNodesIntoOne) {
 }
 
 TEST(MapAndBatchFusionTest, NoChange) {
-  std::vector<std::pair<string, AttrValue>> empty_attributes;
-
   GrapplerItem item;
   GraphDef *graph = &item.graph;
+
   NodeDef *start_node;
   TF_ASSERT_OK(graph_utils::AddScalarConstNode<int64>(0, graph, &start_node));
   NodeDef *stop_node;
@@ -219,9 +218,27 @@ TEST(MapAndBatchFusionTest, NoChange) {
   range_inputs[0] = start_node->name();
   range_inputs[1] = stop_node->name();
   range_inputs[2] = step_node->name();
+  std::vector<std::pair<string, AttrValue>> range_attrs;
   NodeDef *range_node;
   TF_ASSERT_OK(graph_utils::AddNode("", "RangeDataset", range_inputs,
-                                    empty_attributes, graph, &range_node));
+                                    range_attrs, graph, &range_node));
+
+  NodeDef *batch_size_node;
+  TF_ASSERT_OK(
+      graph_utils::AddScalarConstNode<int64>(5, graph, &batch_size_node));
+  std::vector<string> batch_inputs(2);
+  batch_inputs[0] = range_node->name();
+  batch_inputs[1] = batch_size_node->name();
+  std::vector<std::pair<string, AttrValue>> batch_attrs(2);
+  AttrValue shapes_attr;
+  SetAttrValue("output_shapes", &shapes_attr);
+  batch_attrs[0] = std::make_pair("output_shapes", shapes_attr);
+  AttrValue types_attr;
+  SetAttrValue("output_types", &types_attr);
+  batch_attrs[1] = std::make_pair("output_types", types_attr);
+  NodeDef *batch_node;
+  TF_ASSERT_OK(graph_utils::AddNode("", "BatchDataset", batch_inputs,
+                                    batch_attrs, graph, &batch_node));
 
   MapAndBatchFusion optimizer;
   GraphDef output;
diff --git a/tensorflow/core/grappler/optimizers/data/shuffle_and_repeat_fusion.cc b/tensorflow/core/grappler/optimizers/data/shuffle_and_repeat_fusion.cc
new file mode 100644
index 0000000000..0df73b33ed
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/data/shuffle_and_repeat_fusion.cc
@@ -0,0 +1,112 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/optimizers/data/shuffle_and_repeat_fusion.h"
+
+#include "tensorflow/core/framework/attr_value.pb.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/grappler/clusters/cluster.h"
+#include "tensorflow/core/grappler/graph_view.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/op_types.h"
+#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.h"
+#include "tensorflow/core/grappler/optimizers/data/graph_utils.h"
+#include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/platform/protobuf.h"
+
+namespace tensorflow {
+namespace grappler {
+namespace {
+
+constexpr char kFusedOpName[] = "ShuffleAndRepeatDataset";
+
+}  // namespace
+
+// Fuses each `ShuffleDataset` that directly feeds a `RepeatDataset` in
+// `item.graph` into a single `ShuffleAndRepeatDataset` node. The rewritten
+// graph is stored in `output`. `cluster` is not used by this pass.
+Status ShuffleAndRepeatFusion::Optimize(Cluster* cluster,
+                                        const GrapplerItem& item,
+                                        GraphDef* output) {
+  *output = item.graph;
+  GraphView graph(output);
+  std::set<string> nodes_to_delete;
+  for (const NodeDef& repeat_node : item.graph.node()) {
+    if (repeat_node.op() != "RepeatDataset") {
+      continue;
+    }
+
+    // Only fuse when input 0 of the `Repeat` node is produced by a `Shuffle`
+    // node, since only that pair maps onto the fused op.
+    GraphView::InputPort input_port = graph.GetInputPort(repeat_node.name(), 0);
+    NodeDef* shuffle_node = graph.GetRegularFanin(input_port).node;
+    if (shuffle_node->op() != "ShuffleDataset") {
+      continue;
+    }
+
+    NodeDef* new_node = output->add_node();
+    new_node->set_op(kFusedOpName);
+    graph_utils::SetUniqueName(kFusedOpName, output, new_node);
+
+    // Set the `input` input argument.
+    new_node->add_input(shuffle_node->input(0));
+
+    // Set the `buffer_size` input argument.
+    new_node->add_input(shuffle_node->input(1));
+
+    // Set the `seed` input argument.
+    new_node->add_input(shuffle_node->input(2));
+
+    // Set the `seed2` input argument.
+    new_node->add_input(shuffle_node->input(3));
+
+    // Set the `count` input argument.
+    new_node->add_input(repeat_node.input(1));
+
+    // Set `output_types` and `output_shapes` attributes.
+    for (auto key : {"output_shapes", "output_types"}) {
+      (*new_node->mutable_attr())[key] = repeat_node.attr().at(key);
+    }
+
+    // Mark the `Shuffle` and `Repeat` nodes for removal.
+    nodes_to_delete.insert(shuffle_node->name());
+    nodes_to_delete.insert(repeat_node.name());
+
+    // Redirect every consumer of the `Repeat` node to `ShuffleAndRepeat`. A
+    // consumer may read the dataset on any of its inputs, so rewrite the input
+    // slot recorded in the fanout port instead of assuming slot 0.
+    GraphView::OutputPort output_port =
+        graph.GetOutputPort(repeat_node.name(), 0);
+    for (const auto& consumer : graph.GetFanout(output_port)) {
+      consumer.node->set_input(consumer.port_id, new_node->name());
+    }
+  }
+  // Remove the fused `Shuffle` and `Repeat` nodes from `output`.
+  TF_RETURN_IF_ERROR(graph_utils::DeleteNodes(nodes_to_delete, output));
+  return Status::OK();
+}
+
+void ShuffleAndRepeatFusion::Feedback(Cluster* cluster,
+                                      const GrapplerItem& item,
+                                      const GraphDef& optimize_output,
+                                      double result) {
+  // no-op
+}
+
+REGISTER_GRAPH_OPTIMIZER_AS(ShuffleAndRepeatFusion,
+                            "shuffle_and_repeat_fusion");
+
+}  // end namespace grappler
+}  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/shuffle_and_repeat_fusion.h b/tensorflow/core/grappler/optimizers/data/shuffle_and_repeat_fusion.h
new file mode 100644
index 0000000000..c8fa53edce
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/data/shuffle_and_repeat_fusion.h
@@ -0,0 +1,46 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_SHUFFLE_AND_REPEAT_FUSION_H_
+#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_SHUFFLE_AND_REPEAT_FUSION_H_
+
+#include "tensorflow/core/grappler/optimizers/custom_graph_optimizer.h"
+
+namespace tensorflow {
+namespace grappler {
+
+class ShuffleAndRepeatFusion : public CustomGraphOptimizer {
+ public:
+  ShuffleAndRepeatFusion() = default;
+  ~ShuffleAndRepeatFusion() override = default;
+
+  string name() const override { return "shuffle_and_repeat_fusion"; };
+
+  Status Init(
+      const tensorflow::RewriterConfig_CustomGraphOptimizer* config) override {
+    return Status::OK();
+  }
+
+  Status Optimize(Cluster* cluster, const GrapplerItem& item,
+                  GraphDef* output) override;
+
+  void Feedback(Cluster* cluster, const GrapplerItem& item,
+                const GraphDef& optimize_output, double result) override;
+};
+
+}  // end namespace grappler
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_DATA_SHUFFLE_AND_REPEAT_FUSION_H_
diff --git a/tensorflow/core/grappler/optimizers/data/shuffle_and_repeat_fusion_test.cc b/tensorflow/core/grappler/optimizers/data/shuffle_and_repeat_fusion_test.cc
new file mode 100644
index 0000000000..e89675efb7
--- /dev/null
+++ b/tensorflow/core/grappler/optimizers/data/shuffle_and_repeat_fusion_test.cc
@@ -0,0 +1,149 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/grappler/optimizers/data/shuffle_and_repeat_fusion.h"
+
+#include "tensorflow/core/framework/attr_value_util.h"
+#include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/optimizers/data/graph_utils.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace grappler {
+namespace {
+
+TEST(ShuffleAndRepeatFusionTest, FuseShuffleAndRepeatNodesIntoOne) {
+  GrapplerItem item;
+  GraphDef *graph = &item.graph;
+
+  std::vector<std::pair<string, AttrValue>> common_attrs(2);
+  AttrValue shapes_attr;
+  SetAttrValue("output_shapes", &shapes_attr);
+  common_attrs[0] = std::make_pair("output_shapes", shapes_attr);
+  AttrValue types_attr;
+  SetAttrValue("output_types", &types_attr);
+  common_attrs[1] = std::make_pair("output_types", types_attr);
+
+  NodeDef *start_node;
+  TF_ASSERT_OK(graph_utils::AddScalarConstNode<int64>(0, graph, &start_node));
+  NodeDef *stop_node;
+  TF_ASSERT_OK(graph_utils::AddScalarConstNode<int64>(10, graph, &stop_node));
+  NodeDef *step_node;
+  TF_ASSERT_OK(graph_utils::AddScalarConstNode<int64>(1, graph, &step_node));
+
+  std::vector<string> range_inputs(3);
+  range_inputs[0] = start_node->name();
+  range_inputs[1] = stop_node->name();
+  range_inputs[2] = step_node->name();
+  NodeDef *range_node;
+  TF_ASSERT_OK(graph_utils::AddNode("", "RangeDataset", range_inputs,
+                                    common_attrs, graph, &range_node));
+
+  NodeDef *buffer_size_node;
+  TF_ASSERT_OK(
+      graph_utils::AddScalarConstNode<int64>(128, graph, &buffer_size_node));
+  NodeDef *seed_node;
+  TF_ASSERT_OK(graph_utils::AddScalarConstNode<int64>(-1, graph, &seed_node));
+  NodeDef *seed2_node;
+  TF_ASSERT_OK(graph_utils::AddScalarConstNode<int64>(-1, graph, &seed2_node));
+  std::vector<string> shuffle_inputs(4);
+  shuffle_inputs[0] = range_node->name();
+  shuffle_inputs[1] = buffer_size_node->name();
+  shuffle_inputs[2] = seed_node->name();
+  shuffle_inputs[3] = seed2_node->name();
+  NodeDef *shuffle_node;
+  TF_ASSERT_OK(graph_utils::AddNode("", "ShuffleDataset", shuffle_inputs,
+                                    common_attrs, graph, &shuffle_node));
+
+  NodeDef *count_node;
+  TF_ASSERT_OK(graph_utils::AddScalarConstNode<int64>(-1, graph, &count_node));
+  std::vector<string> repeat_inputs(2);
+  repeat_inputs[0] = shuffle_node->name();
+  repeat_inputs[1] = count_node->name();
+  NodeDef *repeat_node;
+  TF_ASSERT_OK(graph_utils::AddNode("", "RepeatDataset", repeat_inputs,
+                                    common_attrs, graph, &repeat_node));
+
+  ShuffleAndRepeatFusion optimizer;
+  GraphDef output;
+  TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
+
+  EXPECT_FALSE(graph_utils::ContainsNodeWithName(shuffle_node->name(), output));
+  EXPECT_FALSE(graph_utils::ContainsNodeWithName(repeat_node->name(), output));
+  EXPECT_TRUE(
+      graph_utils::ContainsNodeWithOp("ShuffleAndRepeatDataset", output));
+  NodeDef shuffle_and_repeat_node = output.node(
+      graph_utils::FindNodeWithOp("ShuffleAndRepeatDataset", output));
+  EXPECT_EQ(shuffle_and_repeat_node.input_size(), 5);
+  EXPECT_EQ(shuffle_and_repeat_node.input(0), shuffle_node->input(0));
+  EXPECT_EQ(shuffle_and_repeat_node.input(1), shuffle_node->input(1));
+  EXPECT_EQ(shuffle_and_repeat_node.input(2), shuffle_node->input(2));
+  EXPECT_EQ(shuffle_and_repeat_node.input(3), shuffle_node->input(3));
+  EXPECT_EQ(shuffle_and_repeat_node.input(4), repeat_node->input(1));
+  EXPECT_TRUE(
+      AreAttrValuesEqual(shuffle_and_repeat_node.attr().at("output_shapes"),
+                         repeat_node->attr().at("output_shapes")));
+  EXPECT_TRUE(
+      AreAttrValuesEqual(shuffle_and_repeat_node.attr().at("output_types"),
+                         repeat_node->attr().at("output_types")));
+}
+
+TEST(ShuffleAndRepeatFusionTest, NoChange) {
+  GrapplerItem item;
+  GraphDef *graph = &item.graph;
+
+  std::vector<std::pair<string, AttrValue>> common_attrs(2);
+  AttrValue shapes_attr;
+  SetAttrValue("output_shapes", &shapes_attr);
+  common_attrs[0] = std::make_pair("output_shapes", shapes_attr);
+  AttrValue types_attr;
+  SetAttrValue("output_types", &types_attr);
+  common_attrs[1] = std::make_pair("output_types", types_attr);
+
+  NodeDef *start_node;
+  TF_ASSERT_OK(graph_utils::AddScalarConstNode<int64>(0, graph, &start_node));
+  NodeDef *stop_node;
+  TF_ASSERT_OK(graph_utils::AddScalarConstNode<int64>(10, graph, &stop_node));
+  NodeDef *step_node;
+  TF_ASSERT_OK(graph_utils::AddScalarConstNode<int64>(1, graph, &step_node));
+
+  std::vector<string> range_inputs(3);
+  range_inputs[0] = start_node->name();
+  range_inputs[1] = stop_node->name();
+  range_inputs[2] = step_node->name();
+  NodeDef *range_node;
+  TF_ASSERT_OK(graph_utils::AddNode("", "RangeDataset", range_inputs,
+                                    common_attrs, graph, &range_node));
+
+  NodeDef *count_node;
+  TF_ASSERT_OK(graph_utils::AddScalarConstNode<int64>(-1, graph, &count_node));
+  std::vector<string> repeat_inputs(2);
+  repeat_inputs[0] = range_node->name();
+  repeat_inputs[1] = count_node->name();
+  NodeDef *repeat_node;
+  TF_ASSERT_OK(graph_utils::AddNode("", "RepeatDataset", repeat_inputs,
+                                    common_attrs, graph, &repeat_node));
+
+  ShuffleAndRepeatFusion optimizer;
+  GraphDef output;
+  TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
+
+  EXPECT_TRUE(graph_utils::Compare(*graph, output));
+}
+
+}  // namespace
+}  // namespace grappler
+}  // namespace tensorflow
-- 
GitLab


From 7bb79ee219d4efbd92d1ef4e0dbe45f4aee26654 Mon Sep 17 00:00:00 2001
From: Eugene Zhulenev <ezhulenev@google.com>
Date: Fri, 8 Jun 2018 12:46:39 -0700
Subject: [PATCH 501/610] Ask NumPy for read only array when converting it to
 Tensor.

Fix for: #17315

If the NumPy array is read-only, calling PyArray_FromAny with the
NPY_ARRAY_CARRAY flags introduces an extra memory copy, because those flags
request a writeable array.

Before:
  feed_cpu_variable_read_only: 5.6 GB/sec, min: 17.99, median: 19.54, mean: 19.76

After:
  feed_cpu_variable_read_only: 13.2 GB/sec, min: 7.60, median: 7.78, mean: 8.13
PiperOrigin-RevId: 199835695
---
 tensorflow/python/lib/core/ndarray_tensor.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/lib/core/ndarray_tensor.cc b/tensorflow/python/lib/core/ndarray_tensor.cc
index 2acab92764..ec1ba7b8f7 100644
--- a/tensorflow/python/lib/core/ndarray_tensor.cc
+++ b/tensorflow/python/lib/core/ndarray_tensor.cc
@@ -411,7 +411,7 @@ Status PyArrayToTF_Tensor(PyObject* ndarray, Safe_TF_TensorPtr* out_tensor) {
 
   // Make sure we dereference this array object in case of error, etc.
   Safe_PyObjectPtr array_safe(make_safe(
-      PyArray_FromAny(ndarray, nullptr, 0, 0, NPY_ARRAY_CARRAY, nullptr)));
+      PyArray_FromAny(ndarray, nullptr, 0, 0, NPY_ARRAY_CARRAY_RO, nullptr)));
   if (!array_safe) return errors::InvalidArgument("Not a ndarray.");
   PyArrayObject* array = reinterpret_cast<PyArrayObject*>(array_safe.get());
 
-- 
GitLab


From 278fbe4146b160980fec318187546d9d8870d244 Mon Sep 17 00:00:00 2001
From: Mark Heffernan <meheff@google.com>
Date: Fri, 8 Jun 2018 12:50:16 -0700
Subject: [PATCH 502/610] Add kGenerateToken HLO instruction. The new HLO
 instruction serves two purposes. (1) It generates a new token value. This is
 the only way to create tokens. (2) The operation is variadic, taking zero or
 more token operands. The operation acts as a join of its operands.

I initially considered using a kConstant instruction as the way to create new tokens, but this ran into problems because of expectations in backends regarding constants and their materialization.

This CL enables creation of generate-token instructions, but the new instruction is not supported yet in any backend.

PiperOrigin-RevId: 199836205
---
 .../compiler/xla/service/dfs_hlo_visitor.h    |   2 +
 .../service/dfs_hlo_visitor_with_default.h    |   3 +
 .../compiler/xla/service/hlo_cost_analysis.cc |   4 +
 .../compiler/xla/service/hlo_cost_analysis.h  |   1 +
 .../compiler/xla/service/hlo_evaluator.cc     |   8 ++
 .../compiler/xla/service/hlo_evaluator.h      |   2 +
 .../compiler/xla/service/hlo_graph_dumper.cc  |   1 +
 .../compiler/xla/service/hlo_instruction.cc   |  17 +++
 .../compiler/xla/service/hlo_instruction.h    |   5 +
 tensorflow/compiler/xla/service/hlo_opcode.h  |   1 +
 .../compiler/xla/service/hlo_opcode_test.cc   |   1 +
 tensorflow/compiler/xla/service/hlo_parser.cc |   8 ++
 .../compiler/xla/service/hlo_verifier.cc      |  50 +++++++
 .../compiler/xla/service/hlo_verifier.h       |   1 +
 .../xla/service/instruction_fusion.cc         |   1 +
 .../compiler/xla/service/shape_inference.cc   |  11 ++
 .../compiler/xla/service/shape_inference.h    |   7 +
 tensorflow/compiler/xla/tests/BUILD           |  16 +++
 .../compiler/xla/tests/token_hlo_test.cc      | 124 ++++++++++++++++++
 19 files changed, 263 insertions(+)
 create mode 100644 tensorflow/compiler/xla/tests/token_hlo_test.cc

diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
index 64678d9d74..ee2b455730 100644
--- a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
+++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
@@ -243,6 +243,8 @@ class DfsHloVisitorBase {
 
   virtual Status HandleBatchNormGrad(HloInstructionPtr hlo) = 0;
 
+  virtual Status HandleGenerateToken(HloInstructionPtr token) = 0;
+
   // Invoked to inform the visitor that the traversal has completed, and that
   // the root was "root".
   virtual Status FinishVisit(HloInstructionPtr root) = 0;
diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h
index 240faebe62..6934e00a4b 100644
--- a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h
+++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h
@@ -188,6 +188,9 @@ class DfsHloVisitorWithDefaultBase
   Status HandleGather(HloInstructionPtr gather) override {
     return DefaultAction(gather);
   }
+  Status HandleGenerateToken(HloInstructionPtr token) override {
+    return DefaultAction(token);
+  }
 
   // Invoked to inform the visitor that the traversal has completed, and that
   // the root was "root".
diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
index b9d30ee802..92a66681a9 100644
--- a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
+++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
@@ -387,6 +387,10 @@ Status HloCostAnalysis::HandleTranspose(const HloInstruction*) {
   return Status::OK();
 }
 
+Status HloCostAnalysis::HandleGenerateToken(const HloInstruction*) {
+  return Status::OK();
+}
+
 Status HloCostAnalysis::HandleConvolution(const HloInstruction* convolution) {
   auto lhs = convolution->operand(0);
   auto rhs = convolution->operand(1);
diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.h b/tensorflow/compiler/xla/service/hlo_cost_analysis.h
index d17678d20f..0d66736fe1 100644
--- a/tensorflow/compiler/xla/service/hlo_cost_analysis.h
+++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.h
@@ -97,6 +97,7 @@ class HloCostAnalysis : public ConstDfsHloVisitor {
   Status HandleBroadcast(const HloInstruction* broadcast) override;
   Status HandlePad(const HloInstruction* pad) override;
   Status HandleReshape(const HloInstruction* reshape) override;
+  Status HandleGenerateToken(const HloInstruction* token) override;
   Status HandleTranspose(const HloInstruction* transpose) override;
   Status HandleWhile(const HloInstruction* xla_while) override;
   Status HandleConditional(const HloInstruction* conditional) override;
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.cc b/tensorflow/compiler/xla/service/hlo_evaluator.cc
index 1e78d775c8..e0648e1467 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator.cc
+++ b/tensorflow/compiler/xla/service/hlo_evaluator.cc
@@ -910,6 +910,14 @@ Status HloEvaluator::HandleBroadcast(HloInstruction* broadcast) {
   return Status::OK();
 }
 
+Status HloEvaluator::HandleGenerateToken(HloInstruction* token) {
+  // Literals cannot represent a TOKEN shape so just create an empty tuple as
+  // the "result" of the kGenerateToken operation.
+  // TODO(b/109929053): Add support for TOKENs in Literals.
+  evaluated_[token] = Literal::MakeTuple({});
+  return Status::OK();
+}
+
 Status HloEvaluator::HandleGetTupleElement(HloInstruction* get_tuple_element) {
   const auto result_shape = get_tuple_element->shape();
   const int64 index = get_tuple_element->tuple_index();
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.h b/tensorflow/compiler/xla/service/hlo_evaluator.h
index b53d5644de..fc2fc9437b 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator.h
+++ b/tensorflow/compiler/xla/service/hlo_evaluator.h
@@ -174,6 +174,8 @@ class HloEvaluator : public DfsHloVisitorWithDefault {
 
   Status HandleBroadcast(HloInstruction* broadcast) override;
 
+  Status HandleGenerateToken(HloInstruction* token) override;
+
   // Returns the already-evaluated literal result for the instruction.
   // A Constant instruction is considered evaluated and its literal will be
   // returned directly without looking up the cache.
diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
index a6750460e5..cf954001c6 100644
--- a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
+++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
@@ -964,6 +964,7 @@ ColorScheme HloDotDumper::GetInstructionColor(const HloInstruction* instr) {
     case HloOpcode::kBitcast:
     case HloOpcode::kGetTupleElement:
     case HloOpcode::kTrace:
+    case HloOpcode::kGenerateToken:
     case HloOpcode::kTuple:
       return kWhite;
     case HloOpcode::kBroadcast:
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc
index ae230d2740..a778a6a965 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction.cc
@@ -583,6 +583,17 @@ HloInstruction::CreateCrossReplicaSum(
   return MakeUnique<HloReverseInstruction>(shape, operand, dimensions);
 }
 
+/* static */ std::unique_ptr<HloInstruction>
+HloInstruction::CreateGenerateToken(
+    tensorflow::gtl::ArraySlice<HloInstruction*> operands) {
+  auto instruction = WrapUnique(new HloInstruction(
+      HloOpcode::kGenerateToken, ShapeUtil::MakeTokenShape()));
+  for (auto operand : operands) {
+    instruction->AppendOperand(operand);
+  }
+  return instruction;
+}
+
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateWhile(
     const Shape& shape, HloComputation* condition, HloComputation* body,
     HloInstruction* init) {
@@ -1512,6 +1523,9 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
           CreateDomain(shape, new_operands[0], operand_side_metadata_->Clone(),
                        user_side_metadata_->Clone());
       break;
+    case HloOpcode::kGenerateToken:
+      clone = CreateGenerateToken(new_operands);
+      break;
     case HloOpcode::kTrace:
       LOG(FATAL) << "Not yet implemented, clone: " << HloOpcodeString(opcode_);
   }
@@ -1776,6 +1790,7 @@ bool HloInstruction::IdenticalSlowPath(
     case HloOpcode::kRng:
     case HloOpcode::kTrace:
     case HloOpcode::kWhile:
+    case HloOpcode::kGenerateToken:
       return false;
 
     case HloOpcode::kParameter:
@@ -2776,6 +2791,8 @@ Status HloInstruction::Visit(DfsHloVisitorBase<HloInstructionPtr>* visitor) {
       return visitor->HandleGather(this);
     case HloOpcode::kDomain:
       return visitor->HandleDomain(this);
+    case HloOpcode::kGenerateToken:
+      return visitor->HandleGenerateToken(this);
 
     // These opcodes are not handled here.
     case HloOpcode::kTrace:
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h
index cc4a8b8252..d252533eb2 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.h
+++ b/tensorflow/compiler/xla/service/hlo_instruction.h
@@ -664,6 +664,11 @@ class HloInstruction {
       const Shape& shape, HloInstruction* operand,
       tensorflow::gtl::ArraySlice<int64> dimensions);
 
+  // Creates a token instruction used for joining or creating token types which
+  // thread through side-effecting operations.
+  static std::unique_ptr<HloInstruction> CreateGenerateToken(
+      tensorflow::gtl::ArraySlice<HloInstruction*> operands);
+
   // Creates an instance of GatherDimensionNumbers.
   static GatherDimensionNumbers MakeGatherDimNumbers(
       tensorflow::gtl::ArraySlice<int64> output_window_dims,
diff --git a/tensorflow/compiler/xla/service/hlo_opcode.h b/tensorflow/compiler/xla/service/hlo_opcode.h
index 1fe06ee0c0..a35546f5f4 100644
--- a/tensorflow/compiler/xla/service/hlo_opcode.h
+++ b/tensorflow/compiler/xla/service/hlo_opcode.h
@@ -81,6 +81,7 @@ namespace xla {
   V(kFusion, "fusion", kHloOpcodeIsVariadic)                 \
   V(kGather, "gather")                                       \
   V(kGe, "greater-than-or-equal-to", kHloOpcodeIsComparison) \
+  V(kGenerateToken, "generate-token", kHloOpcodeIsVariadic)  \
   V(kGetTupleElement, "get-tuple-element")                   \
   V(kGt, "greater-than", kHloOpcodeIsComparison)             \
   V(kHostCompute, "host-compute")                            \
diff --git a/tensorflow/compiler/xla/service/hlo_opcode_test.cc b/tensorflow/compiler/xla/service/hlo_opcode_test.cc
index cd2ce5c69f..774345124b 100644
--- a/tensorflow/compiler/xla/service/hlo_opcode_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_opcode_test.cc
@@ -58,6 +58,7 @@ TEST(HloOpcodeTest, OpcodeProperties) {
       case HloOpcode::kConcatenate:
       case HloOpcode::kFusion:
       case HloOpcode::kMap:
+      case HloOpcode::kGenerateToken:
       case HloOpcode::kTuple:
         EXPECT_TRUE(HloOpcodeIsVariadic(opcode));
         break;
diff --git a/tensorflow/compiler/xla/service/hlo_parser.cc b/tensorflow/compiler/xla/service/hlo_parser.cc
index a1bc269400..bf1c7b9323 100644
--- a/tensorflow/compiler/xla/service/hlo_parser.cc
+++ b/tensorflow/compiler/xla/service/hlo_parser.cc
@@ -606,6 +606,14 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
           HloInstruction::CreateReshape(shape, operands[0]));
       break;
     }
+    case HloOpcode::kGenerateToken: {
+      if (!ParseOperands(&operands) || !ParseAttributes(attrs)) {
+        return false;
+      }
+      instruction = builder->AddInstruction(
+          HloInstruction::CreateGenerateToken(operands));
+      break;
+    }
     case HloOpcode::kTuple: {
       if (!ParseOperands(&operands) || !ParseAttributes(attrs)) {
         return false;
diff --git a/tensorflow/compiler/xla/service/hlo_verifier.cc b/tensorflow/compiler/xla/service/hlo_verifier.cc
index 9cfd8a9bf7..9034073cc8 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier.cc
+++ b/tensorflow/compiler/xla/service/hlo_verifier.cc
@@ -426,6 +426,14 @@ Status ShapeVerifier::HandleGather(HloInstruction* gather) {
           gather->gather_dimension_numbers(), gather->gather_window_bounds()));
 }
 
+Status ShapeVerifier::HandleGenerateToken(HloInstruction* token) {
+  std::vector<const Shape*> operand_shapes;
+  for (const HloInstruction* operand : token->operands()) {
+    operand_shapes.push_back(&operand->shape());
+  }
+  return CheckShape(token, ShapeInference::InferTokenShape(operand_shapes));
+}
+
 Status ShapeVerifier::CheckShape(const HloInstruction* instruction,
                                  const Shape& inferred_shape) {
   // If allow_mixed_precision_ is false, check if there are operands with
@@ -791,6 +799,46 @@ Status HloVerifier::CheckElementwiseInstruction(HloInstruction* instruction) {
   return Status::OK();
 }
 
+namespace {
+
+// Returns true if the given Shape has a TOKEN shape as any subshape.
+bool ShapeContainsToken(const Shape& shape) {
+  bool contains_token = false;
+  ShapeUtil::ForEachSubshape(
+      shape, [&contains_token](const Shape& subshape, const ShapeIndex&) {
+        if (ShapeUtil::IsToken(subshape)) {
+          contains_token = true;
+        }
+      });
+  return contains_token;
+}
+
+// Verifies that all types entering and exiting the entry computation are
+// legal. For example, TOKEN types have no Literal representation and cannot be
+// on the interface of the entry computation (parameters and root instruction).
+Status VerifyEntryAndExitShapes(const HloModule& module) {
+  for (int i = 0; i < module.entry_computation()->num_parameters(); ++i) {
+    HloInstruction* param =
+        module.entry_computation()->parameter_instruction(i);
+    if (ShapeContainsToken(param->shape())) {
+      return InternalError(
+          "Entry parameter %d is or contains a token shape: %s", i,
+          ShapeUtil::HumanString(param->shape()).c_str());
+    }
+  }
+  if (ShapeContainsToken(
+          module.entry_computation()->root_instruction()->shape())) {
+    return InternalError(
+        "Entry root is or contains a token shape: %s",
+        ShapeUtil::HumanString(
+            module.entry_computation()->root_instruction()->shape())
+            .c_str());
+  }
+  return Status::OK();
+}
+
+}  // namespace
+
 StatusOr<bool> HloVerifier::Run(HloModule* module) {
   TF_RETURN_IF_ERROR(VerifyHloStructure(module));
 
@@ -851,6 +899,8 @@ StatusOr<bool> HloVerifier::Run(HloModule* module) {
     TF_RETURN_IF_ERROR(computation->Accept(shape_verifier.get()));
   }
 
+  TF_RETURN_IF_ERROR(VerifyEntryAndExitShapes(*module));
+
   return false;
 }
 
diff --git a/tensorflow/compiler/xla/service/hlo_verifier.h b/tensorflow/compiler/xla/service/hlo_verifier.h
index 1392a78097..7283b3e7dc 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier.h
+++ b/tensorflow/compiler/xla/service/hlo_verifier.h
@@ -81,6 +81,7 @@ class ShapeVerifier : public DfsHloVisitor {
       HloInstruction* batch_norm_inference) override;
   Status HandleBatchNormGrad(HloInstruction* batch_norm_grad) override;
   Status HandleGather(HloInstruction* gather) override;
+  Status HandleGenerateToken(HloInstruction* token) override;
 
   Status FinishVisit(HloInstruction*) override { return Status::OK(); }
 
diff --git a/tensorflow/compiler/xla/service/instruction_fusion.cc b/tensorflow/compiler/xla/service/instruction_fusion.cc
index 429c850343..abedb4063d 100644
--- a/tensorflow/compiler/xla/service/instruction_fusion.cc
+++ b/tensorflow/compiler/xla/service/instruction_fusion.cc
@@ -96,6 +96,7 @@ bool IsAlwaysDuplicable(const HloInstruction& instruction) {
     case HloOpcode::kShiftRightLogical:
     case HloOpcode::kSlice:
     case HloOpcode::kSubtract:
+    case HloOpcode::kGenerateToken:
     case HloOpcode::kTranspose:
     case HloOpcode::kTuple:
       return false;
diff --git a/tensorflow/compiler/xla/service/shape_inference.cc b/tensorflow/compiler/xla/service/shape_inference.cc
index d624f548b1..fdc7f41759 100644
--- a/tensorflow/compiler/xla/service/shape_inference.cc
+++ b/tensorflow/compiler/xla/service/shape_inference.cc
@@ -463,6 +463,17 @@ StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
   return ShapeUtil::MakeShape(element_type, new_dimensions);
 }
 
+/* static */ StatusOr<Shape> ShapeInference::InferTokenShape(
+    tensorflow::gtl::ArraySlice<const Shape*> arg_shapes) {
+  for (const Shape* arg_shape : arg_shapes) {
+    if (arg_shape->element_type() != TOKEN) {
+      return InvalidArgument(
+          "Operands of token instructions must be TOKEN types.");
+    }
+  }
+  return ShapeUtil::MakeTokenShape();
+}
+
 /* static */ StatusOr<Shape> ShapeInference::InferConvertShape(
     const Shape& operand_shape, PrimitiveType new_element_type) {
   auto old_element_type = operand_shape.element_type();
diff --git a/tensorflow/compiler/xla/service/shape_inference.h b/tensorflow/compiler/xla/service/shape_inference.h
index 9da2c99b41..6100e2cd33 100644
--- a/tensorflow/compiler/xla/service/shape_inference.h
+++ b/tensorflow/compiler/xla/service/shape_inference.h
@@ -227,6 +227,13 @@ class ShapeInference {
   static StatusOr<Shape> InferConcatOpShape(
       tensorflow::gtl::ArraySlice<const Shape*> arg_shapes, int64 dimension);
 
+  // Infers the shape produced by a kGenerateToken operation. Trivially this
+  // shape is always a TOKEN shape. However, ShapeInference serves two purposes:
+  // inferring shapes and checking operand shapes. This method verifies that the
+  // operand shapes are all TOKENs.
+  static StatusOr<Shape> InferTokenShape(
+      tensorflow::gtl::ArraySlice<const Shape*> arg_shapes);
+
   // Helper that validates the given operand shape can be converted to the
   // target output_shape via a convert instruction -- the requirement is that
   // the shape is identical except for the element type.
diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD
index 7f6bbe6f87..e7e0a19db0 100644
--- a/tensorflow/compiler/xla/tests/BUILD
+++ b/tensorflow/compiler/xla/tests/BUILD
@@ -1203,6 +1203,22 @@ xla_test(
     ],
 )
 
+xla_test(
+    name = "token_hlo_test",
+    srcs = ["token_hlo_test.cc"],
+    tags = [
+        "enable_for_xla_interpreter",
+    ],
+    deps = [
+        ":client_library_test_base",
+        "//tensorflow/compiler/xla/service:hlo_verifier",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+    ],
+)
+
 xla_test(
     name = "call_test",
     srcs = ["call_test.cc"],
diff --git a/tensorflow/compiler/xla/tests/token_hlo_test.cc b/tensorflow/compiler/xla/tests/token_hlo_test.cc
new file mode 100644
index 0000000000..4585244ce8
--- /dev/null
+++ b/tensorflow/compiler/xla/tests/token_hlo_test.cc
@@ -0,0 +1,124 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <array>
+
+#include "tensorflow/compiler/xla/service/hlo_verifier.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/tests/test_macros.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace xla {
+namespace {
+
+class TokenHloTest : public HloTestBase {};
+
+// TODO(b/79770375): Compile, not just verify, the HLO module when the backends
+// support kGenerateToken.
+XLA_TEST_F(TokenHloTest, SingleTokenInstruction) {
+  std::unique_ptr<HloModule> module = CreateNewModule();
+  auto builder = HloComputation::Builder(TestName());
+  builder.AddInstruction(HloInstruction::CreateGenerateToken({}));
+  builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<int32>(42)));
+
+  module->AddEntryComputation(builder.Build());
+  EXPECT_IS_OK(HloVerifier().Run(module.get()).status());
+}
+
+XLA_TEST_F(TokenHloTest, TokenTree) {
+  std::unique_ptr<HloModule> module = CreateNewModule();
+  auto builder = HloComputation::Builder(TestName());
+  auto token0 = builder.AddInstruction(HloInstruction::CreateGenerateToken({}));
+  auto token1 = builder.AddInstruction(HloInstruction::CreateGenerateToken({}));
+  auto token2 = builder.AddInstruction(HloInstruction::CreateGenerateToken({}));
+  builder.AddInstruction(
+      HloInstruction::CreateGenerateToken({token0, token0, token1, token2}));
+  builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<int32>(42)));
+
+  module->AddEntryComputation(builder.Build());
+  EXPECT_IS_OK(HloVerifier().Run(module.get()).status());
+}
+
+XLA_TEST_F(TokenHloTest, InvalidTokenShapedEntryParameter) {
+  std::unique_ptr<HloModule> module = CreateNewModule();
+  auto builder = HloComputation::Builder(TestName());
+  builder.AddInstruction(
+      HloInstruction::CreateParameter(0, ShapeUtil::MakeShape(F32, {}), "p0"));
+  builder.AddInstruction(
+      HloInstruction::CreateParameter(1, ShapeUtil::MakeTokenShape(), "p1"));
+  builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<int32>(42)));
+  module->AddEntryComputation(builder.Build());
+
+  Status status = HloVerifier().Run(module.get()).status();
+  ASSERT_IS_NOT_OK(status);
+  EXPECT_THAT(
+      status.error_message(),
+      ::testing::HasSubstr("Entry parameter 1 is or contains a token shape"));
+}
+
+XLA_TEST_F(TokenHloTest, InvalidTupleTokenShapedEntryParameter) {
+  std::unique_ptr<HloModule> module = CreateNewModule();
+  auto builder = HloComputation::Builder(TestName());
+  builder.AddInstruction(HloInstruction::CreateParameter(
+      0,
+      ShapeUtil::MakeTupleShape(
+          {ShapeUtil::MakeShape(F32, {1, 2, 3}), ShapeUtil::MakeTokenShape()}),
+      "param"));
+  module->AddEntryComputation(builder.Build());
+
+  Status status = HloVerifier().Run(module.get()).status();
+  ASSERT_IS_NOT_OK(status);
+  EXPECT_THAT(
+      status.error_message(),
+      ::testing::HasSubstr("Entry parameter 0 is or contains a token shape"));
+}
+
+XLA_TEST_F(TokenHloTest, InvalidTokenRoot) {
+  std::unique_ptr<HloModule> module = CreateNewModule();
+  auto builder = HloComputation::Builder(TestName());
+  builder.AddInstruction(HloInstruction::CreateGenerateToken({}));
+  module->AddEntryComputation(builder.Build());
+
+  Status status = HloVerifier().Run(module.get()).status();
+  ASSERT_IS_NOT_OK(status);
+  EXPECT_THAT(status.error_message(),
+              ::testing::HasSubstr("Entry root is or contains a token shape"));
+}
+
+XLA_TEST_F(TokenHloTest, InvalidOperandToTokenInstruction) {
+  std::unique_ptr<HloModule> module = CreateNewModule();
+  auto builder = HloComputation::Builder(TestName());
+  auto param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, ShapeUtil::MakeShape(F32, {}), "p0"));
+  builder.AddInstruction(HloInstruction::CreateGenerateToken({param}));
+  builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<int32>(123)));
+  module->AddEntryComputation(builder.Build());
+
+  Status status = HloVerifier().Run(module.get()).status();
+  ASSERT_IS_NOT_OK(status);
+  EXPECT_THAT(status.error_message(),
+              ::testing::HasSubstr(
+                  "Operands of token instructions must be TOKEN types"));
+}
+
+}  // namespace
+}  // namespace xla
-- 
GitLab


From 9affc2080bf9840f4c7da2990ba528114e25d3b1 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 8 Jun 2018 12:51:11 -0700
Subject: [PATCH 503/610] Change gRPC include directory from "grpc++" to
 "grpcpp"

PiperOrigin-RevId: 199836336
---
 .../compiler/xla/rpc/grpc_client_test.cc       |  4 ++--
 tensorflow/compiler/xla/rpc/grpc_service.h     |  2 +-
 .../compiler/xla/rpc/grpc_service_main.cc      |  6 +++---
 .../tpu/profiler/capture_tpu_profile.cc        |  2 +-
 tensorflow/contrib/verbs/grpc_verbs_service.cc |  6 +++---
 .../contrib/verbs/grpc_verbs_service_impl.cc   | 16 ++++++++--------
 .../contrib/verbs/grpc_verbs_service_impl.h    | 16 ++++++++--------
 tensorflow/core/debug/debug_grpc_testlib.h     |  2 +-
 tensorflow/core/debug/debug_io_utils.cc        |  2 +-
 .../core/distributed_runtime/master_test.cc    |  2 +-
 .../rpc/eager/grpc_eager_client.cc             |  2 +-
 .../rpc/eager/grpc_eager_service.cc            | 16 ++++++++--------
 .../rpc/eager/grpc_eager_service.h             | 16 ++++++++--------
 .../rpc/eager/grpc_eager_service_impl.h        |  6 +++---
 .../core/distributed_runtime/rpc/grpc_call.h   |  6 +++---
 .../distributed_runtime/rpc/grpc_channel.cc    |  2 +-
 .../distributed_runtime/rpc/grpc_channel.h     |  2 +-
 .../rpc/grpc_client_cq_tag.h                   |  2 +-
 .../rpc/grpc_master_service.cc                 |  4 ++--
 .../rpc/grpc_master_service_impl.cc            | 16 ++++++++--------
 .../rpc/grpc_master_service_impl.h             | 16 ++++++++--------
 .../rpc/grpc_remote_worker.cc                  |  4 ++--
 .../distributed_runtime/rpc/grpc_server_lib.cc |  6 +++---
 .../distributed_runtime/rpc/grpc_server_lib.h  |  4 ++--
 .../core/distributed_runtime/rpc/grpc_state.h  |  4 ++--
 .../rpc/grpc_tensor_coding.cc                  |  4 ++--
 .../rpc/grpc_tensor_coding_test.cc             |  4 ++--
 .../rpc/grpc_tensorflow_server.cc              |  6 +++---
 .../rpc/grpc_testlib_server.cc                 |  6 +++---
 .../core/distributed_runtime/rpc/grpc_util.h   |  6 +++---
 .../rpc/grpc_worker_service.cc                 |  4 ++--
 .../rpc/grpc_worker_service_impl.cc            | 16 ++++++++--------
 .../rpc/grpc_worker_service_impl.h             | 18 +++++++++---------
 33 files changed, 114 insertions(+), 114 deletions(-)

diff --git a/tensorflow/compiler/xla/rpc/grpc_client_test.cc b/tensorflow/compiler/xla/rpc/grpc_client_test.cc
index 313f11a9a9..d7dd9786a2 100644
--- a/tensorflow/compiler/xla/rpc/grpc_client_test.cc
+++ b/tensorflow/compiler/xla/rpc/grpc_client_test.cc
@@ -20,8 +20,8 @@ limitations under the License.
 #include <memory>
 #include <vector>
 
-#include "grpc++/create_channel.h"
-#include "grpc++/security/credentials.h"
+#include "grpcpp/create_channel.h"
+#include "grpcpp/security/credentials.h"
 
 #include "tensorflow/compiler/xla/client/client.h"
 #include "tensorflow/compiler/xla/client/xla_client/xla_builder.h"
diff --git a/tensorflow/compiler/xla/rpc/grpc_service.h b/tensorflow/compiler/xla/rpc/grpc_service.h
index 5cd573167a..ca1b09b648 100644
--- a/tensorflow/compiler/xla/rpc/grpc_service.h
+++ b/tensorflow/compiler/xla/rpc/grpc_service.h
@@ -16,7 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_RPC_GRPC_SERVICE_H_
 #define TENSORFLOW_COMPILER_XLA_RPC_GRPC_SERVICE_H_
 
-#include "grpc++/server_context.h"
+#include "grpcpp/server_context.h"
 #include "tensorflow/compiler/xla/rpc/xla_service.grpc.pb.h"
 #include "tensorflow/compiler/xla/service/service.h"
 
diff --git a/tensorflow/compiler/xla/rpc/grpc_service_main.cc b/tensorflow/compiler/xla/rpc/grpc_service_main.cc
index e29908ccec..c68c857c30 100644
--- a/tensorflow/compiler/xla/rpc/grpc_service_main.cc
+++ b/tensorflow/compiler/xla/rpc/grpc_service_main.cc
@@ -15,9 +15,9 @@ limitations under the License.
 
 // Basic server binary that exposes a xla::Service through a GRPC interface
 // on a configurable port.
-#include "grpc++/security/server_credentials.h"
-#include "grpc++/server.h"
-#include "grpc++/server_builder.h"
+#include "grpcpp/security/server_credentials.h"
+#include "grpcpp/server.h"
+#include "grpcpp/server_builder.h"
 #include "tensorflow/compiler/xla/rpc/grpc_service.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/init_main.h"
diff --git a/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc b/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc
index 99485322c6..f80f5652af 100644
--- a/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc
+++ b/tensorflow/contrib/tpu/profiler/capture_tpu_profile.cc
@@ -18,7 +18,7 @@ limitations under the License.
 // Initiates a TPU profiling on the TPUProfiler service at service_addr,
 // receives and dumps the profile data to a tensorboard log directory.
 
-#include "grpc++/grpc++.h"
+#include "grpcpp/grpcpp.h"
 
 #include <cstdio>
 #include <ctime>
diff --git a/tensorflow/contrib/verbs/grpc_verbs_service.cc b/tensorflow/contrib/verbs/grpc_verbs_service.cc
index 742f946c95..af29abd91f 100644
--- a/tensorflow/contrib/verbs/grpc_verbs_service.cc
+++ b/tensorflow/contrib/verbs/grpc_verbs_service.cc
@@ -15,9 +15,9 @@ limitations under the License.
 
 #ifdef TENSORFLOW_USE_VERBS
 
-#include "grpc++/alarm.h"
-#include "grpc++/grpc++.h"
-#include "grpc++/server_builder.h"
+#include "grpcpp/alarm.h"
+#include "grpcpp/grpcpp.h"
+#include "grpcpp/server_builder.h"
 
 #include "tensorflow/contrib/verbs/grpc_verbs_service.h"
 #include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
diff --git a/tensorflow/contrib/verbs/grpc_verbs_service_impl.cc b/tensorflow/contrib/verbs/grpc_verbs_service_impl.cc
index 991f9a9d8b..4da7b59c69 100644
--- a/tensorflow/contrib/verbs/grpc_verbs_service_impl.cc
+++ b/tensorflow/contrib/verbs/grpc_verbs_service_impl.cc
@@ -15,14 +15,14 @@ limitations under the License.
 
 #include "tensorflow/contrib/verbs/grpc_verbs_service_impl.h"
 
-#include "grpc++/impl/codegen/async_stream.h"
-#include "grpc++/impl/codegen/async_unary_call.h"
-#include "grpc++/impl/codegen/channel_interface.h"
-#include "grpc++/impl/codegen/client_unary_call.h"
-#include "grpc++/impl/codegen/method_handler_impl.h"
-#include "grpc++/impl/codegen/rpc_service_method.h"
-#include "grpc++/impl/codegen/service_type.h"
-#include "grpc++/impl/codegen/sync_stream.h"
+#include "grpcpp/impl/codegen/async_stream.h"
+#include "grpcpp/impl/codegen/async_unary_call.h"
+#include "grpcpp/impl/codegen/channel_interface.h"
+#include "grpcpp/impl/codegen/client_unary_call.h"
+#include "grpcpp/impl/codegen/method_handler_impl.h"
+#include "grpcpp/impl/codegen/rpc_service_method.h"
+#include "grpcpp/impl/codegen/service_type.h"
+#include "grpcpp/impl/codegen/sync_stream.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/contrib/verbs/grpc_verbs_service_impl.h b/tensorflow/contrib/verbs/grpc_verbs_service_impl.h
index 1f0f10517e..abe5e08b07 100644
--- a/tensorflow/contrib/verbs/grpc_verbs_service_impl.h
+++ b/tensorflow/contrib/verbs/grpc_verbs_service_impl.h
@@ -16,14 +16,14 @@ limitations under the License.
 #ifndef TENSORFLOW_CONTRIB_GRPC_VERBS_SERVICE_IMPL_H_
 #define TENSORFLOW_CONTRIB_GRPC_VERBS_SERVICE_IMPL_H_
 
-#include "grpc++/impl/codegen/async_stream.h"
-#include "grpc++/impl/codegen/async_unary_call.h"
-#include "grpc++/impl/codegen/proto_utils.h"
-#include "grpc++/impl/codegen/rpc_method.h"
-#include "grpc++/impl/codegen/service_type.h"
-#include "grpc++/impl/codegen/status.h"
-#include "grpc++/impl/codegen/stub_options.h"
-#include "grpc++/impl/codegen/sync_stream.h"
+#include "grpcpp/impl/codegen/async_stream.h"
+#include "grpcpp/impl/codegen/async_unary_call.h"
+#include "grpcpp/impl/codegen/proto_utils.h"
+#include "grpcpp/impl/codegen/rpc_method.h"
+#include "grpcpp/impl/codegen/service_type.h"
+#include "grpcpp/impl/codegen/status.h"
+#include "grpcpp/impl/codegen/stub_options.h"
+#include "grpcpp/impl/codegen/sync_stream.h"
 
 #include "tensorflow/contrib/verbs/verbs_service.pb.h"
 
diff --git a/tensorflow/core/debug/debug_grpc_testlib.h b/tensorflow/core/debug/debug_grpc_testlib.h
index 58361bf78f..8d3c9ff575 100644
--- a/tensorflow/core/debug/debug_grpc_testlib.h
+++ b/tensorflow/core/debug/debug_grpc_testlib.h
@@ -19,7 +19,7 @@ limitations under the License.
 #include <atomic>
 #include <unordered_set>
 
-#include "grpc++/grpc++.h"
+#include "grpcpp/grpcpp.h"
 #include "tensorflow/core/debug/debug_io_utils.h"
 #include "tensorflow/core/debug/debug_service.grpc.pb.h"
 #include "tensorflow/core/framework/tensor.h"
diff --git a/tensorflow/core/debug/debug_io_utils.cc b/tensorflow/core/debug/debug_io_utils.cc
index 03a011f79e..9e8002d490 100644
--- a/tensorflow/core/debug/debug_io_utils.cc
+++ b/tensorflow/core/debug/debug_io_utils.cc
@@ -23,7 +23,7 @@ limitations under the License.
 #include <vector>
 
 #ifndef PLATFORM_WINDOWS
-#include "grpc++/create_channel.h"
+#include "grpcpp/create_channel.h"
 #else
 // winsock2.h is used in grpc, so Ws2_32.lib is needed
 #pragma comment(lib, "Ws2_32.lib")
diff --git a/tensorflow/core/distributed_runtime/master_test.cc b/tensorflow/core/distributed_runtime/master_test.cc
index 0826a90860..62b18a45b1 100644
--- a/tensorflow/core/distributed_runtime/master_test.cc
+++ b/tensorflow/core/distributed_runtime/master_test.cc
@@ -18,7 +18,7 @@ limitations under the License.
 #include <map>
 #include <memory>
 
-#include "grpc++/grpc++.h"
+#include "grpcpp/grpcpp.h"
 
 #include "tensorflow/core/distributed_runtime/rpc/grpc_channel.h"
 #include "tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.h"
diff --git a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.cc b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.cc
index 4786c43ee2..b23466037f 100644
--- a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.cc
+++ b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.cc
@@ -15,7 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.h"
 
-#include "grpc++/generic/generic_stub.h"
+#include "grpcpp/generic/generic_stub.h"
 #include "tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service.h"
 #include "tensorflow/core/distributed_runtime/rpc/grpc_client_cq_tag.h"
 #include "tensorflow/core/distributed_runtime/rpc/grpc_state.h"
diff --git a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service.cc b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service.cc
index 3fd7deaa86..39ab6856c5 100644
--- a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service.cc
+++ b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service.cc
@@ -15,14 +15,14 @@ limitations under the License.
 
 #include "tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service.h"
 
-#include "grpc++/impl/codegen/async_stream.h"
-#include "grpc++/impl/codegen/async_unary_call.h"
-#include "grpc++/impl/codegen/channel_interface.h"
-#include "grpc++/impl/codegen/client_unary_call.h"
-#include "grpc++/impl/codegen/method_handler_impl.h"
-#include "grpc++/impl/codegen/rpc_service_method.h"
-#include "grpc++/impl/codegen/service_type.h"
-#include "grpc++/impl/codegen/sync_stream.h"
+#include "grpcpp/impl/codegen/async_stream.h"
+#include "grpcpp/impl/codegen/async_unary_call.h"
+#include "grpcpp/impl/codegen/channel_interface.h"
+#include "grpcpp/impl/codegen/client_unary_call.h"
+#include "grpcpp/impl/codegen/method_handler_impl.h"
+#include "grpcpp/impl/codegen/rpc_service_method.h"
+#include "grpcpp/impl/codegen/service_type.h"
+#include "grpcpp/impl/codegen/sync_stream.h"
 
 namespace tensorflow {
 namespace eager {
diff --git a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service.h b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service.h
index d7b192ac85..66458186ad 100644
--- a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service.h
+++ b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service.h
@@ -16,14 +16,14 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_EAGER_GRPC_EAGER_SERVICE_H_
 #define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_EAGER_GRPC_EAGER_SERVICE_H_
 
-#include "grpc++/impl/codegen/async_stream.h"
-#include "grpc++/impl/codegen/async_unary_call.h"
-#include "grpc++/impl/codegen/proto_utils.h"
-#include "grpc++/impl/codegen/rpc_method.h"
-#include "grpc++/impl/codegen/service_type.h"
-#include "grpc++/impl/codegen/status.h"
-#include "grpc++/impl/codegen/stub_options.h"
-#include "grpc++/impl/codegen/sync_stream.h"
+#include "grpcpp/impl/codegen/async_stream.h"
+#include "grpcpp/impl/codegen/async_unary_call.h"
+#include "grpcpp/impl/codegen/proto_utils.h"
+#include "grpcpp/impl/codegen/rpc_method.h"
+#include "grpcpp/impl/codegen/service_type.h"
+#include "grpcpp/impl/codegen/status.h"
+#include "grpcpp/impl/codegen/stub_options.h"
+#include "grpcpp/impl/codegen/sync_stream.h"
 
 #include "tensorflow/core/protobuf/eager_service.pb.h"
 
diff --git a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service_impl.h b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service_impl.h
index 65550caf64..e94aedf535 100644
--- a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service_impl.h
+++ b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service_impl.h
@@ -16,9 +16,9 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_EAGER_GRPC_EAGER_SERVICE_IMPL_H_
 #define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_EAGER_GRPC_EAGER_SERVICE_IMPL_H_
 
-#include "grpc++/alarm.h"
-#include "grpc++/completion_queue.h"
-#include "grpc++/server_builder.h"
+#include "grpcpp/alarm.h"
+#include "grpcpp/completion_queue.h"
+#include "grpcpp/server_builder.h"
 #include "tensorflow/core/distributed_runtime/eager/eager_service_impl.h"
 #include "tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_service.h"
 #include "tensorflow/core/distributed_runtime/rpc/grpc_call.h"
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_call.h b/tensorflow/core/distributed_runtime/rpc/grpc_call.h
index ecad1274cc..90666def60 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_call.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_call.h
@@ -20,9 +20,9 @@ limitations under the License.
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/mutex.h"
 
-#include "grpc++/grpc++.h"
-#include "grpc++/impl/codegen/service_type.h"
-#include "grpc++/server_builder.h"
+#include "grpcpp/grpcpp.h"
+#include "grpcpp/impl/codegen/service_type.h"
+#include "grpcpp/server_builder.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_channel.cc b/tensorflow/core/distributed_runtime/rpc/grpc_channel.cc
index 613188244f..0ebc084cb6 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_channel.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_channel.cc
@@ -19,7 +19,7 @@ limitations under the License.
 #include <map>
 #include <unordered_map>
 
-#include "grpc++/create_channel.h"
+#include "grpcpp/create_channel.h"
 
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_channel.h b/tensorflow/core/distributed_runtime/rpc/grpc_channel.h
index 48b9d958aa..4861cdb691 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_channel.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_channel.h
@@ -22,7 +22,7 @@ limitations under the License.
 #include <string>
 #include <vector>
 
-#include "grpc++/grpc++.h"
+#include "grpcpp/grpcpp.h"
 
 #include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
 
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_client_cq_tag.h b/tensorflow/core/distributed_runtime/rpc/grpc_client_cq_tag.h
index d367b83ee7..6e7f5dbd13 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_client_cq_tag.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_client_cq_tag.h
@@ -16,7 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_CLIENT_CQ_TAG_H_
 #define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_CLIENT_CQ_TAG_H_
 
-#include "grpc++/grpc++.h"
+#include "grpcpp/grpcpp.h"
 
 #include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
 #include "tensorflow/core/lib/core/status.h"
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_master_service.cc b/tensorflow/core/distributed_runtime/rpc/grpc_master_service.cc
index e025e555dd..127dea2882 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_master_service.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_master_service.cc
@@ -30,8 +30,8 @@ limitations under the License.
 // RunGraph on workers.
 #include "tensorflow/core/distributed_runtime/rpc/grpc_master_service.h"
 
-#include "grpc++/alarm.h"
-#include "grpc++/server_builder.h"
+#include "grpcpp/alarm.h"
+#include "grpcpp/server_builder.h"
 
 #include "tensorflow/core/distributed_runtime/master.h"
 #include "tensorflow/core/distributed_runtime/rpc/async_service_interface.h"
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.cc b/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.cc
index c832adbbbf..1cea1b1462 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.cc
@@ -15,14 +15,14 @@ limitations under the License.
 
 #include "tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.h"
 
-#include "grpc++/impl/codegen/async_stream.h"
-#include "grpc++/impl/codegen/async_unary_call.h"
-#include "grpc++/impl/codegen/channel_interface.h"
-#include "grpc++/impl/codegen/client_unary_call.h"
-#include "grpc++/impl/codegen/method_handler_impl.h"
-#include "grpc++/impl/codegen/rpc_service_method.h"
-#include "grpc++/impl/codegen/service_type.h"
-#include "grpc++/impl/codegen/sync_stream.h"
+#include "grpcpp/impl/codegen/async_stream.h"
+#include "grpcpp/impl/codegen/async_unary_call.h"
+#include "grpcpp/impl/codegen/channel_interface.h"
+#include "grpcpp/impl/codegen/client_unary_call.h"
+#include "grpcpp/impl/codegen/method_handler_impl.h"
+#include "grpcpp/impl/codegen/rpc_service_method.h"
+#include "grpcpp/impl/codegen/service_type.h"
+#include "grpcpp/impl/codegen/sync_stream.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.h b/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.h
index 8f1b589698..751f2633e7 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_master_service_impl.h
@@ -16,14 +16,14 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_MASTER_SERVICE_IMPL_H_
 #define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_MASTER_SERVICE_IMPL_H_
 
-#include "grpc++/impl/codegen/async_stream.h"
-#include "grpc++/impl/codegen/async_unary_call.h"
-#include "grpc++/impl/codegen/proto_utils.h"
-#include "grpc++/impl/codegen/rpc_method.h"
-#include "grpc++/impl/codegen/service_type.h"
-#include "grpc++/impl/codegen/status.h"
-#include "grpc++/impl/codegen/stub_options.h"
-#include "grpc++/impl/codegen/sync_stream.h"
+#include "grpcpp/impl/codegen/async_stream.h"
+#include "grpcpp/impl/codegen/async_unary_call.h"
+#include "grpcpp/impl/codegen/proto_utils.h"
+#include "grpcpp/impl/codegen/rpc_method.h"
+#include "grpcpp/impl/codegen/service_type.h"
+#include "grpcpp/impl/codegen/status.h"
+#include "grpcpp/impl/codegen/stub_options.h"
+#include "grpcpp/impl/codegen/sync_stream.h"
 
 #include "tensorflow/core/protobuf/master.pb.h"
 
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc b/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc
index 1acf1fb4fc..6008462d04 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc
@@ -17,8 +17,8 @@ limitations under the License.
 
 #include <utility>
 
-#include "grpc++/generic/generic_stub.h"
-#include "grpc++/grpc++.h"
+#include "grpcpp/generic/generic_stub.h"
+#include "grpcpp/grpcpp.h"
 
 #include "tensorflow/core/common_runtime/process_util.h"
 #include "tensorflow/core/distributed_runtime/rpc/grpc_client_cq_tag.h"
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
index e5ffb4ed2f..c0a9b43bf4 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
@@ -19,10 +19,10 @@ limitations under the License.
 #include <limits>
 #include <memory>
 
-#include "grpc++/grpc++.h"
-#include "grpc++/security/credentials.h"
-#include "grpc++/server_builder.h"
 #include "grpc/support/alloc.h"
+#include "grpcpp/grpcpp.h"
+#include "grpcpp/security/credentials.h"
+#include "grpcpp/server_builder.h"
 
 #include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h
index 0122df178a..b1c2eda0cf 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h
@@ -18,8 +18,8 @@ limitations under the License.
 
 #include <memory>
 
-#include "grpc++/grpc++.h"
-#include "grpc++/security/credentials.h"
+#include "grpcpp/grpcpp.h"
+#include "grpcpp/security/credentials.h"
 
 #include "tensorflow/core/common_runtime/process_util.h"
 #include "tensorflow/core/common_runtime/stats_publisher_interface.h"
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_state.h b/tensorflow/core/distributed_runtime/rpc/grpc_state.h
index 59dbb7ae04..61c5bc285f 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_state.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_state.h
@@ -18,8 +18,8 @@ limitations under the License.
 
 #include <utility>
 
-#include "grpc++/generic/generic_stub.h"
-#include "grpc++/grpc++.h"
+#include "grpcpp/generic/generic_stub.h"
+#include "grpcpp/grpcpp.h"
 
 #include "tensorflow/core/distributed_runtime/call_options.h"
 #include "tensorflow/core/distributed_runtime/rpc/grpc_client_cq_tag.h"
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_tensor_coding.cc b/tensorflow/core/distributed_runtime/rpc/grpc_tensor_coding.cc
index e51894b4c7..d0684f1833 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_tensor_coding.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_tensor_coding.cc
@@ -14,8 +14,8 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/distributed_runtime/rpc/grpc_tensor_coding.h"
-#include "grpc++/support/byte_buffer.h"
-#include "grpc++/support/slice.h"
+#include "grpcpp/support/byte_buffer.h"
+#include "grpcpp/support/slice.h"
 #include "tensorflow/core/common_runtime/dma_helper.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor.pb.h"
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_tensor_coding_test.cc b/tensorflow/core/distributed_runtime/rpc/grpc_tensor_coding_test.cc
index 71f69e9024..7cace573e8 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_tensor_coding_test.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_tensor_coding_test.cc
@@ -15,8 +15,8 @@ limitations under the License.
 
 #include "tensorflow/core/distributed_runtime/rpc/grpc_tensor_coding.h"
 
-#include "grpc++/support/byte_buffer.h"
-#include "grpc++/support/slice.h"
+#include "grpcpp/support/byte_buffer.h"
+#include "grpcpp/support/slice.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_tensorflow_server.cc b/tensorflow/core/distributed_runtime/rpc/grpc_tensorflow_server.cc
index f247322bc4..e52b257411 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_tensorflow_server.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_tensorflow_server.cc
@@ -16,9 +16,9 @@ limitations under the License.
 #include <iostream>
 #include <vector>
 
-#include "grpc++/grpc++.h"
-#include "grpc++/security/credentials.h"
-#include "grpc++/server_builder.h"
+#include "grpcpp/grpcpp.h"
+#include "grpcpp/security/credentials.h"
+#include "grpcpp/server_builder.h"
 
 #include "tensorflow/core/distributed_runtime/server_lib.h"
 
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_testlib_server.cc b/tensorflow/core/distributed_runtime/rpc/grpc_testlib_server.cc
index e718db251c..33cbadda0a 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_testlib_server.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_testlib_server.cc
@@ -15,9 +15,9 @@ limitations under the License.
 
 #include <vector>
 
-#include "grpc++/grpc++.h"
-#include "grpc++/security/credentials.h"
-#include "grpc++/server_builder.h"
+#include "grpcpp/grpcpp.h"
+#include "grpcpp/security/credentials.h"
+#include "grpcpp/server_builder.h"
 
 #include "tensorflow/core/distributed_runtime/server_lib.h"
 
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_util.h b/tensorflow/core/distributed_runtime/rpc/grpc_util.h
index 4b58781b54..45259aa2ec 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_util.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_util.h
@@ -18,9 +18,9 @@ limitations under the License.
 
 #include <memory>
 
-#include "grpc++/grpc++.h"
-#include "grpc++/impl/codegen/proto_utils.h"
-#include "grpc++/support/byte_buffer.h"
+#include "grpcpp/grpcpp.h"
+#include "grpcpp/impl/codegen/proto_utils.h"
+#include "grpcpp/support/byte_buffer.h"
 #include "tensorflow/core/distributed_runtime/tensor_coding.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc
index aa9304a033..61f5369617 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service.cc
@@ -17,8 +17,8 @@ limitations under the License.
 
 #include <deque>
 
-#include "grpc++/alarm.h"
-#include "grpc++/server_builder.h"
+#include "grpcpp/alarm.h"
+#include "grpcpp/server_builder.h"
 
 #include "tensorflow/core/common_runtime/buf_rendezvous.h"
 #include "tensorflow/core/common_runtime/device.h"
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.cc b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.cc
index 38cc2b81d3..72b5e77f1c 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.cc
@@ -15,14 +15,14 @@ limitations under the License.
 
 #include "tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h"
 
-#include "grpc++/impl/codegen/async_stream.h"
-#include "grpc++/impl/codegen/async_unary_call.h"
-#include "grpc++/impl/codegen/channel_interface.h"
-#include "grpc++/impl/codegen/client_unary_call.h"
-#include "grpc++/impl/codegen/method_handler_impl.h"
-#include "grpc++/impl/codegen/rpc_service_method.h"
-#include "grpc++/impl/codegen/service_type.h"
-#include "grpc++/impl/codegen/sync_stream.h"
+#include "grpcpp/impl/codegen/async_stream.h"
+#include "grpcpp/impl/codegen/async_unary_call.h"
+#include "grpcpp/impl/codegen/channel_interface.h"
+#include "grpcpp/impl/codegen/client_unary_call.h"
+#include "grpcpp/impl/codegen/method_handler_impl.h"
+#include "grpcpp/impl/codegen/rpc_service_method.h"
+#include "grpcpp/impl/codegen/service_type.h"
+#include "grpcpp/impl/codegen/sync_stream.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h
index da270835bd..7915c3aafd 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_service_impl.h
@@ -16,15 +16,15 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_WORKER_SERVICE_IMPL_H_
 #define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_GRPC_WORKER_SERVICE_IMPL_H_
 
-#include "grpc++/impl/codegen/async_stream.h"
-#include "grpc++/impl/codegen/async_unary_call.h"
-#include "grpc++/impl/codegen/proto_utils.h"
-#include "grpc++/impl/codegen/rpc_method.h"
-#include "grpc++/impl/codegen/service_type.h"
-#include "grpc++/impl/codegen/status.h"
-#include "grpc++/impl/codegen/stub_options.h"
-#include "grpc++/impl/codegen/sync_stream.h"
-#include "grpc++/support/byte_buffer.h"
+#include "grpcpp/impl/codegen/async_stream.h"
+#include "grpcpp/impl/codegen/async_unary_call.h"
+#include "grpcpp/impl/codegen/proto_utils.h"
+#include "grpcpp/impl/codegen/rpc_method.h"
+#include "grpcpp/impl/codegen/service_type.h"
+#include "grpcpp/impl/codegen/status.h"
+#include "grpcpp/impl/codegen/stub_options.h"
+#include "grpcpp/impl/codegen/sync_stream.h"
+#include "grpcpp/support/byte_buffer.h"
 
 #include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
 #include "tensorflow/core/distributed_runtime/tensor_coding.h"
-- 
GitLab


From 3b81d6e6055c529c00a165fd8e3431a6ba704e8e Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 8 Jun 2018 13:14:59 -0700
Subject: [PATCH 504/610] Optimizing transpose_conv.

PiperOrigin-RevId: 199839745
---
 .../kernels/internal/optimized/optimized_ops.h |  8 ++++----
 .../kernels/internal/reference/reference_ops.h |  8 ++++----
 .../contrib/lite/kernels/transpose_conv.cc     |  2 +-
 .../lite/kernels/transpose_conv_test.cc        | 18 +++++++++---------
 .../contrib/lite/toco/export_tensorflow.cc     |  2 +-
 .../propagate_fixed_sizes.cc                   |  2 +-
 .../contrib/lite/toco/import_tensorflow.cc     |  8 +++++---
 tensorflow/contrib/lite/toco/model.h           |  1 +
 tensorflow/contrib/lite/toco/tooling_util.cc   | 15 +++++++--------
 9 files changed, 33 insertions(+), 31 deletions(-)

diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
index 0ce781db59..d2bee2cd70 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
@@ -6289,8 +6289,8 @@ inline void TransposeConv(const float* input_data, const Dims<4>& input_dims,
   // To optimize, start by using the conv code with transposed weights for the
   // case of stride_height = stride_width = 1.
   const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int input_depth = MatchingArraySize(input_dims, 0, filter_dims, 3);
-  const int output_depth = MatchingArraySize(filter_dims, 0, output_dims, 0);
+  const int input_depth = MatchingArraySize(input_dims, 0, filter_dims, 0);
+  const int output_depth = MatchingArraySize(filter_dims, 3, output_dims, 0);
   const int input_height = ArraySize(input_dims, 2);
   const int input_width = ArraySize(input_dims, 1);
   const int filter_height = ArraySize(filter_dims, 2);
@@ -6337,8 +6337,8 @@ inline void TransposeConv(const float* input_data, const Dims<4>& input_dims,
                   float input_value = input_data[Offset(input_dims, in_channel,
                                                         in_x, in_y, batch)];
                   float filter_value =
-                      filter_data[Offset(filter_dims, out_channel, filter_x,
-                                         filter_y, in_channel)];
+                      filter_data[Offset(filter_dims, in_channel, filter_x,
+                                         filter_y, out_channel)];
                   output_data[Offset(output_dims, out_channel, out_x, out_y,
                                      batch)] += input_value * filter_value;
                 }
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
index 0b644a1fa6..c3f645bdf1 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
@@ -3810,8 +3810,8 @@ inline void TransposeConv(const float* input_data, const Dims<4>& input_dims,
                           int pad_height, float* output_data,
                           const Dims<4>& output_dims) {
   const int batches = MatchingArraySize(input_dims, 3, output_dims, 3);
-  const int input_depth = MatchingArraySize(input_dims, 0, filter_dims, 3);
-  const int output_depth = MatchingArraySize(filter_dims, 0, output_dims, 0);
+  const int input_depth = MatchingArraySize(input_dims, 0, filter_dims, 0);
+  const int output_depth = MatchingArraySize(filter_dims, 3, output_dims, 0);
   const int input_height = ArraySize(input_dims, 2);
   const int input_width = ArraySize(input_dims, 1);
   const int filter_height = ArraySize(filter_dims, 2);
@@ -3851,8 +3851,8 @@ inline void TransposeConv(const float* input_data, const Dims<4>& input_dims,
                   float input_value = input_data[Offset(input_dims, in_channel,
                                                         in_x, in_y, batch)];
                   float filter_value =
-                      filter_data[Offset(filter_dims, out_channel, filter_x,
-                                         filter_y, in_channel)];
+                      filter_data[Offset(filter_dims, in_channel, filter_x,
+                                         filter_y, out_channel)];
                   output_data[Offset(output_dims, out_channel, out_x, out_y,
                                      batch)] += input_value * filter_value;
                 }
diff --git a/tensorflow/contrib/lite/kernels/transpose_conv.cc b/tensorflow/contrib/lite/kernels/transpose_conv.cc
index 3c99661029..e83b1ec987 100644
--- a/tensorflow/contrib/lite/kernels/transpose_conv.cc
+++ b/tensorflow/contrib/lite/kernels/transpose_conv.cc
@@ -79,7 +79,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   // Ensure that weights and inputs have the same channel dimension.
   // Note: TOCO will reorder weights in the following format: OHWI.
   TF_LITE_ENSURE_EQ(context, SizeOfDimension(input, 3),
-                    SizeOfDimension(weights, 0));
+                    SizeOfDimension(weights, 3));
 
   if (!IsConstantTensor(output_shape)) {
     SetTensorToDynamic(output);
diff --git a/tensorflow/contrib/lite/kernels/transpose_conv_test.cc b/tensorflow/contrib/lite/kernels/transpose_conv_test.cc
index 52be089349..55df897180 100644
--- a/tensorflow/contrib/lite/kernels/transpose_conv_test.cc
+++ b/tensorflow/contrib/lite/kernels/transpose_conv_test.cc
@@ -88,10 +88,10 @@ TEST(TransposeConvOpModelTest, SimpleTest) {
 // And filter value is derived by:
 // filter = tf.reshape(tf.transpose(filter, perm=[3, 0, 1, 2]), shape=[18, 1])
 TEST(TransposeConvOpModelTest, TwoFiltersTest) {
-  TransposeConvOpModel m({1, 4, 4, 2}, {2, 3, 3, 1}, Padding_SAME, 1, 1);
+  TransposeConvOpModel m({1, 4, 4, 2}, {1, 3, 3, 2}, Padding_SAME, 1, 1);
   m.PopulateTensor<int>(m.output_shape(), {1, 4, 4, 1});
-  m.PopulateTensor<float>(m.filter(), {1, 3, 5, 7, 9, 11, 13, 15, 17, 2, 4, 6,
-                                       8, 10, 12, 14, 16, 18});
+  m.PopulateTensor<float>(m.filter(), {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
+                                       13, 14, 15, 16, 17, 18});
   m.PopulateTensor<float>(
       m.input(),
       {1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 16,
@@ -117,10 +117,10 @@ TEST(TransposeConvOpModelTest, TwoFiltersTest) {
 // And filter value is derived by:
 // filter = tf.reshape(tf.transpose(filter, perm=[3, 0, 1, 2]), shape=[1, 18])
 TEST(TransposeConvOpModelTest, PaddingValidTest) {
-  TransposeConvOpModel m({1, 4, 4, 2}, {2, 3, 3, 1}, Padding_VALID, 1, 1);
+  TransposeConvOpModel m({1, 4, 4, 2}, {1, 3, 3, 2}, Padding_VALID, 1, 1);
   m.PopulateTensor<int>(m.output_shape(), {1, 6, 6, 1});
-  m.PopulateTensor<float>(m.filter(), {1, 3, 5, 7, 9, 11, 13, 15, 17, 2, 4, 6,
-                                       8, 10, 12, 14, 16, 18});
+  m.PopulateTensor<float>(m.filter(), {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
+                                       13, 14, 15, 16, 17, 18});
   m.PopulateTensor<float>(
       m.input(),
       {1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 16,
@@ -171,10 +171,10 @@ TEST(TransposeConvOpModelTest, StrideValidTest) {
 //     [1, 2, 2, 1 ],
 //     "VALID")
 TEST(TransposeConvOpModelTest, MultiChannelTest) {
-  TransposeConvOpModel m({1, 2, 2, 1}, {1, 3, 3, 2}, Padding_VALID, 2, 2);
+  TransposeConvOpModel m({1, 2, 2, 1}, {2, 3, 3, 1}, Padding_VALID, 2, 2);
   m.PopulateTensor<int>(m.output_shape(), {1, 5, 5, 2});
-  m.PopulateTensor<float>(m.filter(), {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
-                                       13, 14, 15, 16, 17, 18});
+  m.PopulateTensor<float>(m.filter(), {1, 3, 5, 7, 9, 11, 13, 15, 17, 2, 4, 6,
+                                       8, 10, 12, 14, 16, 18});
   m.PopulateTensor<float>(m.input(), {1, 2, 3, 4});
   m.Invoke();
 
diff --git a/tensorflow/contrib/lite/toco/export_tensorflow.cc b/tensorflow/contrib/lite/toco/export_tensorflow.cc
index 76ce1c5802..c7c80ab21c 100644
--- a/tensorflow/contrib/lite/toco/export_tensorflow.cc
+++ b/tensorflow/contrib/lite/toco/export_tensorflow.cc
@@ -494,7 +494,7 @@ void ConvertTransposeConvOperator(const Model& model,
   const auto& weights_array = model.GetArray(weights_array_name);
   CHECK(weights_array.buffer->type == ArrayDataType::kFloat);
   ConvertFloatTensorConst(model, weights_array_name, AxesOrder::kOHWI,
-                          AxesOrder::kHWIO, tensorflow_graph);
+                          AxesOrder::kHWOI, tensorflow_graph);
   auto& strides = (*conv2d_op->mutable_attr())["strides"];
   strides.mutable_list()->add_i(1);
   strides.mutable_list()->add_i(src_op.stride_height);
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
index 9e4262223e..170a499d4e 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/propagate_fixed_sizes.cc
@@ -278,7 +278,7 @@ void ProcessTransposeConvOperator(Model* model, TransposeConvOperator* op) {
       << "TransposeConv input shape must have 4 dimensions. Input \""
       << op->inputs[TransposeConvOperator::WEIGHTS] << "\" had shape "
       << toco::ShapeToString(weights_shape) << ".";
-  CHECK_EQ(input_shape.dims(3), weights_shape.dims(0))
+  CHECK_EQ(input_shape.dims(3), weights_shape.dims(3))
       << "Input shape depth and weight depth do not agree";
 
   // Set the output shape according to the specified output shape.
diff --git a/tensorflow/contrib/lite/toco/import_tensorflow.cc b/tensorflow/contrib/lite/toco/import_tensorflow.cc
index 8dd43dda3e..a2241c85a7 100644
--- a/tensorflow/contrib/lite/toco/import_tensorflow.cc
+++ b/tensorflow/contrib/lite/toco/import_tensorflow.cc
@@ -1445,11 +1445,13 @@ void ConvertTransposeConvOperator(const NodeDef& node,
   if (existing_transpose) {
     CHECK(existing_transpose->type == OperatorType::kTranspose);
   } else {
-    // Transpose weights from HWIO order to OHWI order, which is more efficient
-    // for computation
+    // Transpose weights from HWOI order to OHWI order, which is more efficient
+    // for computation. (Note that TensorFlow considers the order as HWIO
+    // because they consider this a backward conv, inverting the sense of
+    // input/output.)
     TransposeOperator* transpose = new TransposeOperator;
     string perm_array = CreateConstArray<ArrayDataType::kInt32>(
-        model, node.name() + "_transpose_perm", {3, 0, 1, 2});
+        model, node.name() + "_transpose_perm", {2, 0, 1, 3});
     transpose->inputs = {weights_name, perm_array};
     transpose->outputs = {transposed_weights_name};
     model->operators.emplace_back(transpose);
diff --git a/tensorflow/contrib/lite/toco/model.h b/tensorflow/contrib/lite/toco/model.h
index 81beb29372..2ec36d27ef 100644
--- a/tensorflow/contrib/lite/toco/model.h
+++ b/tensorflow/contrib/lite/toco/model.h
@@ -155,6 +155,7 @@ enum class AxesOrder {
   k1HWO,     // Our standard for DepthwiseConv weights
   kHWIM,     // TensorFlow DepthwiseConv weights
   kNHWC,     // TensorFlow activations
+  kHWOI,     // TensorFlow back-prop conv weights
 };
 
 // The type of the scalars in an array.
diff --git a/tensorflow/contrib/lite/toco/tooling_util.cc b/tensorflow/contrib/lite/toco/tooling_util.cc
index 5a82be3939..810718f610 100644
--- a/tensorflow/contrib/lite/toco/tooling_util.cc
+++ b/tensorflow/contrib/lite/toco/tooling_util.cc
@@ -1865,18 +1865,15 @@ void GetShuffleShape(AxesOrder input_axes_order, AxesOrder output_axes_order,
              output_axes_order == AxesOrder::kHWIO) {
     // 3210 <- 3210
     // HWIO <- OHWI
-    (*shuffle)[0] = 1;
-    (*shuffle)[1] = 2;
-    (*shuffle)[2] = 3;
-    (*shuffle)[3] = 0;
+    *shuffle = {1, 2, 3, 0};
   } else if (input_axes_order == AxesOrder::kHWIO &&
              output_axes_order == AxesOrder::kOHWI) {
     // 3210 <- 3210
     // OHWI <- HWIO
-    (*shuffle)[0] = 3;
-    (*shuffle)[1] = 0;
-    (*shuffle)[2] = 1;
-    (*shuffle)[3] = 2;
+    *shuffle = {3, 0, 1, 2};
+  } else if (input_axes_order == AxesOrder::kOHWI &&
+             output_axes_order == AxesOrder::kHWOI) {
+    *shuffle = {1, 2, 0, 3};
   } else {
     LOG(FATAL) << "Bad shuffle";
   }
@@ -2022,6 +2019,8 @@ int AxesCount(AxesOrder axes_order) {
       return 4;
     case AxesOrder::kNHWC:
       return 4;
+    case AxesOrder::kHWOI:
+      return 4;
     default:
       LOG(FATAL) << "Bad AxesOrder";
       return 0;
-- 
GitLab


From 70266a65f7fb1d58196eff5355f16d62aba64310 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 8 Jun 2018 13:58:59 -0700
Subject: [PATCH 505/610] Avoid compilation of nodes that forward tensor refs.

PiperOrigin-RevId: 199846447
---
 tensorflow/compiler/jit/BUILD                 |  1 +
 .../compiler/jit/mark_for_compilation_pass.cc | 22 ++-----------------
 tensorflow/compiler/jit/xla_cluster_util.cc   | 22 +++++++++++++++++++
 tensorflow/compiler/jit/xla_cluster_util.h    |  3 +++
 .../compiler/jit/xla_fusion_optimizer.cc      |  7 ++++++
 5 files changed, 35 insertions(+), 20 deletions(-)

diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD
index ab8cd8f4bc..e2b614d91b 100644
--- a/tensorflow/compiler/jit/BUILD
+++ b/tensorflow/compiler/jit/BUILD
@@ -342,6 +342,7 @@ cc_library(
         "//tensorflow/compiler/jit/graphcycles",
         "//tensorflow/core:framework",
         "//tensorflow/core:graph",
+        "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/kernels:bounds_check",
     ],
 )
diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.cc b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
index 74468266b9..8c3882116d 100644
--- a/tensorflow/compiler/jit/mark_for_compilation_pass.cc
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
@@ -44,12 +44,6 @@ namespace tensorflow {
 
 namespace {
 
-// Returns true if, when executed in TensorFlow, `node` is guaranteed to forward
-// a ref tensor input to its output.
-static bool AlwaysForwardsRefInput(const Node& node) {
-  return node.IsIdentity();
-}
-
 bool HasXLAKernel(const Node& node, const DeviceType& jit_device_type) {
   // There is a SymbolicGradient kernel on the XLA_JIT device, but the gradient
   // is really a kind of function call and will be handled by
@@ -68,20 +62,8 @@ bool HasXLAKernel(const Node& node, const DeviceType& jit_device_type) {
   // XLA does not offer guaranteed aliasing between the input and output of the
   // XLA cluster so it can't implement the forward-tensor-ref semantic.  Leave
   // such nodes out of XLA clusters.
-  if (AlwaysForwardsRefInput(node)) {
-    for (const Edge* incoming_edge : node.in_edges()) {
-      if (incoming_edge->IsControlEdge()) {
-        continue;
-      }
-
-      Node* incoming_node = incoming_edge->src();
-      if (IsRefType(incoming_node->output_type(incoming_edge->src_output()))) {
-        VLOG(2) << "Not clustering " << node.def().ShortDebugString()
-                << " because of ref input " << incoming_node->name() << " "
-                << incoming_node->type_string();
-        return false;
-      }
-    }
+  if (HasForwardedRefInput(node)) {
+    return false;
   }
 
   return FindKernelDef(jit_device_type, node.def(), nullptr, nullptr).ok();
diff --git a/tensorflow/compiler/jit/xla_cluster_util.cc b/tensorflow/compiler/jit/xla_cluster_util.cc
index 70bd10336b..05b7821b88 100644
--- a/tensorflow/compiler/jit/xla_cluster_util.cc
+++ b/tensorflow/compiler/jit/xla_cluster_util.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include <unordered_map>
 
+#include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/graph/control_flow.h"
 #include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/util/device_name_utils.h"
@@ -66,6 +67,9 @@ string DescribeCycle(const GraphCycles* cycles, const Graph& graph, int src,
   }
   return description;
 }
+
+bool AlwaysForwardsRefInput(const Node& node) { return node.IsIdentity(); }
+
 }  // namespace
 
 Status DeviceToDeviceType(const string& device, DeviceType* device_type) {
@@ -77,6 +81,24 @@ Status DeviceToDeviceType(const string& device, DeviceType* device_type) {
   return Status::OK();
 }
 
+bool HasForwardedRefInput(const Node& node) {
+  if (AlwaysForwardsRefInput(node)) {
+    for (const Edge* incoming_edge : node.in_edges()) {
+      if (incoming_edge->IsControlEdge()) {
+        continue;
+      }
+
+      Node* incoming_node = incoming_edge->src();
+      if (IsRefType(incoming_node->output_type(incoming_edge->src_output()))) {
+        VLOG(2) << "Node " << node.def().ShortDebugString() << " has ref input "
+                << incoming_node->name() << " " << incoming_node->type_string();
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
 Status CreateCycleDetectionGraph(const Graph* graph, GraphCycles* cycles) {
   for (int i = 0; i < graph->num_node_ids(); ++i) {
     // We rely on the node IDs in the cycle detection graph being consecutive
diff --git a/tensorflow/compiler/jit/xla_cluster_util.h b/tensorflow/compiler/jit/xla_cluster_util.h
index 5b673bdc27..bcce082aaf 100644
--- a/tensorflow/compiler/jit/xla_cluster_util.h
+++ b/tensorflow/compiler/jit/xla_cluster_util.h
@@ -36,6 +36,9 @@ using OrderedNodeSet = std::set<Node*, NodeComparatorID>;
 // Returns the DeviceType corresponding to 'device'.
 Status DeviceToDeviceType(const string& device, DeviceType* device_type);
 
+// Returns true if `node` has a ref tensor input that it forwards to its output.
+bool HasForwardedRefInput(const Node& node);
+
 // Creates a graph representation to enable cycle detection when clustering.
 // This representation handles loops in graph by disconnecting each loop from
 // the enclosing graph.
diff --git a/tensorflow/compiler/jit/xla_fusion_optimizer.cc b/tensorflow/compiler/jit/xla_fusion_optimizer.cc
index 96016521ea..74257b09a8 100644
--- a/tensorflow/compiler/jit/xla_fusion_optimizer.cc
+++ b/tensorflow/compiler/jit/xla_fusion_optimizer.cc
@@ -178,6 +178,13 @@ Status XlaFusionOptimizer::Optimize(grappler::Cluster* cluster,
       continue;
     }
 
+    // XLA does not offer guaranteed aliasing between the input and output of
+    // the XLA cluster so it can't implement the forward-tensor-ref semantic.
+    // Leave such nodes out of XLA clusters.
+    if (HasForwardedRefInput(*node)) {
+      continue;
+    }
+
     compilation_candidates.insert(node);
   }
 
-- 
GitLab


From 77f0772c0ead3e1402615022649aad2a721265fd Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 8 Jun 2018 14:14:49 -0700
Subject: [PATCH 506/610] Bugfix for dilated_conv optimizations. We were
 failing to create im2col arrays for dilated unstrided 1x1 cases.

PiperOrigin-RevId: 199849200
---
 tensorflow/contrib/lite/build_def.bzl                        | 2 +-
 tensorflow/contrib/lite/kernels/conv.cc                      | 4 +++-
 .../lite/toco/graph_transformations/create_im2col_arrays.cc  | 5 +++--
 3 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/tensorflow/contrib/lite/build_def.bzl b/tensorflow/contrib/lite/build_def.bzl
index 30bb604d17..612813caee 100644
--- a/tensorflow/contrib/lite/build_def.bzl
+++ b/tensorflow/contrib/lite/build_def.bzl
@@ -201,7 +201,7 @@ def generated_test_models():
         "concat",
         "constant",
         "control_dep",
-        # "conv",
+        "conv",
         "depthwiseconv",
         "div",
         "equal",
diff --git a/tensorflow/contrib/lite/kernels/conv.cc b/tensorflow/contrib/lite/kernels/conv.cc
index ee42e5cdc8..747c8a62c0 100644
--- a/tensorflow/contrib/lite/kernels/conv.cc
+++ b/tensorflow/contrib/lite/kernels/conv.cc
@@ -134,7 +134,9 @@ static TfLiteStatus AllocateTemporaryTensorsIfRequired(TfLiteContext* context,
   // optimized_ops.h, in order to avoid a DCHECK(!im2col_data).
   data->need_im2col =
       (params->stride_width != 1 || params->stride_height != 1 ||
-       filter_width != 1 || filter_height != 1);
+       params->dilation_width_factor != 1 ||
+       params->dilation_height_factor != 1 || filter_width != 1 ||
+       filter_height != 1);
   // If we're using the optimized multithreaded EigenTensor implementation of
   // convolution, it expects the filter weights to be transposed compared to
   // the normal TF Lite buffer format. Typical TF Lite weights are
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/create_im2col_arrays.cc b/tensorflow/contrib/lite/toco/graph_transformations/create_im2col_arrays.cc
index 076415ece8..8ca2cd66ac 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/create_im2col_arrays.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/create_im2col_arrays.cc
@@ -46,8 +46,9 @@ bool CreateIm2colArrays::Run(Model* model, std::size_t op_index) {
   const int kheight = weights_shape.dims(1);
   const int kwidth = weights_shape.dims(2);
   if (kwidth == 1 && kheight == 1 && conv_op->stride_width == 1 &&
-      conv_op->stride_height == 1) {
-    // 1x1 unstrided conv does not need an im2col array.
+      conv_op->stride_height == 1 && conv_op->dilation_width_factor == 1 &&
+      conv_op->dilation_height_factor == 1) {
+    // 1x1 unstrided undilated conv does not need an im2col array.
     return false;
   }
 
-- 
GitLab


From bc65583b2b4e3f48b6a724832ef96ab176666d33 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 8 Jun 2018 14:58:43 -0700
Subject: [PATCH 507/610] Allow large allocations in toco.

PiperOrigin-RevId: 199855838
---
 tensorflow/contrib/lite/toco/model.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/lite/toco/model.h b/tensorflow/contrib/lite/toco/model.h
index 2ec36d27ef..2f43adb07b 100644
--- a/tensorflow/contrib/lite/toco/model.h
+++ b/tensorflow/contrib/lite/toco/model.h
@@ -1644,8 +1644,8 @@ struct SparseToDenseOperator : Operator {
 // be used for the transient array at hand. The 'start' and 'end' values are
 // offsets from the start of the workspace buffer, expressed in bytes.
 struct Alloc {
-  int start = 0;
-  int end = 0;
+  int64 start = 0;
+  int64 end = 0;
 };
 
 inline bool operator<(const Alloc& a, const Alloc& b) {
-- 
GitLab


From db717a72c20ab37974ec9076c8e406345c8776be Mon Sep 17 00:00:00 2001
From: AG Ramesh <ag.ramesh@intel.com>
Date: Fri, 8 Jun 2018 15:06:47 -0700
Subject: [PATCH 508/610] [INTEL MKL] Enable compilation of TF without MKL ML
 dependency. Closes #19808.

PiperOrigin-RevId: 199857219
---
 .../xla/service/cpu/runtime_matmul_mkl.cc     |  2 +-
 .../core/common_runtime/mkl_cpu_allocator.h   |  6 ++-
 .../core/kernels/batch_matmul_op_complex.cc   |  2 +-
 .../core/kernels/batch_matmul_op_real.cc      |  2 +-
 tensorflow/core/kernels/matmul_op.cc          |  3 +-
 tensorflow/core/kernels/mkl_aggregate_ops.cc  | 11 +++--
 .../core/kernels/mkl_batch_matmul_op.cc       |  2 +-
 tensorflow/core/kernels/mkl_concat_op.cc      |  7 +--
 .../core/kernels/mkl_conv_grad_bias_ops.cc    |  2 +
 .../core/kernels/mkl_conv_grad_filter_ops.cc  |  8 ++--
 .../core/kernels/mkl_conv_grad_input_ops.cc   |  2 +
 .../core/kernels/mkl_fused_batch_norm_op.cc   |  8 ++--
 tensorflow/core/kernels/mkl_identity_op.cc    |  2 +
 .../core/kernels/mkl_input_conversion_op.cc   |  6 +--
 tensorflow/core/kernels/mkl_lrn_op.cc         | 10 ++--
 tensorflow/core/kernels/mkl_matmul_op.cc      |  2 +-
 tensorflow/core/kernels/mkl_relu_op.cc        |  7 +--
 tensorflow/core/kernels/mkl_reshape_op.cc     | 10 ++--
 tensorflow/core/kernels/mkl_softmax_op.cc     |  2 -
 tensorflow/core/kernels/mkl_tfconv_op.h       |  2 +
 tensorflow/core/kernels/mkl_transpose_op.cc   |  2 +-
 tensorflow/core/kernels/transpose_op.cc       |  2 +-
 tensorflow/core/kernels/transpose_op.h        |  4 +-
 tensorflow/core/util/mkl_util.h               | 47 +++++++++++++------
 24 files changed, 94 insertions(+), 57 deletions(-)

diff --git a/tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.cc b/tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.cc
index 92da5f71c2..f8c8dd5e93 100644
--- a/tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.cc
+++ b/tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifdef INTEL_MKL
+#if defined(INTEL_MKL) && !defined(DO_NOT_USE_ML)
 #include "tensorflow/compiler/xla/service/cpu/runtime_matmul_mkl.h"
 #include "third_party/intel_mkl_ml/include/mkl_cblas.h"
 #include "third_party/intel_mkl_ml/include/mkl_service.h"
diff --git a/tensorflow/core/common_runtime/mkl_cpu_allocator.h b/tensorflow/core/common_runtime/mkl_cpu_allocator.h
index 245320c896..29f702699f 100644
--- a/tensorflow/core/common_runtime/mkl_cpu_allocator.h
+++ b/tensorflow/core/common_runtime/mkl_cpu_allocator.h
@@ -29,7 +29,9 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/mem.h"
 
+#ifndef DO_NOT_USE_ML
 #include "i_malloc.h"
+#endif
 
 #ifdef _WIN32
 typedef unsigned int uint;
@@ -97,14 +99,14 @@ class MklCPUAllocator : public VisitableAllocator {
     VLOG(1) << "MklCPUAllocator: Setting max_mem_bytes: " << max_mem_bytes;
     allocator_ = new BFCAllocator(new MklSubAllocator, max_mem_bytes,
                                   kAllowGrowth, kName);
-
+#ifndef DO_NOT_USE_ML
     // For redirecting all allocations from MKL to this allocator
     // From: http://software.intel.com/en-us/node/528565
     i_malloc = MallocHook;
     i_calloc = CallocHook;
     i_realloc = ReallocHook;
     i_free = FreeHook;
-
+#endif
     return Status::OK();
   }
 
diff --git a/tensorflow/core/kernels/batch_matmul_op_complex.cc b/tensorflow/core/kernels/batch_matmul_op_complex.cc
index 96216764fd..b77c80c01f 100644
--- a/tensorflow/core/kernels/batch_matmul_op_complex.cc
+++ b/tensorflow/core/kernels/batch_matmul_op_complex.cc
@@ -17,7 +17,7 @@ limitations under the License.
 
 namespace tensorflow {
 
-#if !defined(INTEL_MKL)
+#if !defined(INTEL_MKL) || defined(DO_NOT_USE_ML)
 TF_CALL_complex64(REGISTER_BATCH_MATMUL_CPU);
 TF_CALL_complex128(REGISTER_BATCH_MATMUL_CPU);
 #endif
diff --git a/tensorflow/core/kernels/batch_matmul_op_real.cc b/tensorflow/core/kernels/batch_matmul_op_real.cc
index 87a0795f2f..fe259c1634 100644
--- a/tensorflow/core/kernels/batch_matmul_op_real.cc
+++ b/tensorflow/core/kernels/batch_matmul_op_real.cc
@@ -21,7 +21,7 @@ limitations under the License.
 
 namespace tensorflow {
 
-#if !defined(INTEL_MKL)
+#if !defined(INTEL_MKL) || defined(DO_NOT_USE_ML)
 TF_CALL_float(REGISTER_BATCH_MATMUL_CPU);
 TF_CALL_double(REGISTER_BATCH_MATMUL_CPU);
 #endif
diff --git a/tensorflow/core/kernels/matmul_op.cc b/tensorflow/core/kernels/matmul_op.cc
index f9c15ce6d7..fc3b3d3445 100644
--- a/tensorflow/core/kernels/matmul_op.cc
+++ b/tensorflow/core/kernels/matmul_op.cc
@@ -551,7 +551,8 @@ struct MatMulFunctor<SYCLDevice, T> {
                               .Label("cublas"),                    \
                           MatMulOp<GPUDevice, T, true /* cublas */>)
 
-#if defined(INTEL_MKL)
+#if defined(INTEL_MKL) && !defined(DO_NOT_USE_ML)
+
 // MKL does not support half and int32 types for matrix-multiplication, so
 // register the kernel to use default Eigen based implementations for these
 // types. Registration for NO-LABEL version is in mkl_matmul_op.cc
diff --git a/tensorflow/core/kernels/mkl_aggregate_ops.cc b/tensorflow/core/kernels/mkl_aggregate_ops.cc
index b539b00009..4ad858e4a9 100644
--- a/tensorflow/core/kernels/mkl_aggregate_ops.cc
+++ b/tensorflow/core/kernels/mkl_aggregate_ops.cc
@@ -24,15 +24,16 @@ limitations under the License.
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
 #include "tensorflow/core/platform/logging.h"
 
-#include "mkl_dnn.h"
-#include "mkl_dnn_types.h"
-#include "tensorflow/core/util/mkl_util.h"
 
 #ifndef INTEL_MKL_ML
 #include "mkldnn.hpp"
 using mkldnn::stream;
 using mkldnn::sum;
+#else
+#include "mkl_dnn.h"
+#include "mkl_dnn_types.h"
 #endif
+#include "tensorflow/core/util/mkl_util.h"
 
 namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
@@ -333,7 +334,7 @@ class MklAddNOp : public OpKernel {
 
       if (!input1_in_mkl_format && src1_dims_size == 0) {
         Tensor* dst_tensor = nullptr;
-        MklShape mkl_shape_dst;
+        MklDnnShape mkl_shape_dst;
         mkl_shape_dst.SetMklTensor(false);
         AllocateOutputSetMklShape(ctx, output_idx, &dst_tensor,
                                   src1_tensor.shape(), mkl_shape_dst);
@@ -347,7 +348,7 @@ class MklAddNOp : public OpKernel {
       if (!input1_in_mkl_format && !input2_in_mkl_format) {
         if (src1_tensor.shape().num_elements() == 0) {
           Tensor* dst_tensor = nullptr;
-          MklShape mkl_shape_dst;
+          MklDnnShape mkl_shape_dst;
           mkl_shape_dst.SetMklTensor(false);
           AllocateOutputSetMklShape(ctx, output_idx, &dst_tensor,
                                     src1_tensor.shape(), mkl_shape_dst);
diff --git a/tensorflow/core/kernels/mkl_batch_matmul_op.cc b/tensorflow/core/kernels/mkl_batch_matmul_op.cc
index 723b445a75..45328b03d6 100644
--- a/tensorflow/core/kernels/mkl_batch_matmul_op.cc
+++ b/tensorflow/core/kernels/mkl_batch_matmul_op.cc
@@ -25,7 +25,7 @@ limitations under the License.
 
 #define EIGEN_USE_THREADS
 
-#if defined(INTEL_MKL)
+#if defined(INTEL_MKL) && !defined(DO_NOT_USE_ML)
 #include <vector>
 #include "mkl_cblas.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
diff --git a/tensorflow/core/kernels/mkl_concat_op.cc b/tensorflow/core/kernels/mkl_concat_op.cc
index 9ab95d765c..5eeb23d810 100644
--- a/tensorflow/core/kernels/mkl_concat_op.cc
+++ b/tensorflow/core/kernels/mkl_concat_op.cc
@@ -26,16 +26,17 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/types.h"
 
-#include "mkl_dnn.h"
-#include "mkl_dnn_types.h"
-#include "tensorflow/core/util/mkl_util.h"
 
 #ifndef INTEL_MKL_ML
 #include "mkldnn.hpp"
 
 using mkldnn::concat;
 using mkldnn::stream;
+#else
+#include "mkl_dnn.h"
+#include "mkl_dnn_types.h"
 #endif
+#include "tensorflow/core/util/mkl_util.h"
 
 namespace tensorflow {
 typedef Eigen::ThreadPoolDevice CPUDevice;
diff --git a/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc
index d23027a54d..c1da0ded1d 100644
--- a/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_grad_bias_ops.cc
@@ -38,8 +38,10 @@ limitations under the License.
 #include "tensorflow/core/util/use_cudnn.h"
 #include "tensorflow/core/util/work_sharder.h"
 
+#ifdef INTEL_MKL_ML
 #include "mkl_dnn.h"
 #include "mkl_dnn_types.h"
+#endif
 #include "tensorflow/core/util/mkl_util.h"
 
 namespace tensorflow {
diff --git a/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
index e0706568b1..356eed8b67 100644
--- a/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_grad_filter_ops.cc
@@ -38,9 +38,6 @@ limitations under the License.
 #include "tensorflow/core/util/use_cudnn.h"
 #include "tensorflow/core/util/work_sharder.h"
 
-#include "mkl_dnn.h"
-#include "mkl_dnn_types.h"
-#include "tensorflow/core/util/mkl_util.h"
 
 #ifndef INTEL_MKL_ML
 #include "mkldnn.hpp"
@@ -49,8 +46,13 @@ using mkldnn::convolution_backward_weights;
 using mkldnn::memory;
 using mkldnn::prop_kind;
 using mkldnn::stream;
+#else
+#include "mkl_dnn.h"
+#include "mkl_dnn_types.h"
 #endif
 
+#include "tensorflow/core/util/mkl_util.h"
+
 namespace tensorflow {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
diff --git a/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc b/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc
index d203c04934..21b18f9119 100644
--- a/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_grad_input_ops.cc
@@ -23,8 +23,10 @@ limitations under the License.
 #define EIGEN_USE_THREADS
 #include <algorithm>
 #include <vector>
+#ifdef INTEL_MKL_ML
 #include "mkl_dnn.h"
 #include "mkl_dnn_types.h"
+#endif
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
diff --git a/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc b/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc
index 62aafa7930..3fe660cf96 100644
--- a/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc
+++ b/tensorflow/core/kernels/mkl_fused_batch_norm_op.cc
@@ -21,21 +21,21 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor_types.h"
 #include "tensorflow/core/util/tensor_format.h"
 
-#include "mkl_dnn.h"
-#include "mkl_dnn_types.h"
-#include "tensorflow/core/util/mkl_util.h"
 
 #ifndef INTEL_MKL_ML
 #include "mkldnn.hpp"
-
 using mkldnn::batch_normalization_backward;
 using mkldnn::batch_normalization_forward;
 using mkldnn::prop_kind;
 using mkldnn::stream;
 using mkldnn::use_global_stats;
 using mkldnn::use_scale_shift;
+#else
+#include "mkl_dnn.h"
+#include "mkl_dnn_types.h"
 #endif
 
+#include "tensorflow/core/util/mkl_util.h"
 // TODO(inteltf) Address comments from PR 8968.
 
 namespace tensorflow {
diff --git a/tensorflow/core/kernels/mkl_identity_op.cc b/tensorflow/core/kernels/mkl_identity_op.cc
index 6c027f8e72..b02cc5384c 100644
--- a/tensorflow/core/kernels/mkl_identity_op.cc
+++ b/tensorflow/core/kernels/mkl_identity_op.cc
@@ -24,8 +24,10 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/logging.h"
 
+#ifdef INTEL_MKL_ML
 #include "mkl_dnn.h"
 #include "mkl_dnn_types.h"
+#endif
 #include "tensorflow/core/util/mkl_util.h"
 
 #ifndef INTEL_MKL_ML
diff --git a/tensorflow/core/kernels/mkl_input_conversion_op.cc b/tensorflow/core/kernels/mkl_input_conversion_op.cc
index 663228722b..dc4da33a06 100644
--- a/tensorflow/core/kernels/mkl_input_conversion_op.cc
+++ b/tensorflow/core/kernels/mkl_input_conversion_op.cc
@@ -369,8 +369,8 @@ class MklInputConversionOp : public OpKernel {
       MklToTfOp<Device, T>::ConvertMklToTf(this, context, data_format_str,
                                            op_data_type, has_avx512f_,
                                            kInputIndex_1);
-      SetDummyMklShapeOutput(context, kInputIndex_0);
-      SetDummyMklShapeOutput(context, kInputIndex_1);
+      SetDummyMklDnnShapeOutput(context, kInputIndex_0);
+      SetDummyMklDnnShapeOutput(context, kInputIndex_1);
       return;
     }
 
@@ -458,7 +458,7 @@ class MklInputConversionOp : public OpKernel {
       MklToTfOp<Device, T>::ConvertMklToTf(this, context, data_format_str,
                                            op_data_type, has_avx512f_,
                                            mkl_tensor_index);
-      SetDummyMklShapeOutput(context, mkl_tensor_index);
+      SetDummyMklDnnShapeOutput(context, mkl_tensor_index);
 
       // The tensor in TF format passes through
       ForwardTfTensorInToOut(context, tf_tensor_index, tf_tensor_index);
diff --git a/tensorflow/core/kernels/mkl_lrn_op.cc b/tensorflow/core/kernels/mkl_lrn_op.cc
index eef254cdad..dfe50e6a7f 100644
--- a/tensorflow/core/kernels/mkl_lrn_op.cc
+++ b/tensorflow/core/kernels/mkl_lrn_op.cc
@@ -22,8 +22,6 @@ limitations under the License.
 
 #define EIGEN_USE_THREADS
 #include <vector>
-#include "mkl_dnn.h"
-#include "mkl_dnn_types.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
@@ -31,7 +29,6 @@ limitations under the License.
 #include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/util/mkl_util.h"
 #include "tensorflow/core/util/tensor_format.h"
 
 #if !defined(IS_MOBILE_PLATFORM)
@@ -45,8 +42,13 @@ using mkldnn::lrn_backward;
 using mkldnn::lrn_forward;
 using mkldnn::prop_kind;
 using mkldnn::stream;
+#else
+#include "mkl_dnn.h"
+#include "mkl_dnn_types.h"
 #endif
 
+#include "tensorflow/core/util/mkl_util.h"
+
 namespace tensorflow {
 
 namespace {
@@ -1236,7 +1238,7 @@ class MklLRNGradOp : public OpKernel {
     auto activations = orig_output_tensor.shaped<T, 2>({nodes * batch, depth});
 
     Tensor* output_dnn_data;
-    MklShape mkl_output_mkl_shape;
+    MklDnnShape mkl_output_mkl_shape;
     mkl_output_mkl_shape.SetMklTensor(false);
     mkl_output_mkl_shape.SetDimensions(4);
     AllocateOutputSetMklShape(context, kIdxOutput, &output_dnn_data,
diff --git a/tensorflow/core/kernels/mkl_matmul_op.cc b/tensorflow/core/kernels/mkl_matmul_op.cc
index dfa6cecc9b..62c0404891 100644
--- a/tensorflow/core/kernels/mkl_matmul_op.cc
+++ b/tensorflow/core/kernels/mkl_matmul_op.cc
@@ -23,7 +23,7 @@ limitations under the License.
 // and when it is undefined at build time, this file becomes an empty
 // compilation unit
 
-#if defined(INTEL_MKL)
+#if defined(INTEL_MKL) && !defined(DO_NOT_USE_ML)
 
 #include "mkl_cblas.h"
 #include "tensorflow/core/framework/op.h"
diff --git a/tensorflow/core/kernels/mkl_relu_op.cc b/tensorflow/core/kernels/mkl_relu_op.cc
index 1ed43834dd..78abbdb730 100644
--- a/tensorflow/core/kernels/mkl_relu_op.cc
+++ b/tensorflow/core/kernels/mkl_relu_op.cc
@@ -23,9 +23,6 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/lib/core/errors.h"
 
-#include "mkl_dnn.h"
-#include "mkl_dnn_types.h"
-#include "tensorflow/core/util/mkl_util.h"
 
 #ifndef INTEL_MKL_ML
 #include "mkldnn.hpp"
@@ -38,7 +35,11 @@ using mkldnn::prop_kind;
 using mkldnn::relu_backward;
 using mkldnn::relu_forward;
 using mkldnn::stream;
+#else
+#include "mkl_dnn.h"
+#include "mkl_dnn_types.h"
 #endif
+#include "tensorflow/core/util/mkl_util.h"
 
 namespace tensorflow {
 
diff --git a/tensorflow/core/kernels/mkl_reshape_op.cc b/tensorflow/core/kernels/mkl_reshape_op.cc
index 2cfde1f6fd..c44a6f3477 100644
--- a/tensorflow/core/kernels/mkl_reshape_op.cc
+++ b/tensorflow/core/kernels/mkl_reshape_op.cc
@@ -24,15 +24,17 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/platform/logging.h"
 
-#include "mkl_dnn.h"
-#include "mkl_dnn_types.h"
-#include "tensorflow/core/util/mkl_util.h"
 
 #ifndef INTEL_MKL_ML
 #include "mkldnn.hpp"
 using mkldnn::stream;
+#else
+#include "mkl_dnn.h"
+#include "mkl_dnn_types.h"
 #endif
 
+#include "tensorflow/core/util/mkl_util.h"
+
 namespace tensorflow {
 using CPUDevice = Eigen::ThreadPoolDevice;
 template <typename Device, typename T>
@@ -250,7 +252,7 @@ class MklReshapeOp : public OpKernel {
                 memory::primitive_desc(output_tf_md, cpu_engine);
 
             Tensor* output_tensor = nullptr;
-            MklShape mkl_shape_output;
+            MklDnnShape mkl_shape_output;
             mkl_shape_output.SetMklTensor(false);
             // We allocate output tensor in the shape expected by Reshape.
             AllocateOutputSetMklShape(context, kOutputSlotIdx, &output_tensor,
diff --git a/tensorflow/core/kernels/mkl_softmax_op.cc b/tensorflow/core/kernels/mkl_softmax_op.cc
index f79e18cff2..638392954e 100644
--- a/tensorflow/core/kernels/mkl_softmax_op.cc
+++ b/tensorflow/core/kernels/mkl_softmax_op.cc
@@ -25,8 +25,6 @@ limitations under the License.
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/util/tensor_format.h"
 
-#include "mkldnn.h"
-#include "mkldnn_types.h"
 #include "tensorflow/core/util/mkl_util.h"
 
 #include "mkldnn.hpp"
diff --git a/tensorflow/core/kernels/mkl_tfconv_op.h b/tensorflow/core/kernels/mkl_tfconv_op.h
index 4120f013ac..7e8ed1b1d6 100644
--- a/tensorflow/core/kernels/mkl_tfconv_op.h
+++ b/tensorflow/core/kernels/mkl_tfconv_op.h
@@ -32,8 +32,10 @@ limitations under the License.
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/util/tensor_format.h"
 
+#ifdef INTEL_MKL_ML
 #include "mkl_dnn.h"
 #include "mkl_dnn_types.h"
+#endif
 #include "tensorflow/core/util/mkl_util.h"
 
 #ifndef INTEL_MKL_ML
diff --git a/tensorflow/core/kernels/mkl_transpose_op.cc b/tensorflow/core/kernels/mkl_transpose_op.cc
index 3f07b317c4..b180c2ff20 100644
--- a/tensorflow/core/kernels/mkl_transpose_op.cc
+++ b/tensorflow/core/kernels/mkl_transpose_op.cc
@@ -15,7 +15,7 @@ limitations under the License.
 
 // See docs in ../ops/array_ops.cc.
 
-#ifdef INTEL_MKL
+#if defined(INTEL_MKL) && !defined(DO_NOT_USE_ML)
 #define EIGEN_USE_THREADS
 
 #include "mkl_trans.h"
diff --git a/tensorflow/core/kernels/transpose_op.cc b/tensorflow/core/kernels/transpose_op.cc
index 7177ad7888..886b3e7492 100644
--- a/tensorflow/core/kernels/transpose_op.cc
+++ b/tensorflow/core/kernels/transpose_op.cc
@@ -218,7 +218,7 @@ Status ConjugateTransposeCpuOp::DoTranspose(OpKernelContext* ctx,
                                             perm, out);
 }
 
-#ifdef INTEL_MKL
+#if defined(INTEL_MKL) && !defined(DO_NOT_USE_ML)
 #define REGISTER(T)                                   \
   REGISTER_KERNEL_BUILDER(Name("Transpose")           \
                               .Device(DEVICE_CPU)     \
diff --git a/tensorflow/core/kernels/transpose_op.h b/tensorflow/core/kernels/transpose_op.h
index ae67592d04..709b0a92e9 100644
--- a/tensorflow/core/kernels/transpose_op.h
+++ b/tensorflow/core/kernels/transpose_op.h
@@ -42,7 +42,7 @@ class TransposeCpuOp : public TransposeOp {
                      gtl::ArraySlice<int32> perm, Tensor* out) override;
 };
 
-#ifdef INTEL_MKL
+#if defined(INTEL_MKL) && !defined(DO_NOT_USE_ML)
 class MklTransposeCpuOp : public TransposeOp {
  public:
   explicit MklTransposeCpuOp(OpKernelConstruction* ctx) : TransposeOp(ctx) {}
@@ -85,7 +85,7 @@ class ConjugateTransposeCpuOp : public TransposeOp {
   bool IsConjugate() const override { return true; }
 };
 
-#ifdef INTEL_MKL
+#if defined(INTEL_MKL) && !defined(DO_NOT_USE_ML)
 class MklConjugateTransposeCpuOp : public TransposeOp {
  public:
   explicit MklConjugateTransposeCpuOp(OpKernelConstruction* ctx)
diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h
index 8a3ece7b8c..dffc965b14 100644
--- a/tensorflow/core/util/mkl_util.h
+++ b/tensorflow/core/util/mkl_util.h
@@ -22,10 +22,13 @@ limitations under the License.
 #include <unordered_map>
 #include <utility>
 
+#ifdef INTEL_MKL_ML
 #include "mkl_dnn.h"
 #include "mkl_dnn_types.h"
 #include "mkl_service.h"
 #include "mkl_trans.h"
+#endif
+
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
@@ -51,11 +54,12 @@ using mkldnn::reorder;
 typedef unsigned int uint;
 #endif
 
-// The file contains a number of utility classes and functions used by MKL
-// enabled kernels
 
 namespace tensorflow {
 
+// The file contains a number of utility classes and functions used by MKL
+// enabled kernels
+
 // This class encapsulates all the meta data that is associated with an MKL
 // tensor. A tensor is an MKL tensor if it was created as the result of an
 // MKL operation, and did not go through a conversion to a standard
@@ -71,6 +75,7 @@ typedef enum {
   Dim_I = 1
 } MklDnnDims;
 
+#ifdef INTEL_MKL_ML
 class MklShape {
  public:
   MklShape() {}
@@ -331,7 +336,7 @@ class MklShape {
       nullptr;  // TF dimension corresponding to this MKL dimension
 };
 
-#ifndef INTEL_MKL_ML
+#else
 
 // Forward decl
 TensorFormat MklDnnDataFormatToTFDataFormat(memory::format format);
@@ -664,12 +669,14 @@ class MklDnnShape {
 
 // List of MklShape objects. Used in Concat/Split layers.
 
-typedef std::vector<MklShape> MklShapeList;
 
 #ifndef INTEL_MKL_ML
 typedef std::vector<MklDnnShape> MklDnnShapeList;
+#else
+typedef std::vector<MklShape> MklShapeList;
 #endif
 
+#ifdef INTEL_MKL_ML
 // Check if all tensors specified by MklShapes are MKL tensors.
 inline bool AreAllMklTensors(const MklShapeList& shapes) {
   for (auto& s : shapes) {
@@ -680,7 +687,6 @@ inline bool AreAllMklTensors(const MklShapeList& shapes) {
   return true;
 }
 
-#ifdef INTEL_MKL_ML
 template <typename T>
 inline Tensor ConvertMklToTF(OpKernelContext* context, const Tensor& mkl_tensor,
                              const MklShape& mkl_shape) {
@@ -720,6 +726,7 @@ inline Tensor ConvertMklToTF(OpKernelContext* context, const Tensor& mkl_tensor,
 #endif
 
 // Get the MKL shape from the second string tensor
+#ifdef INTEL_MKL_ML
 inline void GetMklShape(OpKernelContext* ctext, int n, MklShape* mklshape) {
   mklshape->DeSerializeMklShape(
       ctext->input(GetTensorMetaDataIndex(n, ctext->num_inputs()))
@@ -730,8 +737,7 @@ inline void GetMklShape(OpKernelContext* ctext, int n, MklShape* mklshape) {
               .size() *
           sizeof(uint8));
 }
-
-#ifndef INTEL_MKL_ML
+#else
 inline void GetMklShape(OpKernelContext* ctext, int n, MklDnnShape* mklshape) {
   mklshape->DeSerializeMklDnnShape(
       ctext->input(GetTensorMetaDataIndex(n, ctext->num_inputs()))
@@ -805,6 +811,7 @@ inline TensorShape GetTfShape(OpKernelContext* context, size_t input_idx) {
 }
 #endif
 
+#ifdef INTEL_MKL_ML
 // Allocate the second output tensor that will contain
 // the MKL shape serialized
 inline void AllocateOutputSetMklShape(OpKernelContext* ctext, int n,
@@ -820,7 +827,7 @@ inline void AllocateOutputSetMklShape(OpKernelContext* ctext, int n,
       second_tensor->flat<uint8>().size() * sizeof(uint8));
 }
 
-#ifndef INTEL_MKL_ML
+#else
 // Allocate the second output tensor that will contain
 // the MKL shape serialized
 inline void AllocateOutputSetMklShape(OpKernelContext* ctext, int n,
@@ -837,6 +844,7 @@ inline void AllocateOutputSetMklShape(OpKernelContext* ctext, int n,
 }
 #endif
 
+#ifdef INTEL_MKL_ML
 // Allocate the output tensor, create a second output tensor that will contain
 // the MKL shape serialized
 inline void AllocateOutputSetMklShape(OpKernelContext* ctext, int n,
@@ -857,7 +865,7 @@ inline void AllocateOutputSetMklShape(OpKernelContext* ctext, int n,
       second_tensor->flat<uint8>().size() * sizeof(uint8));
 }
 
-#ifndef INTEL_MKL_ML
+#else
 // Allocate the output tensor, create a second output tensor that will contain
 // the MKL shape serialized
 inline void AllocateOutputSetMklShape(OpKernelContext* ctext, int n,
@@ -892,8 +900,7 @@ inline void AllocTmpBuffer(OpKernelContext* context, Tensor* tensor_out,
                                                  tf_shape, tensor_out));
   *buf_out = static_cast<void*>(tensor_out->flat<T>().data());
 }
-#endif
-
+#else
 inline void AllocTmpBuffer(OpKernelContext* context, Tensor* tensor_out,
                            dnnLayout_t lt_buff, void** buf_out) {
   TensorShape tf_shape;
@@ -907,6 +914,7 @@ inline void AllocTmpBuffer(OpKernelContext* context, Tensor* tensor_out,
   *buf_out = static_cast<void*>(tensor_out->flat<float>().data());
 }
 
+#endif
 template <typename T>
 inline void AllocTmpBuffer(OpKernelContext* context, Tensor* tensor_out,
                            TensorShape tf_shape) {
@@ -930,6 +938,7 @@ inline void GetStridesFromSizes(TensorFormat data_format, size_t* strides,
   }
 }
 
+#ifdef INTEL_MKL_ML
 inline void MklSizesToTFSizes(OpKernelContext* context,
                               TensorFormat data_format_,
                               const MklShape& mkl_shape,
@@ -955,6 +964,7 @@ inline void MklSizesToTFSizes(OpKernelContext* context,
 
   OP_REQUIRES_OK(context, TensorShapeUtils::MakeShape(sizes, tf_shape));
 }
+#endif
 
 inline int32 GetMklTensorDimIndex(char dimension) {
   switch (dimension) {
@@ -972,12 +982,14 @@ inline int32 GetMklTensorDimIndex(char dimension) {
   }
 }
 
+#ifdef INTEL_MKL_ML
 inline int64 GetMklTensorDim(const MklShape& mkl_shape, char dimension) {
   int index = GetMklTensorDimIndex(dimension);
   CHECK(index >= 0 && index < mkl_shape.GetDimension())
       << "Invalid index from the dimension: " << index << ", " << dimension;
   return mkl_shape.dim_size(index);
 }
+#endif
 
 inline void CopyMklTensorInToOut(OpKernelContext* context, int idx_in,
                                  int idx_out) {
@@ -1097,6 +1109,14 @@ inline void ForwardMklTensorInToOut(OpKernelContext* context, int idx_in,
 }
 
 #ifndef INTEL_MKL_ML
+// Set a dummy MKLDNN shape (called when the output is in TF format)
+inline void SetDummyMklDnnShapeOutput(OpKernelContext* context,
+                                      uint32 idx_data_out) {
+  MklDnnShape mkl_shape_output;
+  mkl_shape_output.SetMklTensor(false);
+  AllocateOutputSetMklShape(context, idx_data_out, mkl_shape_output);
+}
+
 inline void ForwardMklTensorInToOutWithMklShape(OpKernelContext* context,
                                                 int idx_in, int idx_out,
                                                 const MklDnnShape& mkl_shape) {
@@ -1132,6 +1152,7 @@ inline void ForwardMklMetaDataInToOut(OpKernelContext* context,
   }
 }
 
+#ifdef INTEL_MKL_ML
 // Set a dummy MKL shape (called when the output is in TF format)
 inline void SetDummyMklShapeOutput(OpKernelContext* context,
                                    uint32 idx_data_out) {
@@ -1139,8 +1160,6 @@ inline void SetDummyMklShapeOutput(OpKernelContext* context,
   mkl_shape_output.SetMklTensor(false);
   AllocateOutputSetMklShape(context, idx_data_out, mkl_shape_output);
 }
-
-#ifdef INTEL_MKL_ML
 // We don't need these functions in MKLDNN. We have defined equality operator
 // on MklDnnShape class directly.
 
@@ -1210,7 +1229,6 @@ inline bool MklCompareShapes(const TensorShape* input_shape_0,
 
   return true;
 }
-#endif
 
 // These functions do not compile with MKL-DNN since mkl.h is missing.
 // We may need to remove them later.
@@ -1248,6 +1266,7 @@ inline void MklNCHWToNHWC(const Tensor& input, Tensor** output) {
   }
 }
 
+#endif
 // -------------------------------------------------------------------
 
 #ifndef INTEL_MKL_ML
-- 
GitLab


From 60dccab365de5089dbf3a680b7234e5b158362cd Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Fri, 8 Jun 2018 15:18:23 -0700
Subject: [PATCH 509/610] [tf.data] Print an actionable warning when a lookup
 table is created in a function.

PiperOrigin-RevId: 199859228
---
 .../contrib/data/python/ops/grouping.py       | 10 ++++++
 .../contrib/data/python/ops/scan_ops.py       |  2 ++
 .../data/kernel_tests/map_dataset_op_test.py  | 21 +++++++++++++
 tensorflow/python/data/ops/dataset_ops.py     | 31 +++++++++++++++++++
 tensorflow/python/data/ops/readers.py         |  3 ++
 5 files changed, 67 insertions(+)

diff --git a/tensorflow/contrib/data/python/ops/grouping.py b/tensorflow/contrib/data/python/ops/grouping.py
index ea229b5b27..520f784228 100644
--- a/tensorflow/contrib/data/python/ops/grouping.py
+++ b/tensorflow/contrib/data/python/ops/grouping.py
@@ -300,6 +300,7 @@ class GroupByReducerDataset(dataset_ops.Dataset):
         raise ValueError(
             "`key_func` must return a single tf.int64 tensor. "
             "Got type=%s and shape=%s" % (ret.dtype, ret.get_shape()))
+      dataset_ops._warn_if_collections("tf.contrib.data.group_by_reducer()")  # pylint: disable=protected-access
       return ret
 
     self._key_func = tf_key_func
@@ -327,6 +328,8 @@ class GroupByReducerDataset(dataset_ops.Dataset):
       self._state_types = nest.pack_sequence_as(
           ret, [t.dtype for t in nest.flatten(ret)])
 
+      dataset_ops._warn_if_collections("tf.contrib.data.group_by_reducer()")  # pylint: disable=protected-access
+
       # Serialize any sparse tensors.
       ret = nest.pack_sequence_as(
           ret, [t for t in nest.flatten(sparse.serialize_sparse_tensors(ret))])
@@ -398,6 +401,8 @@ class GroupByReducerDataset(dataset_ops.Dataset):
                  nest.pack_sequence_as(self._state_types,
                                        [t.dtype for t in flat_new_state])))
 
+        dataset_ops._warn_if_collections("tf.contrib.data.group_by_reducer()")  # pylint: disable=protected-access
+
         # Serialize any sparse tensors.
         ret = nest.pack_sequence_as(
             ret,
@@ -464,6 +469,8 @@ class GroupByReducerDataset(dataset_ops.Dataset):
       self._output_types = nest.pack_sequence_as(
           ret, [t.dtype for t in nest.flatten(ret)])
 
+      dataset_ops._warn_if_collections("tf.contrib.data.group_by_reducer()")  # pylint: disable=protected-access
+
       # Serialize any sparse tensors.
       ret = nest.pack_sequence_as(
           ret, [t for t in nest.flatten(sparse.serialize_sparse_tensors(ret))])
@@ -525,6 +532,7 @@ class GroupByWindowDataset(dataset_ops.Dataset):
       if window_size.dtype != dtypes.int64:
         raise ValueError(
             "`window_size_func` must return a single tf.int64 tensor.")
+      dataset_ops._warn_if_collections("tf.contrib.data.group_by_window()")  # pylint: disable=protected-access
       return window_size
 
     self._window_size_func = tf_window_size_func
@@ -557,6 +565,7 @@ class GroupByWindowDataset(dataset_ops.Dataset):
       ret = ops.convert_to_tensor(ret, dtype=dtypes.int64)
       if ret.dtype != dtypes.int64:
         raise ValueError("`key_func` must return a single tf.int64 tensor.")
+      dataset_ops._warn_if_collections("tf.contrib.data.group_by_window()")  # pylint: disable=protected-access
       return ret
 
     self._key_func = tf_key_func
@@ -580,6 +589,7 @@ class GroupByWindowDataset(dataset_ops.Dataset):
       self._output_classes = output_dataset.output_classes
       self._output_types = output_dataset.output_types
       self._output_shapes = output_dataset.output_shapes
+      dataset_ops._warn_if_collections("tf.contrib.data.group_by_window()")  # pylint: disable=protected-access
       return output_dataset._as_variant_tensor()  # pylint: disable=protected-access
 
     self._reduce_func = tf_reduce_func
diff --git a/tensorflow/contrib/data/python/ops/scan_ops.py b/tensorflow/contrib/data/python/ops/scan_ops.py
index e911ad0fa0..9909ca8d9d 100644
--- a/tensorflow/contrib/data/python/ops/scan_ops.py
+++ b/tensorflow/contrib/data/python/ops/scan_ops.py
@@ -148,6 +148,8 @@ class _ScanDataset(dataset_ops.Dataset):
         self._output_types = nest.pack_sequence_as(
             output_value, [t.dtype for t in nest.flatten(output_value)])
 
+        dataset_ops._warn_if_collections("tf.contrib.data.scan()")  # pylint: disable=protected-access
+
         # Serialize any sparse tensors.
         new_state = nest.pack_sequence_as(new_state, [
             t for t in nest.flatten(sparse.serialize_sparse_tensors(new_state))
diff --git a/tensorflow/python/data/kernel_tests/map_dataset_op_test.py b/tensorflow/python/data/kernel_tests/map_dataset_op_test.py
index 1ad0b9de5e..768d4ac82c 100644
--- a/tensorflow/python/data/kernel_tests/map_dataset_op_test.py
+++ b/tensorflow/python/data/kernel_tests/map_dataset_op_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 from collections import namedtuple
 import threading
 import time
+import warnings
 
 import numpy as np
 
@@ -638,6 +639,26 @@ class MapDatasetTest(test.TestCase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
+  def testWarnOnLookupTable(self):
+    def collecting_function(x):
+      _ = lookup_ops.HashTable(
+          lookup_ops.KeyValueTensorInitializer([], []), 0.0, name="t1")
+      return x
+
+    warnings.simplefilter("always")
+    with warnings.catch_warnings(record=True) as w:
+      _ = dataset_ops.Dataset.range(10).map(collecting_function)
+    # NOTE(mrry): Python 3 prints other warnings in addition to the one we are
+    # testing, so we search for the expected warning.
+    self.assertGreaterEqual(len(w), 1)
+    found_warning = False
+    for warning in w:
+      if ("Creating lookup tables inside a function passed to Dataset.map() is "
+          "not supported." in str(warning)):
+        found_warning = True
+        break
+    self.assertTrue(found_warning)
+
 
 class MapDatasetBenchmark(test.Benchmark):
 
diff --git a/tensorflow/python/data/ops/dataset_ops.py b/tensorflow/python/data/ops/dataset_ops.py
index 8b2a2e0a32..2ec6c6f154 100644
--- a/tensorflow/python/data/ops/dataset_ops.py
+++ b/tensorflow/python/data/ops/dataset_ops.py
@@ -19,6 +19,7 @@ from __future__ import print_function
 
 import abc
 import threading
+import warnings
 
 import numpy as np
 import six
@@ -1865,6 +1866,24 @@ def _should_unpack_args(args):
   return type(args) is tuple  # pylint: disable=unidiomatic-typecheck
 
 
+def _warn_if_collections(transformation_name):
+  """Prints warning message if the current graph uses common graph collections.
+
+  NOTE(mrry): Currently a warning is only generated for lookup tables. Any
+  variables created will be automatically hoisted out to the outermost scope
+  using `init_scope()`. Some collections (such as for control-flow contexts)
+  are benign and should not generate a warning.
+
+  Args:
+    transformation_name: A human-readable name for the transformation.
+  """
+  if ops.get_default_graph().get_collection(ops.GraphKeys.TABLE_INITIALIZERS):
+    warnings.warn("Creating lookup tables inside a function passed to %s is not"
+                  " supported. Create each table outside the function, and "
+                  "capture it inside the function to use it."
+                  % transformation_name)
+
+
 class MapDataset(Dataset):
   """A `Dataset` that maps a function over elements in its input."""
 
@@ -1924,6 +1943,8 @@ class MapDataset(Dataset):
       self._output_types = nest.pack_sequence_as(
           ret, [t.dtype for t in nest.flatten(ret)])
 
+      _warn_if_collections("Dataset.map()")
+
       # Serialize any sparse tensors.
       ret = nest.pack_sequence_as(
           ret, [t for t in nest.flatten(sparse.serialize_sparse_tensors(ret))])
@@ -2012,6 +2033,8 @@ class FlatMapDataset(Dataset):
       if not isinstance(dataset, Dataset):
         raise TypeError("`map_func` must return a `Dataset` object.")
 
+      _warn_if_collections(self._transformation_name())
+
       self._output_classes = dataset.output_classes
       self._output_types = dataset.output_types
       self._output_shapes = dataset.output_shapes
@@ -2043,6 +2066,9 @@ class FlatMapDataset(Dataset):
   def output_types(self):
     return self._output_types
 
+  def _transformation_name(self):
+    return "Dataset.flat_map()"
+
 
 class InterleaveDataset(FlatMapDataset):
   """A `Dataset` that maps a function over its input and interleaves the result.
@@ -2068,6 +2094,9 @@ class InterleaveDataset(FlatMapDataset):
         output_shapes=nest.flatten(
             sparse.as_dense_shapes(self.output_shapes, self.output_classes)))
 
+  def _transformation_name(self):
+    return "Dataset.interleave()"
+
 
 class FilterDataset(Dataset):
   """A `Dataset` that filters its input according to a predicate function."""
@@ -2102,6 +2131,8 @@ class FilterDataset(Dataset):
               ret.shape.is_compatible_with(tensor_shape.scalar())):
         raise ValueError("`predicate` must return a scalar boolean tensor.")
 
+      _warn_if_collections("Dataset.filter()")
+
       return ret
 
     self._predicate = tf_predicate
diff --git a/tensorflow/python/data/ops/readers.py b/tensorflow/python/data/ops/readers.py
index a73a8b5cdc..6a72ed380f 100644
--- a/tensorflow/python/data/ops/readers.py
+++ b/tensorflow/python/data/ops/readers.py
@@ -156,6 +156,9 @@ class ParallelInterleaveDataset(dataset_ops.InterleaveDataset):
             sparse.as_dense_shapes(self.output_shapes, self.output_classes)))
     # pylint: enable=protected-access
 
+  def _transformation_name(self):
+    return "tf.contrib.data.parallel_interleave()"
+
 
 @tf_export("data.TFRecordDataset")
 class TFRecordDataset(dataset_ops.Dataset):
-- 
GitLab


From aba275157880076c8fe39c5ecac48741938223c5 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 8 Jun 2018 15:21:34 -0700
Subject: [PATCH 510/610] Replace cout with VLOG(2).

PiperOrigin-RevId: 199859711
---
 tensorflow/core/grappler/optimizers/remapper.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/remapper.cc b/tensorflow/core/grappler/optimizers/remapper.cc
index efd870b118..4dde7ed1b4 100644
--- a/tensorflow/core/grappler/optimizers/remapper.cc
+++ b/tensorflow/core/grappler/optimizers/remapper.cc
@@ -200,8 +200,8 @@ Status Remapper::Optimize(Cluster* /*cluster*/, const GrapplerItem& item,
         }
       }
       if (optimizable) {
-        std::cout << "Optimizing fused batch norm node " << node.DebugString()
-                  << std::endl;
+        VLOG(2) << "Optimizing fused batch norm node " << node.DebugString()
+                << std::endl;
         AddBatchNormNodes(optimized_graph, node);
         continue;
       }
-- 
GitLab


From c552838d342cb6e5243a88b9e08d38b95c2b2291 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 8 Jun 2018 15:39:25 -0700
Subject: [PATCH 511/610] Add TensorArrayGradWithShape op.

PiperOrigin-RevId: 199862180
---
 .../api_def_TensorArrayGradWithShape.pbtxt    | 40 ++++++++++++++
 .../api_def_TensorArrayGradWithShape.pbtxt    |  4 ++
 tensorflow/core/kernels/tensor_array.cc       | 10 +++-
 tensorflow/core/kernels/tensor_array.h        |  4 +-
 tensorflow/core/kernels/tensor_array_ops.cc   | 46 +++++++++++++---
 tensorflow/core/ops/data_flow_ops.cc          | 44 +++++++++++++++
 .../kernel_tests/tensor_array_ops_test.py     | 54 +++++++++++++++++++
 tensorflow/python/ops/tensor_array_grad.py    |  1 +
 8 files changed, 192 insertions(+), 11 deletions(-)
 create mode 100644 tensorflow/core/api_def/base_api/api_def_TensorArrayGradWithShape.pbtxt
 create mode 100644 tensorflow/core/api_def/python_api/api_def_TensorArrayGradWithShape.pbtxt

diff --git a/tensorflow/core/api_def/base_api/api_def_TensorArrayGradWithShape.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorArrayGradWithShape.pbtxt
new file mode 100644
index 0000000000..dd37b94ffa
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorArrayGradWithShape.pbtxt
@@ -0,0 +1,40 @@
+op {
+  graph_op_name: "TensorArrayGradWithShape"
+  endpoint {
+    name: "TensorArrayGradWithShape"
+  }
+  in_arg {
+    name: "handle"
+    description: <<END
+The handle to the forward TensorArray.
+END
+  }
+  in_arg {
+    name: "flow_in"
+    description: <<END
+A float scalar that enforces proper chaining of operations.
+END
+  }
+  in_arg {
+    name: "shape_to_prepend"
+    description: <<END
+An int32 vector representing a shape. Elements in the gradient accumulator will
+have shape which is this shape_to_prepend value concatenated with shape of the
+elements in the TensorArray corresponding to the input handle.
+END
+  }
+  attr {
+    name: "source"
+    description: <<END
+The gradient source string, used to decide which gradient TensorArray
+to return.
+END
+  }
+  summary: "Creates a TensorArray for storing multiple gradients of values in the given handle."
+  description: <<END
+Similar to TensorArrayGradV3. However it creates an accumulator with an
+expanded shape compared to the input TensorArray whose gradient is being
+computed. This enables multiple gradients for the same TensorArray to be
+calculated using the same accumulator.
+END
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_TensorArrayGradWithShape.pbtxt b/tensorflow/core/api_def/python_api/api_def_TensorArrayGradWithShape.pbtxt
new file mode 100644
index 0000000000..5d76c112a0
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_TensorArrayGradWithShape.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorArrayGradWithShape"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/kernels/tensor_array.cc b/tensorflow/core/kernels/tensor_array.cc
index 7b85ff2ea4..765467bc1e 100644
--- a/tensorflow/core/kernels/tensor_array.cc
+++ b/tensorflow/core/kernels/tensor_array.cc
@@ -81,7 +81,8 @@ TF_CALL_complex128(TENSOR_ARRAY_SET_ZERO_GPU);
 
 std::atomic<int64> TensorArray::tensor_array_counter{0};
 
-Status TensorArray::CopyShapesFrom(TensorArray* rhs) {
+Status TensorArray::CopyShapesFrom(TensorArray* rhs,
+                                   const TensorShape* shape_to_prepend) {
   mutex_lock l(mu_);
   mutex_lock l_rhs(rhs->mu_);
   TF_RETURN_IF_ERROR(LockedReturnIfClosed());
@@ -97,7 +98,12 @@ Status TensorArray::CopyShapesFrom(TensorArray* rhs) {
     if (!rhs->tensors_[i].written) continue;
 
     // Copy the shape over.
-    tensors_[i].shape = rhs->tensors_[i].shape;
+    if (shape_to_prepend) {
+      tensors_[i].shape = *shape_to_prepend;
+      tensors_[i].shape.AppendShape(rhs->tensors_[i].shape);
+    } else {
+      tensors_[i].shape = rhs->tensors_[i].shape;
+    }
     // Mark as written.  Reads will know that if written is true and
     // read is false, and cleared is false, to return zeros of the
     // appropriate shape.  Future aggregating writes will only use the shape
diff --git a/tensorflow/core/kernels/tensor_array.h b/tensorflow/core/kernels/tensor_array.h
index 90b71e370c..68fab85770 100644
--- a/tensorflow/core/kernels/tensor_array.h
+++ b/tensorflow/core/kernels/tensor_array.h
@@ -325,13 +325,15 @@ class TensorArray : public ResourceBase {
   bool HasIdenticalElementShapes() const { return identical_element_shapes_; }
 
   // Copy the TensorShapes from another TensorArray into this one.
+  // If `shape_to_prepend` is set, expands the rank of the copied shape by
+  // prepending the passed in shape prefix to the shape values in `rhs`.
   // The sizes of the two TensorArrays must match and this one
   // may not have any entries filled in.  This performs a "soft copy",
   // essentially filling the current TensorArray with virtual
   // zero-tensors, which will be replaced by future aggregate writes,
   // or instantiated by future reads.  Requires a non-const pointer
   // to the rhs to access its mutex.
-  Status CopyShapesFrom(TensorArray* rhs);
+  Status CopyShapesFrom(TensorArray* rhs, const TensorShape* shape_to_prepend);
 
   // Clear the TensorArray, including any Tensor references, and mark as closed.
   void ClearAndMarkClosed() {
diff --git a/tensorflow/core/kernels/tensor_array_ops.cc b/tensorflow/core/kernels/tensor_array_ops.cc
index ef9748b1aa..37803ec775 100644
--- a/tensorflow/core/kernels/tensor_array_ops.cc
+++ b/tensorflow/core/kernels/tensor_array_ops.cc
@@ -264,7 +264,10 @@ REGISTER_GPU(bfloat16);
 #endif  // GOOGLE_CUDA
 
 // GRADIENT *******************************************************************
-
+// Note that this op may have an optional third input. If present, it represents
+// a shape value. It indicates that element shape of this gradient array is that
+// shape value concatenated with the element shape of the original tensor array.
+// See TensorArrayGradWithShape.
 class TensorArrayGradOp : public TensorArrayCreationOp {
  public:
   explicit TensorArrayGradOp(OpKernelConstruction* context)
@@ -325,18 +328,38 @@ class TensorArrayGradOp : public TensorArrayCreationOp {
           "previous write?  Gradient calculation is impossible when multiple "
           "writes are performed to the same index.");
     }
+    TensorShape shape_to_prepend;
+    auto element_shape = PartialTensorShape();
+    if (ctx->num_inputs() > 2) {
+      TF_RETURN_IF_ERROR(
+          ctx->op_kernel().MakeShape(ctx->input(2), &shape_to_prepend));
+      auto ta_element_shape = tensor_array->ElemShape();
+      if (!ta_element_shape.unknown_rank()) {
+        std::vector<int64> dims;
+        for (auto dim : shape_to_prepend) {
+          dims.push_back(dim.size);
+        }
+        for (auto dim : ta_element_shape) {
+          dims.push_back(dim.size);
+        }
+        TF_RETURN_IF_ERROR(TensorShapeUtils::MakeShape(
+            gtl::ArraySlice<int64>(dims), &element_shape));
+      }
+    } else {
+      element_shape = tensor_array->ElemShape();
+    }
 
     const auto key = strings::StrCat(output_handle(0), output_handle(1));
     auto creator = [this, key, tensor_array, array_size, marked_size,
-                    tensor_array_output_handle,
+                    element_shape, shape_to_prepend, tensor_array_output_handle,
                     output_handle](TensorArray** ret) -> Status {
       *ret = new TensorArray(
           key, tensor_array->ElemType(), *tensor_array_output_handle,
-          array_size, tensor_array->ElemShape(),
-          tensor_array->HasIdenticalElementShapes(), false /* dynamic_size */,
-          true /* multiple_writes_aggregate */, true /* is_grad */,
-          marked_size /* marked_size */, true /* close_after_read */);
-      return (*ret)->CopyShapesFrom(tensor_array);
+          array_size, element_shape, tensor_array->HasIdenticalElementShapes(),
+          false /* dynamic_size */, true /* multiple_writes_aggregate */,
+          true /* is_grad */, marked_size /* marked_size */,
+          true /* close_after_read */);
+      return (*ret)->CopyShapesFrom(tensor_array, &shape_to_prepend);
     };
 
     Status s = rm->LookupOrCreate<TensorArray>(
@@ -361,7 +384,8 @@ REGISTER_KERNEL_BUILDER(Name("TensorArrayGradV2").Device(DEVICE_CPU),
                         TensorArrayGradOp);
 REGISTER_KERNEL_BUILDER(Name("TensorArrayGradV3").Device(DEVICE_CPU),
                         TensorArrayGradOp);
-
+REGISTER_KERNEL_BUILDER(Name("TensorArrayGradWithShape").Device(DEVICE_CPU),
+                        TensorArrayGradOp);
 REGISTER_KERNEL_BUILDER(Name("TensorArrayGrad")
                             .Device(DEVICE_GPU)
                             .HostMemory("handle")
@@ -377,6 +401,12 @@ REGISTER_KERNEL_BUILDER(Name("TensorArrayGradV3")
                             .HostMemory("handle")
                             .HostMemory("grad_handle"),
                         TensorArrayGradOp);
+REGISTER_KERNEL_BUILDER(Name("TensorArrayGradWithShape")
+                            .Device(DEVICE_GPU)
+                            .HostMemory("handle")
+                            .HostMemory("shape_to_prepend")
+                            .HostMemory("grad_handle"),
+                        TensorArrayGradOp);
 
 // WRITE **********************************************************************
 
diff --git a/tensorflow/core/ops/data_flow_ops.cc b/tensorflow/core/ops/data_flow_ops.cc
index 3112f35da4..eed0bce174 100644
--- a/tensorflow/core/ops/data_flow_ops.cc
+++ b/tensorflow/core/ops/data_flow_ops.cc
@@ -608,6 +608,50 @@ REGISTER_OP("TensorArrayGradV3")
       return Status::OK();
     });
 
+REGISTER_OP("TensorArrayGradWithShape")
+    .Input("handle: resource")
+    .Input("flow_in: float")
+    .Input("shape_to_prepend: int32")
+    .Output("grad_handle: resource")
+    .Output("flow_out: float")
+    .Attr("source: string")
+    .SetIsStateful()
+    .SetShapeFn([](InferenceContext* c) {
+      ShapeHandle handle;
+      DimensionHandle unused_dim;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &handle));
+      TF_RETURN_IF_ERROR(c->WithValue(c->Dim(handle, 0), 2, &unused_dim));
+      c->set_output(0, c->Vector(2));
+      c->set_output(1, c->Scalar());
+      auto* shape_and_type = c->input_handle_shapes_and_types(0);
+      if (shape_and_type) {
+        auto input_shape = (*shape_and_type)[0].shape;
+        auto dtype = (*shape_and_type)[0].dtype;
+        // Note that shape_to_prepend is a rank 1 Tensor representing a shape.
+        // The size of dimension 0 is the number of dimensions we need to add to
+        // output shape.
+        int64 prepend_rank = c->Value(c->Dim(c->input(2), 0));
+        if (c->RankKnown(input_shape) &&
+            prepend_rank != InferenceContext::kUnknownDim) {
+          int32 input_rank = c->Rank(input_shape);
+          std::vector<DimensionHandle> dims;
+          dims.reserve(prepend_rank + input_rank);
+          for (int i = 0; i < prepend_rank; ++i) {
+            dims.push_back(c->UnknownDim());
+          }
+          for (int i = 0; i < input_rank; ++i) {
+            dims.push_back(c->Dim(input_shape, i));
+          }
+          c->set_output_handle_shapes_and_types(0,
+                                                {{c->MakeShape(dims), dtype}});
+        } else {
+          c->set_output_handle_shapes_and_types(0,
+                                                {{c->UnknownShape(), dtype}});
+        }
+      }
+      return Status::OK();
+    });
+
 REGISTER_OP("TensorArrayWriteV3")
     .Input("handle: resource")
     .Input("index: int32")
diff --git a/tensorflow/python/kernel_tests/tensor_array_ops_test.py b/tensorflow/python/kernel_tests/tensor_array_ops_test.py
index c0b36f143d..ea06357804 100644
--- a/tensorflow/python/kernel_tests/tensor_array_ops_test.py
+++ b/tensorflow/python/kernel_tests/tensor_array_ops_test.py
@@ -26,11 +26,13 @@ from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import gen_data_flow_ops
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import init_ops
@@ -549,6 +551,58 @@ class TensorArrayTest(test.TestCase):
                   dtypes.complex64, dtypes.complex128):
       self._testTensorArrayWriteGradientAddMultipleAdds(dtype)
 
+  def testTensorArrayGradWithShapeKnownElementShape(self):
+    with self.test_session(use_gpu=True) as sess:
+      ta = tensor_array_ops.TensorArray(
+          size=3,
+          dtype=dtypes.float32,
+          element_shape=tensor_shape.TensorShape([2, 3]))
+      handle, flow = data_flow_ops.tensor_array_grad_with_shape(
+          handle=ta.handle,
+          flow_in=ta.flow,
+          shape_to_prepend=tensor_shape.TensorShape([4, 5]),
+          source="source")
+      ta_grad = tensor_array_ops.TensorArray(
+          dtypes.float32, handle=handle, flow=flow)
+      value = array_ops.placeholder(dtypes.float32)
+      ta_grad = ta_grad.write(0, value)
+      read_value = ta_grad.read(0)
+
+      # Make sure shape inference worked.
+      self.assertAllEqual([None, None, 2, 3], read_value.shape.as_list())
+      # Writing with wrong shape should not work.
+      with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                   "Could not write to TensorArray"):
+        fed_value = np.random.random([2, 3])
+        sess.run(read_value, feed_dict={value: fed_value})
+      # Writing with correct shape should work.
+      fed_value = np.random.random([4, 5, 2, 3])
+      self.assertAllClose(fed_value,
+                          sess.run(read_value, feed_dict={value: fed_value}))
+
+  def testTensorArrayGradWithShapeUnknownElementShape(self):
+    with self.test_session(use_gpu=True) as sess:
+      ta = tensor_array_ops.TensorArray(
+          size=3, dtype=dtypes.float32,
+          element_shape=None)  # Note that element_shape is unknown
+      handle, flow = data_flow_ops.tensor_array_grad_with_shape(
+          handle=ta.handle,
+          flow_in=ta.flow,
+          shape_to_prepend=tensor_shape.TensorShape([4, 5]),
+          source="source")
+      ta_grad = tensor_array_ops.TensorArray(
+          dtypes.float32, handle=handle, flow=flow)
+      value = array_ops.placeholder(dtypes.float32)
+      ta_grad = ta_grad.write(0, value)
+      read_value = ta_grad.read(0)
+
+      # Make sure shape inference worked.
+      self.assertIsNone(read_value.shape.ndims)
+      # Write with some shape and check read value.
+      fed_value = np.random.random([4, 5, 7])
+      self.assertAllClose(fed_value,
+                          sess.run(read_value, feed_dict={value: fed_value}))
+
   @test_util.run_in_graph_and_eager_modes()
   def testMultiTensorArray(self):
     with self.test_session(use_gpu=True):
diff --git a/tensorflow/python/ops/tensor_array_grad.py b/tensorflow/python/ops/tensor_array_grad.py
index 1f70d69548..d341349804 100644
--- a/tensorflow/python/ops/tensor_array_grad.py
+++ b/tensorflow/python/ops/tensor_array_grad.py
@@ -34,6 +34,7 @@ ops.NotDifferentiable("TensorArrayCloseV2")
 
 ops.NotDifferentiable("TensorArrayV3")
 ops.NotDifferentiable("TensorArrayGradV3")
+ops.NotDifferentiable("TensorArrayGradWithShape")
 ops.NotDifferentiable("TensorArraySizeV3")
 ops.NotDifferentiable("TensorArrayCloseV3")
 
-- 
GitLab


From 5ad54de7b77f8ebed8db0f99ef93cede46daecc3 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 8 Jun 2018 15:41:38 -0700
Subject: [PATCH 512/610] Split out HloSliceInstruction as subclasses from
 HloInstruction.

PiperOrigin-RevId: 199862467
---
 .../compiler/xla/service/hlo_instruction.cc   | 94 ++++++++++---------
 .../compiler/xla/service/hlo_instruction.h    | 65 +++----------
 .../compiler/xla/service/hlo_instructions.cc  | 63 +++++++++++++
 .../compiler/xla/service/hlo_instructions.h   | 56 +++++++++++
 4 files changed, 182 insertions(+), 96 deletions(-)

diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc
index a778a6a965..f0fec77c31 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction.cc
@@ -165,6 +165,19 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
       instruction = CreateMap(proto.shape(), map_operands, computations(0));
       break;
     }
+    case HloOpcode::kSlice: {
+      CHECK_EQ(proto.operand_ids_size(), 1);
+      std::vector<int64> slice_starts, slice_limits, slice_strides;
+      for (const HloInstructionProto::SliceDimensions& slice_dimensions :
+           proto.slice_dimensions()) {
+        slice_starts.push_back(slice_dimensions.start());
+        slice_limits.push_back(slice_dimensions.limit());
+        slice_strides.push_back(slice_dimensions.stride());
+      }
+      instruction = CreateSlice(proto.shape(), operands(0), slice_starts,
+                                slice_limits, slice_strides);
+      break;
+    }
     default: {
       instruction = WrapUnique(new HloInstruction(opcode, proto.shape()));
       for (const int64 operand_id : proto.operand_ids()) {
@@ -241,12 +254,7 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
     instruction->dot_dimension_numbers_ =
         MakeUnique<DotDimensionNumbers>(proto.dot_dimension_numbers());
   }
-  for (const HloInstructionProto::SliceDimensions& slice_dimensions :
-       proto.slice_dimensions()) {
-    instruction->slice_starts_.push_back(slice_dimensions.start());
-    instruction->slice_limits_.push_back(slice_dimensions.limit());
-    instruction->slice_strides_.push_back(slice_dimensions.stride());
-  }
+
   instruction->exponent_bits_ = proto.exponent_bits();
   instruction->mantissa_bits_ = proto.mantissa_bits();
   for (int64 dynamic_slice_size : proto.dynamic_slice_sizes()) {
@@ -627,18 +635,8 @@ HloInstruction::CreateGenerateToken(
     tensorflow::gtl::ArraySlice<int64> start_indices,
     tensorflow::gtl::ArraySlice<int64> limit_indices,
     tensorflow::gtl::ArraySlice<int64> strides) {
-  auto instruction = WrapUnique(new HloInstruction(HloOpcode::kSlice, shape));
-  instruction->AppendOperand(operand);
-  instruction->slice_starts_.assign(start_indices.begin(), start_indices.end());
-  instruction->slice_limits_.assign(limit_indices.begin(), limit_indices.end());
-  instruction->slice_strides_.assign(strides.begin(), strides.end());
-  // For backward compatibility with old serialized computations: if there are
-  // no strides, assume all strides are 1.
-  // TODO(b/63317920): remove this code.
-  if (instruction->slice_strides_.empty()) {
-    instruction->slice_strides_ = std::vector<int64>(start_indices.size(), 1LL);
-  }
-  return instruction;
+  return MakeUnique<HloSliceInstruction>(shape, operand, start_indices,
+                                         limit_indices, strides);
 }
 
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateDynamicSlice(
@@ -1322,6 +1320,7 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
     case HloOpcode::kTranspose:
     case HloOpcode::kBroadcast:
     case HloOpcode::kMap:
+    case HloOpcode::kSlice:
       clone = CloneWithNewOperandsImpl(shape, new_operands, context);
       break;
     // Unary ops.
@@ -1453,11 +1452,6 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
       CHECK_EQ(new_operands.size(), 1);
       clone = CreateReshape(shape, new_operands[0]);
       break;
-    case HloOpcode::kSlice:
-      CHECK_EQ(new_operands.size(), 1);
-      clone = CreateSlice(shape, new_operands[0], slice_starts_, slice_limits_,
-                          slice_strides_);
-      break;
     case HloOpcode::kDynamicSlice:
       clone = CreateDynamicSlice(shape, new_operands[0], new_operands[1],
                                  dynamic_slice_sizes_);
@@ -1838,10 +1832,6 @@ bool HloInstruction::IdenticalSlowPath(
     case HloOpcode::kPad:
       return protobuf_util::ProtobufEquals(padding_config(),
                                            other.padding_config());
-    case HloOpcode::kSlice:
-      return slice_starts_ == other.slice_starts_ &&
-             slice_limits_ == other.slice_limits_ &&
-             slice_strides_ == other.slice_strides_;
     case HloOpcode::kCall:
     case HloOpcode::kCrossReplicaSum:
       return eq_computations(to_apply(), other.to_apply());
@@ -1887,6 +1877,7 @@ bool HloInstruction::IdenticalSlowPath(
     case HloOpcode::kTranspose:
     case HloOpcode::kBroadcast:
     case HloOpcode::kMap:
+    case HloOpcode::kSlice:
       LOG(FATAL) << "Base class impl called for opcode with subclass: "
                  << opcode();
   }
@@ -2256,19 +2247,7 @@ std::vector<string> HloInstruction::ExtraAttributesToString(
     extra.push_back(
         StrCat("padding=", xla::PaddingConfigToString(*padding_config_)));
   }
-  if (opcode() == HloOpcode::kSlice) {
-    std::vector<string> bounds;
-    bounds.reserve(slice_starts_.size());
-    const bool omit_stride =
-        std::all_of(slice_strides_.begin(), slice_strides_.end(),
-                    [](int64 stride) { return stride == 1; });
-    for (int i = 0; i < slice_starts_.size(); ++i) {
-      string stride_str = omit_stride ? "" : StrCat(":", slice_strides_[i]);
-      bounds.push_back(StrCat("[", slice_starts_[i], ":", slice_limits_[i],
-                              stride_str, "]"));
-    }
-    extra.push_back(StrCat("slice={", Join(bounds, ", "), "}"));
-  }
+
   if (opcode() == HloOpcode::kDynamicSlice) {
     extra.push_back(
         StrCat("dynamic_slice_sizes={", Join(dynamic_slice_sizes(), ","), "}"));
@@ -2464,12 +2443,7 @@ HloInstructionProto HloInstruction::ToProto() const {
       proto.add_gather_window_bounds(bound);
     }
   }
-  for (int i = 0; i < slice_starts_.size(); ++i) {
-    auto* slice_dimension = proto.add_slice_dimensions();
-    slice_dimension->set_start(slice_starts_[i]);
-    slice_dimension->set_limit(slice_limits_[i]);
-    slice_dimension->set_stride(slice_strides_[i]);
-  }
+
   proto.set_exponent_bits(exponent_bits_);
   proto.set_mantissa_bits(mantissa_bits_);
   for (int64 slice_size : dynamic_slice_sizes_) {
@@ -3572,4 +3546,32 @@ bool HloInstruction::IsRank2Transpose() const {
   auto transpose = DynCast<HloTransposeInstruction>(this);
   return transpose != nullptr && transpose->IsRank2Transpose();
 }
+
+int64 HloInstruction::slice_starts(int64 dimension) const {
+  return Cast<HloSliceInstruction>(this)->slice_starts(dimension);
+}
+
+const std::vector<int64>& HloInstruction::slice_starts() const {
+  return Cast<HloSliceInstruction>(this)->slice_starts();
+}
+
+int64 HloInstruction::slice_limits(int64 dimension) const {
+  return Cast<HloSliceInstruction>(this)->slice_limits(dimension);
+}
+
+const std::vector<int64>& HloInstruction::slice_limits() const {
+  return Cast<HloSliceInstruction>(this)->slice_limits();
+}
+
+int64 HloInstruction::slice_strides(int64 dimension) const {
+  return Cast<HloSliceInstruction>(this)->slice_strides(dimension);
+}
+
+const std::vector<int64>& HloInstruction::slice_strides() const {
+  return Cast<HloSliceInstruction>(this)->slice_strides();
+}
+
+bool HloInstruction::IsInPlaceSlice() const {
+  return Cast<HloSliceInstruction>(this)->IsInPlaceSlice();
+}
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h
index d252533eb2..5c5def58d3 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.h
+++ b/tensorflow/compiler/xla/service/hlo_instruction.h
@@ -1189,48 +1189,6 @@ class HloInstruction {
     return FuseInstructionInternal(instruction_to_fuse, /* add_output */ true);
   }
 
-  // Returns the start index in the given dimension for a slice node.
-  //
-  // Precondition: opcode() == HloOpcode::kSlice
-  int64 slice_starts(int64 dimension) const {
-    CHECK_EQ(HloOpcode::kSlice, opcode_);
-    return slice_starts_[dimension];
-  }
-  const std::vector<int64>& slice_starts() const { return slice_starts_; }
-
-  // Returns the (exclusive) limit index in the given dimension for a slice
-  // node.
-  //
-  // Precondition: opcode() == HloOpcode::kSlice
-  int64 slice_limits(int64 dimension) const {
-    CHECK_EQ(HloOpcode::kSlice, opcode_);
-    return slice_limits_[dimension];
-  }
-  const std::vector<int64>& slice_limits() const {
-    CHECK_EQ(HloOpcode::kSlice, opcode_);
-    return slice_limits_;
-  }
-
-  // Returns the stride in the given dimension for a slice node.
-  //
-  // Precondition: opcode() == HloOpcode::kSlice
-  int64 slice_strides(int64 dimension) const {
-    CHECK_EQ(HloOpcode::kSlice, opcode_);
-    return slice_strides_[dimension];
-  }
-  const std::vector<int64>& slice_strides() const { return slice_strides_; }
-
-  // Returns the flag that describes whether a slice must be lowered into an
-  // offset into the original operand.
-  bool IsInPlaceSlice() const { return is_in_place_slice_; }
-
-  // Sets and returns the flag that describes whether a slice must be lowered
-  // into an offset into the original operand.
-  bool SetIsInPlaceSlice(bool value) {
-    is_in_place_slice_ = value;
-    return value;
-  }
-
   // Returns the size of the slice in the given dimension for a dynamic
   // slice node.
   //
@@ -1526,6 +1484,21 @@ class HloInstruction {
 
   // Returns whether this instruction does a rank-2 transposition.
   bool IsRank2Transpose() const;
+
+  // Delegates to HloSliceInstruction::slice_starts.
+  int64 slice_starts(int64 dimension) const;
+  const std::vector<int64>& slice_starts() const;
+
+  // Delegates to HloSliceInstruction::slice_limits.
+  int64 slice_limits(int64 dimension) const;
+  const std::vector<int64>& slice_limits() const;
+
+  // Delegates to HloSliceInstruction::slice_strides.
+  int64 slice_strides(int64 dimension) const;
+  const std::vector<int64>& slice_strides() const;
+
+  // Delegates to HloSliceInstruction::IsInPlaceSlice.
+  bool IsInPlaceSlice() const;
   // Old methods kept for smooth subclassing transition END.
 
  protected:
@@ -1679,14 +1652,6 @@ class HloInstruction {
   std::unique_ptr<GatherDimensionNumbers> gather_dimension_numbers_;
   std::vector<int64> gather_window_bounds_;
 
-  // Describes the [begin, end) index range for a slice.
-  std::vector<int64> slice_starts_;
-  std::vector<int64> slice_limits_;
-  std::vector<int64> slice_strides_;
-
-  // Describes whether the slice can be lowered to an offset into the operand.
-  bool is_in_place_slice_ = false;
-
   // The bit sizes for a reduce-precision operation.
   int32 exponent_bits_ = 0;
   int32 mantissa_bits_ = 0;
diff --git a/tensorflow/compiler/xla/service/hlo_instructions.cc b/tensorflow/compiler/xla/service/hlo_instructions.cc
index e987bd6d86..56792f8b1b 100644
--- a/tensorflow/compiler/xla/service/hlo_instructions.cc
+++ b/tensorflow/compiler/xla/service/hlo_instructions.cc
@@ -523,4 +523,67 @@ std::unique_ptr<HloInstruction> HloMapInstruction::CloneWithNewOperandsImpl(
     HloCloneContext* context) const {
   return MakeUnique<HloMapInstruction>(shape, new_operands, to_apply());
 }
+
+HloSliceInstruction::HloSliceInstruction(
+    const Shape& shape, HloInstruction* operand,
+    tensorflow::gtl::ArraySlice<int64> start_indices,
+    tensorflow::gtl::ArraySlice<int64> limit_indices,
+    tensorflow::gtl::ArraySlice<int64> strides)
+    : HloInstruction(HloOpcode::kSlice, shape),
+      slice_starts_(start_indices.begin(), start_indices.end()),
+      slice_limits_(limit_indices.begin(), limit_indices.end()),
+      slice_strides_(strides.begin(), strides.end()) {
+  AppendOperand(operand);
+  // For backward compatibility with old serialized computations: if there are
+  // no strides, assume all strides are 1.
+  // TODO(b/63317920): remove this code.
+  if (slice_strides_.empty()) {
+    slice_strides_ = std::vector<int64>(start_indices.size(), 1LL);
+  }
+}
+
+HloInstructionProto HloSliceInstruction::ToProto() const {
+  HloInstructionProto proto = HloInstruction::ToProto();
+  for (int i = 0; i < slice_starts_.size(); ++i) {
+    auto* slice_dimension = proto.add_slice_dimensions();
+    slice_dimension->set_start(slice_starts_[i]);
+    slice_dimension->set_limit(slice_limits_[i]);
+    slice_dimension->set_stride(slice_strides_[i]);
+  }
+  return proto;
+}
+
+std::vector<string> HloSliceInstruction::ExtraAttributesToStringImpl(
+    const HloPrintOptions& options) const {
+  std::vector<string> bounds;
+  bounds.reserve(slice_starts_.size());
+  const bool omit_stride =
+      std::all_of(slice_strides_.begin(), slice_strides_.end(),
+                  [](int64 stride) { return stride == 1; });
+  for (int i = 0; i < slice_starts_.size(); ++i) {
+    string stride_str = omit_stride ? "" : StrCat(":", slice_strides_[i]);
+    bounds.push_back(
+        StrCat("[", slice_starts_[i], ":", slice_limits_[i], stride_str, "]"));
+  }
+  return {StrCat("slice={", Join(bounds, ", "), "}")};
+}
+
+bool HloSliceInstruction::IdenticalSlowPath(
+    const HloInstruction& other,
+    const std::function<bool(const HloComputation*, const HloComputation*)>&
+        eq_computations) const {
+  const auto& other_slice = static_cast<const HloSliceInstruction&>(other);
+  return slice_starts_ == other_slice.slice_starts_ &&
+         slice_limits_ == other_slice.slice_limits_ &&
+         slice_strides_ == other_slice.slice_strides_;
+}
+
+std::unique_ptr<HloInstruction> HloSliceInstruction::CloneWithNewOperandsImpl(
+    const Shape& shape,
+    tensorflow::gtl::ArraySlice<HloInstruction*> new_operands,
+    HloCloneContext* context) const {
+  CHECK_EQ(new_operands.size(), 1);
+  return MakeUnique<HloSliceInstruction>(shape, new_operands[0], slice_starts_,
+                                         slice_limits_, slice_strides_);
+}
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_instructions.h b/tensorflow/compiler/xla/service/hlo_instructions.h
index c8c34f3406..18e786d8b6 100644
--- a/tensorflow/compiler/xla/service/hlo_instructions.h
+++ b/tensorflow/compiler/xla/service/hlo_instructions.h
@@ -377,6 +377,62 @@ class HloMapInstruction : public HloInstruction {
   std::vector<int64> dimensions_;
 };
 
+class HloSliceInstruction : public HloInstruction {
+ public:
+  explicit HloSliceInstruction(const Shape& shape, HloInstruction* operand,
+                               tensorflow::gtl::ArraySlice<int64> start_indices,
+                               tensorflow::gtl::ArraySlice<int64> limit_indices,
+                               tensorflow::gtl::ArraySlice<int64> strides);
+
+  HloInstructionProto ToProto() const override;
+
+  // Returns the start index in the given dimension for a slice node.
+  int64 slice_starts(int64 dimension) const { return slice_starts_[dimension]; }
+  const std::vector<int64>& slice_starts() const { return slice_starts_; }
+
+  // Returns the (exclusive) limit index in the given dimension for a slice
+  // node.
+  int64 slice_limits(int64 dimension) const { return slice_limits_[dimension]; }
+  const std::vector<int64>& slice_limits() const { return slice_limits_; }
+
+  // Returns the stride in the given dimension for a slice node.
+  int64 slice_strides(int64 dimension) const {
+    return slice_strides_[dimension];
+  }
+  const std::vector<int64>& slice_strides() const { return slice_strides_; }
+
+  // Returns the flag that describes whether a slice must be lowered into an
+  // offset into the original operand.
+  bool IsInPlaceSlice() const { return is_in_place_slice_; }
+
+  // Sets and returns the flag that describes whether a slice must be lowered
+  // into an offset into the original operand.
+  bool SetIsInPlaceSlice(bool value) {
+    is_in_place_slice_ = value;
+    return value;
+  }
+
+ private:
+  std::vector<string> ExtraAttributesToStringImpl(
+      const HloPrintOptions& options) const override;
+  bool IdenticalSlowPath(
+      const HloInstruction& other,
+      const std::function<bool(const HloComputation*, const HloComputation*)>&
+          eq_computations) const override;
+  // Implementation for non-common logic of CloneWithNewOperands.
+  std::unique_ptr<HloInstruction> CloneWithNewOperandsImpl(
+      const Shape& shape,
+      tensorflow::gtl::ArraySlice<HloInstruction*> new_operands,
+      HloCloneContext* context) const override;
+
+  // Describes the per-dimension [start, limit) index range and stride.
+  std::vector<int64> slice_starts_;
+  std::vector<int64> slice_limits_;
+  std::vector<int64> slice_strides_;
+
+  // Describes whether the slice can be lowered to an offset into the operand.
+  bool is_in_place_slice_ = false;
+};
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_INSTRUCTIONS_H_
-- 
GitLab


From 426ea5b2c229f31ec8e0df4c474f464fc764c365 Mon Sep 17 00:00:00 2001
From: Billy Lamberta <blamb@google.com>
Date: Fri, 8 Jun 2018 15:47:19 -0700
Subject: [PATCH 513/610] Copy edits to Keras guide, formatting, moving some
 things around. Make the right TOC nav more useful.

PiperOrigin-RevId: 199863216
---
 .../docs_src/programmers_guide/keras.md       | 870 ++++++++----------
 1 file changed, 389 insertions(+), 481 deletions(-)

diff --git a/tensorflow/docs_src/programmers_guide/keras.md b/tensorflow/docs_src/programmers_guide/keras.md
index 6a9df12a25..c6aca7ebf4 100644
--- a/tensorflow/docs_src/programmers_guide/keras.md
+++ b/tensorflow/docs_src/programmers_guide/keras.md
@@ -1,334 +1,304 @@
 # Keras
 
-## What's Keras?
-
-Keras is a high-level API specification for building and training deep learning
-models, suitable for fast prototyping, advanced research, and production.
-It offers three key advantages:
-
-- **User friendliness.** Keras follows best practices for reducing
-    cognitive load: it offers consistent & simple interfaces,
-    it minimizes the number of user actions required for common use cases,
-    and it provides clear and actionable feedback upon user error.
-- **Modularity and composability.** A Keras model is composed of
-    fully-configurable building blocks that can be plugged together
-    with as few restrictions as possible -- like Lego bricks.
-- **Easy extensibility.** You can easily write your own building blocks
-    (such as new layers, new loss functions, new models where you write
-    the forward pass from scratch). This allows for total expressiveness,
-    making Keras suitable for advanced research.
-
-
-## What's tf.keras?
-
-`tf.keras` is TensorFlow's implementation of the Keras API specification, that
-serves as the TensorFlow high-level API: it's how you build models in TensorFlow.
-`tf.keras` seamlessly integrates with the rest of the TensorFlow API
-(such as `tf.data` input pipelines), bringing you the full power and flexibility
-of TensorFlow through an easy-to-use interface.
-
-You can import `tf.keras` via:
+Keras is a high-level API to build and train deep learning models. It's used for
+fast prototyping, advanced research, and production, with three key advantages:
+
+- *User friendly*<br>
+  Keras has a simple, consistent interface optimized for common use cases. It
+  provides clear and actionable feedback for user errors.
+- *Modular and composable*<br>
+  Keras models are made by connecting configurable building blocks together,
+  with few restrictions.
+- *Easy to extend*<br> Write custom building blocks to express new ideas for
+  research. Create new layers, loss functions, and develop state-of-the-art
+  models.
+
+## Import tf.keras
+
+`tf.keras` is TensorFlow's implementation of the
+[Keras API specification](https://keras.io){:.external}. This is a high-level
+API to build and train models that includes first-class support for
+TensorFlow-specific functionality, such as [eager execution](#eager_execution),
+`tf.data` pipelines, and [Estimators](/programmers_guide/estimators).
+`tf.keras` makes TensorFlow easier to use without sacrificing flexibility and
+performance.
+
+To get started, import `tf.keras` as part of your TensorFlow program setup:
 
 ```python
+import tensorflow as tf
 from tensorflow import keras
 ```
 
-What follows is a quick introduction to the basics of `tf.keras`.
+`tf.keras` can run any Keras-compatible code, but keep in mind:
 
+* The `tf.keras` version in the latest TensorFlow release might not be the same
+  as the latest `keras` version from PyPI. Check `tf.keras.__version__`.
+* When [saving a model's weights](#weights_only), `tf.keras` defaults to the
+  [checkpoint format](/get_started/checkpoints). Pass `save_format='h5'` to use
+  HDF5.
 
-## Table of contents
+## Build a simple model
 
-- [Getting started: the Sequential model](#getting-started-the-sequential-model)
-- [Configuring layers](#configuring-layers)
-- [Configuring training](#configuring-training)
-- [Training and evaluation](#training-and-evaluation)
-- [Building advanced models: the functional API](#building-advanced-models-the-functional-api)
-- [Building fully-customizable research models: the Model subclassing API](#building-fully-customizable-research-models-the-model-subclassing-api)
-- [Callbacks](#callbacks)
-- [Saving and serialization](#saving-and-serialization)
-- [Developing custom layers](#developing-custom-layers)
-- [Eager execution](#eager-execution)
-- [Further reading](#further-reading)
-- [FAQ](#faq)
+### Sequential model
 
+In Keras, you assemble *layers* to build *models*. A model is (usually) a graph
+of layers. The most common type of model is a stack of layers: the
+`tf.keras.Sequential` model.
 
----
-
-## Getting started: the Sequential model
-
-In `tf.keras`, you're assembling together **layers** to build **models**.
-A model is generally a graph of layers.
-The most common type of model is just a stack of layers: the `Sequential` class.
-
-Here's how to build a simple fully-connected network (multi-layer perceptron):
+To build a simple, fully-connected network (i.e. multi-layer perceptron):
 
 ```python
-from tensorflow import keras
-from tensorflow.keras import layers
-
 model = keras.Sequential()
-# This adds to the model a densely-connected layer with 64 units:
-model.add(Dense(64, activation='relu'))
-# Another one:
-model.add(Dense(64, activation='relu'))
-# This adds a softmax layer with 10 output units:
-model.add(Dense(10, activation='softmax'))
+# Adds a densely-connected layer with 64 units to the model:
+model.add(keras.layers.Dense(64, activation='relu'))
+# Add another:
+model.add(keras.layers.Dense(64, activation='relu'))
+# Add a softmax layer with 10 output units:
+model.add(keras.layers.Dense(10, activation='softmax'))
 ```
 
----
-
-## Configuring layers
-
-Each layer may have unique constructor arguments, but some common arguments include:
+### Configure the layers
 
-- `activation`: the activation function to be used.
-    It could be specified by name, as a string (for built-in functions)
-    or as a callable object. By default, no activation is applied.
-- `kernel_initializer` and `bias_initializer`: the initialization schemes to use
-    to create the layer's weights (kernel and bias).
-    Likewise, they may be passed either by name or by specifying a callable.
-    By default, the "Glorot uniform" initializer is used.
-- `kernel_regularizer` and `bias_regularizer`: the regularization schemes to
-    apply to the layer's weights (kernel and bias), such as L1
-    or L2 regularization. By default, no regularization is applied.
+There are many `tf.keras.layers` available with some common constructor
+parameters:
 
+* `activation`: Set the activation function for the layer. This parameter is
+  specified by the name of a built-in function or as a callable object. By
+  default, no activation is applied.
+* `kernel_initializer` and `bias_initializer`: The initialization schemes
+  that create the layer's weights (kernel and bias). This parameter is a name or
+  a callable object. This defaults to the `"Glorot uniform"` initializer.
+* `kernel_regularizer` and `bias_regularizer`: The regularization schemes
+  that apply to the layer's weights (kernel and bias), such as L1 or L2
+  regularization. By default, no regularization is applied.
 
-### Examples
+The following instantiates `tf.keras.layers.Dense` layers using constructor
+arguments:
 
 ```python
-import tensorflow as tf
-from tensorflow.keras.layers import Dense
-from tensorflow.keras import regularizers
-from tensorflow.keras import initializers
-
-# A sigmoid layer:
-Dense(64, activation='sigmoid')
-# Another way to define the same sigmoid layer:
-Dense(64, activation=tf.sigmoid)
-
-# A linear layer with L1 regularization of factor 0.01
-# applied to the kernel matrix:
-Dense(64, kernel_regularizer=regularizers.l1(0.01))
-# A linear layer with L2 regularization of factor 0.01
-# applied to the bias vector:
-Dense(64, bias_regularizer=regularizers.l2(0.01))
+# Create a sigmoid layer:
+keras.layers.Dense(64, activation='sigmoid')
+# Or:
+keras.layers.Dense(64, activation=tf.sigmoid)
+
+# A linear layer with L1 regularization of factor 0.01 applied to the kernel matrix:
+keras.layers.Dense(64, kernel_regularizer=keras.regularizers.l1(0.01))
+# A linear layer with L2 regularization of factor 0.01 applied to the bias vector:
+keras.layers.Dense(64, bias_regularizer=keras.regularizers.l2(0.01))
 
 # A linear layer with a kernel initialized to a random orthogonal matrix:
-Dense(64, kernel_initializer='orthogonal')
+keras.layers.Dense(64, kernel_initializer='orthogonal')
 # A linear layer with a bias vector initialized to 2.0s:
-Dense(64, bias_initializer=initializers.constant(2.0))
+keras.layers.Dense(64, bias_initializer=keras.initializers.constant(2.0))
 ```
 
----
+## Train and evaluate
 
-## Configuring training
+### Set up training
 
-Once your model looks good, configure its learning process by calling `compile`:
+After the model is constructed, configure its learning process by calling the
+`compile` method:
 
 ```python
-import tensorflow as tf
-
 model.compile(optimizer=tf.train.AdamOptimizer(0.001),
               loss='categorical_crossentropy',
               metrics=['accuracy'])
 ```
 
-There are three key arguments that you need to specify:
+`tf.keras.Model.compile` takes three important arguments:
 
-- An `optimizer`: this object specifies the training procedure.
-    We recommend that you pass instances of optimizers from the `tf.train` module
-    (such as [`AdamOptimizer`](https://www.tensorflow.org/api_docs/python/tf/train/AdamOptimizer),
-    [`RMSPropOptimizer`](https://www.tensorflow.org/api_docs/python/tf/train/RMSPropOptimizer),
-    or [`GradientDescentOptimizer`](https://www.tensorflow.org/api_docs/python/tf/train/GradientDescentOptimizer)).
-- A `loss` function to minimize: this specifies the optimization objective.
-    Common choices include mean square error (`mse`), `categorical_crossentropy`
-    and `binary_crossentropy`. Loss functions may be specified by name
-    or by passing a callable (e.g. from the `tf.keras.losses` module).
-- Some `metrics` to monitor during training: again, you can pass these as either
-    string names or callables (e.g. from the `tf.keras.metrics` module).
+* `optimizer`: This object specifies the training procedure. Pass it optimizer
+  instances from the `tf.train` module, such as
+  [`AdamOptimizer`](/api_docs/python/tf/train/AdamOptimizer),
+  [`RMSPropOptimizer`](/api_docs/python/tf/train/RMSPropOptimizer), or
+  [`GradientDescentOptimizer`](/api_docs/python/tf/train/GradientDescentOptimizer).
+* `loss`: The function to minimize during optimization. Common choices include
+  mean square error (`mse`), `categorical_crossentropy`, and
+  `binary_crossentropy`. Loss functions are specified by name or by
+  passing a callable object from the `tf.keras.losses` module.
+* `metrics`: Used to monitor training. These are string names or callables from
+  the `tf.keras.metrics` module.
 
-
-### Examples
+The following shows a few examples of configuring a model for training:
 
 ```python
-# Configures a model to do mean-squared error regression.
+# Configure a model for mean-squared error regression.
 model.compile(optimizer=tf.train.AdamOptimizer(0.01),
-              loss='mse',  # mean squared error
+              loss='mse',       # mean squared error
               metrics=['mae'])  # mean absolute error
-```
-```python
-# Configures a model to do categorical classification.
+
+# Configure a model for categorical classification.
 model.compile(optimizer=tf.train.RMSPropOptimizer(0.01),
-              loss=tf.keras.losses.categorical_crossentropy,
-              metrics=[tf.keras.metrics.categorical_accuracy])
+              loss=keras.losses.categorical_crossentropy,
+              metrics=[keras.metrics.categorical_accuracy])
 ```
 
----
-
-## Training and evaluation
+### Input NumPy data
 
-### From Numpy data
-
-When running locally on small datasets, the easiest way to do training and
-evaluation is to pass data to your model as Numpy arrays of inputs and targets.
-You can "fit" your model to some training data using the `model.fit()` method:
+For small datasets, use in-memory [NumPy](https://www.numpy.org/){:.external}
+arrays to train and evaluate a model. The model is "fit" to the training data
+using the `fit` method:
 
 ```python
 import numpy as np
 
-data = np.random.random(shape=(1000, 32))
-targets = np.random.random(shape=(1000, 10))
+data = np.random.random((1000, 32))
+labels = np.random.random((1000, 10))
 
-model.fit(data, targets, epochs=10, batch_size=32)
+model.fit(data, labels, epochs=10, batch_size=32)
 ```
 
-Here are some key arguments you can pass to the `fit` method:
-
-- `epochs`: Training is structured into **epochs**. An epoch is one iteration
-    over the entire input data (which is done in smaller batches).
-- `batch_size`: when passing Numpy data, the model will slice the data into
-    smaller batches and iterate over these batches during training.
-    This integer specifies the size of each batch
-    (the last batch may be smaller if the total number of samples is not
-    divisible by the batch size).
-- `validation_data`: when prototyping a model, you want to be able to quickly
-    monitor its performance on some validation data.
-    When you pass this argument (it expects a tuple of inputs and targets),
-    the model will display the loss and metrics in inference mode on the data
-    you passed, at the end of each epoch.
+`tf.keras.Model.fit` takes three important arguments:
+
+* `epochs`: Training is structured into *epochs*. An epoch is one iteration over
+  the entire input data (this is done in smaller batches).
+* `batch_size`: When passed NumPy data, the model slices the data into smaller
+  batches and iterates over these batches during training. This integer
+  specifies the size of each batch. Be aware that the last batch may be smaller
+  if the total number of samples is not divisible by the batch size.
+* `validation_data`: When prototyping a model, you want to easily monitor its
+  performance on some validation data. Passing this argument—a tuple of inputs
+  and labels—allows the model to display the loss and metrics in inference mode
+  for the passed data, at the end of each epoch.
 
 Here's an example using `validation_data`:
 
 ```python
 import numpy as np
 
-data = np.random.random(shape=(1000, 32))
-targets = np.random.random(shape=(1000, 10))
+data = np.random.random((1000, 32))
+labels = np.random.random((1000, 10))
 
-val_data = np.random.random(shape=(100, 32))
-val_targets = np.random.random(shape=(100, 10))
+val_data = np.random.random((100, 32))
+val_labels = np.random.random((100, 10))
 
-model.fit(data, targets, epochs=10, batch_size=32,
-          validation_data=(val_data, val_targets))
+model.fit(data, labels, epochs=10, batch_size=32,
+          validation_data=(val_data, val_labels))
 ```
 
-### From tf.data datasets
+### Input tf.data datasets
 
-When you need to scale to large datasets or multi-device training,
-training from Numpy arrays in memory will not be ideal.
-In such cases, you should use [the `tf.data` API](https://www.tensorflow.org/programmers_guide/datasets).
-You can pass a `tf.data.Dataset` instance to the `fit` method:
+Use the [Datasets API](/programmers_guide/datasets) to scale to large datasets
+or multi-device training. Pass a `tf.data.Dataset` instance to the `fit`
+method:
 
 ```python
-import tensorflow as tf
-
 # Instantiates a toy dataset instance:
-dataset = tf.data.Dataset.from_tensor_slices((data, targets)).batch(32)
+dataset = tf.data.Dataset.from_tensor_slices((data, labels))
+dataset = dataset.batch(32)
+dataset = dataset.repeat()
 
 # Don't forget to specify `steps_per_epoch` when calling `fit` on a dataset.
 model.fit(dataset, epochs=10, steps_per_epoch=30)
 ```
 
-When doing so, the dataset itself will yield batches of data,
-so the model does not need to be passed `batch_size` information.
-Instead, the model needs to know for how many steps (or batches of data)
-it should run at each epoch.
-You specify this with the `steps_per_epoch` argument: it's the number of
-training steps the model will run before moving on the next epoch.
+Here, the `fit` method uses the `steps_per_epoch` argument—this is the number of
+training steps the model runs before it moves to the next epoch. Since the
+`Dataset` yields batches of data, this snippet does not require a `batch_size`.
 
-You can also pass datasets for validation:
+Datasets can also be used for validation:
 
 ```python
-dataset = tf.data.Dataset.from_tensor_slices((data, targets)).batch(32)
-val_dataset = tf.data.Dataset.from_tensor_slices((val_data, val_targets)).batch(32)
+dataset = tf.data.Dataset.from_tensor_slices((data, labels))
+dataset = dataset.batch(32).repeat()
 
-model.fit(dataset, epochs=10, steps_per_epoch=30, validation_data=val_dataset, validation_steps=3)
+val_dataset = tf.data.Dataset.from_tensor_slices((val_data, val_labels))
+val_dataset = val_dataset.batch(32).repeat()
+
+model.fit(dataset, epochs=10, steps_per_epoch=30,
+          validation_data=val_dataset,
+          validation_steps=3)
 ```
 
 ### Evaluate and predict
 
-In addition, you get access to the following methods
-(both with Numpy data and dataset instances):
+The `tf.keras.Model.evaluate` and `tf.keras.Model.predict` methods can use NumPy
+data and a `tf.data.Dataset`.
 
-- `model.evaluate(x, y, batch_size=32)` or `model.evaluate(dataset, steps=30)`
-    will return the inference-mode loss and metrics for the data provided.
-- `model.predict(x, y, batch_size=32)` or `model.predict(dataset, steps=30)`
-    will return the output(s) of the last layer(s) in inference on the data
-    provided, as Numpy array(s).
+To *evaluate* the inference-mode loss and metrics for the data provided:
 
----
+```python
+model.evaluate(x, y, batch_size=32)
 
-## Building advanced models: the functional API
+model.evaluate(dataset, steps=30)
+```
 
-The `Sequential` model cannot represent arbitrary models -- only simple stacks
-of layers. If you need to use more complex model topologies,
-such as multi-input models, multi-output models,
-models with a same layer called several times (shared layers),
-or models with non-sequential data flows (e.g. residual connections),
-you can use the 'functional API'.
+And to *predict* the output of the last layer in inference for the data provided,
+as a NumPy array:
 
-Here's how it works:
+```python
+model.predict(x, batch_size=32)
 
-- A layer instance is callable (on a tensor), and it returns a tensor.
-- Input tensor(s) and output tensor(s) can then be used to define a `Model` instance.
-- Such a model can be trained just like the `Sequential` model.
+model.predict(dataset, steps=30)
+```
 
-Here's a basic example showing the same model we previously defined,
-built using the functional API:
 
+## Build advanced models
 
-```python
-from tensorflow import keras
-from tensorflow.keras import layers
+### Functional API
 
-# This returns a placeholder tensor:
-inputs = keras.Input(shape=(784,))
+The `tf.keras.Sequential` model is a simple stack of layers that cannot
+represent arbitrary models. Use the
+[Keras functional API](https://keras.io/getting-started/functional-api-guide/){:.external}
+to build complex model topologies such as:
+
+* Multi-input models,
+* Multi-output models,
+* Models with shared layers (the same layer called several times),
+* Models with non-sequential data flows (e.g. residual connections).
+
+Building a model with the functional API works like this:
+
+1. A layer instance is callable and returns a tensor.
+2. Input tensors and output tensors are used to define a `tf.keras.Model`
+   instance.
+3. This model is trained just like the `Sequential` model.
+
+The following example uses the functional API to build a simple, fully-connected
+network:
+
+```python
+inputs = keras.Input(shape=(32,))  # Returns a placeholder tensor
 
 # A layer instance is callable on a tensor, and returns a tensor.
-x = layers.Dense(64, activation='relu')(inputs)
-x = layers.Dense(64, activation='relu')(x)
-predictions = layers.Dense(10, activation='softmax')(x)
+x = keras.layers.Dense(64, activation='relu')(inputs)
+x = keras.layers.Dense(64, activation='relu')(x)
+predictions = keras.layers.Dense(10, activation='softmax')(x)
 
-# Instantiates the model given inputs and outputs.
+# Instantiate the model given inputs and outputs.
 model = keras.Model(inputs=inputs, outputs=predictions)
 
-# The "compile" step specifies the training configuration.
-model.compile(optimizer='rmsprop',
+# The compile step specifies the training configuration.
+model.compile(optimizer=tf.train.RMSPropOptimizer(0.001),
               loss='categorical_crossentropy',
               metrics=['accuracy'])
 
-# Trains for 5 epochs.
+# Trains for 5 epochs
 model.fit(data, labels, batch_size=32, epochs=5)
 ```
 
-This API enables you to create models with multiple inputs and outputs,
-and to "share" layers across different inputs
-(i.e. to reuse a same instance multiple times).
-For examples of these use cases,
-please see [this guide to the functional API in Keras](https://keras.io/getting-started/functional-api-guide/).
+### Model subclassing
 
----
+Build a fully-customizable model by subclassing `tf.keras.Model` and defining
+your own forward pass. Create layers in the `__init__` method and set them as
+attributes of the class instance. Define the forward pass in the `call` method.
 
-## Building fully-customizable research models: the Model subclassing API
+Model subclassing is particularly useful when
+[eager execution](/programmers_guide/eager) is enabled since the forward pass
+can be written imperatively.
 
-Besides `Sequential` and the functional API, one last, more flexible way to
-define models is to directly subclass the `Model` class and define your own
-forward pass manually.
+Key Point: Use the right API for the job. While model subclassing offers
+flexibility, it comes at a cost of greater complexity and more opportunities for
+user errors. If possible, prefer the functional API.
 
-In this API, you instante layers in `__init__` and set them as attribute of the
-class instance. Then you specify the forward pass in `call`.
-This API is particularly valuable when using TensorFlow with [eager execution](https://www.tensorflow.org/programmers_guide/eager),
-since eager execution allows you to write your forward pass in an
-imperative fashion (as if you were writing Numpy code, for instance).
+The following example shows a subclassed `tf.keras.Model` using a custom forward
+pass:
 
 ```python
-import tensorflow as tf
-from tensorflow import keras
-
-
 class MyModel(keras.Model):
 
-  def __init__(self, num_classes=2):
+  def __init__(self, num_classes=10):
     super(MyModel, self).__init__(name='my_model')
     self.num_classes = num_classes
     # Define your layers here.
@@ -351,10 +321,10 @@ class MyModel(keras.Model):
 
 
 # Instantiates the subclassed model.
-model = MyModel(num_classes=2)
+model = MyModel(num_classes=10)
 
-# The "compile" step specifies the training configuration.
-model.compile(optimizer='rmsprop',
+# The compile step specifies the training configuration.
+model.compile(optimizer=tf.train.RMSPropOptimizer(0.001),
               loss='categorical_crossentropy',
               metrics=['accuracy'])
 
@@ -362,353 +332,291 @@ model.compile(optimizer='rmsprop',
 model.fit(data, labels, batch_size=32, epochs=5)
 ```
 
-**Remember:** use the right API for the right job.
-Using the `Model` subclassing API offers more flexibility,
-but at the cost of greater complexity and a larger potential user error surface.
-Prefer using the functional API when possible.
 
----
+### Custom layers
 
-## Callbacks
+Create a custom layer by subclassing `tf.keras.layers.Layer` and implementing
+the following methods:
 
-Callbacks are objects that you can pass to your model that customize and extend
-its behavior during training.
-There are callbacks for saving checkpoints of your model at regular intervals
-(`tf.keras.callbacks.ModelCheckpoint`),
-to dynamically change the learning rate (`tf.keras.callbacks.LearningRateScheduler`)
-or to interrupt training when validation performance has stopped improving
-(`tf.keras.callbacks.EarlyStopping`).
-You can also use a callback to monitor your model's behavior using
-[TensorBoard](https://www.tensorflow.org/programmers_guide/summaries_and_tensorboard)
-(`tf.keras.callbacks.TensorBoard`).
-You can also write your own custom callbacks.
-
-Different built-in callback are found in `tf.keras.callbacks`.
-You use them by passing a `Callback` instance to `fit`:
+* `build`: Create the weights of the layer. Add weights with the `add_weight`
+  method.
+* `call`: Define the forward pass.
+* `compute_output_shape`: Specify how to compute the output shape of the layer
+  given the input shape.
+* Optionally, a layer can be serialized by implementing the `get_config` method
+  and the `from_config` class method.
+
+Here's an example of a custom layer that implements a `matmul` of an input with
+a kernel matrix:
 
 ```python
-from tensorflow import keras
+class MyLayer(keras.layers.Layer):
+
+  def __init__(self, output_dim, **kwargs):
+    self.output_dim = output_dim
+    super(MyLayer, self).__init__(**kwargs)
+
+  def build(self, input_shape):
+    shape = tf.TensorShape((input_shape[1], self.output_dim))
+    # Create a trainable weight variable for this layer.
+    self.kernel = self.add_weight(name='kernel',
+                                  shape=shape,
+                                  initializer='uniform',
+                                  trainable=True)
+    # Be sure to call this at the end
+    super(MyLayer, self).build(input_shape)
 
-callbacks = [
-    # Interrupt training if `val_loss` stops improving for over 2 epochs
-    keras.callbacks.EarlyStopping(patience=2, monitor='val_loss'),
-    # Write TensorBoard logs to `./logs` directory
-    keras.callbacks.TensorBoard(log_dir='./logs')
-]
-model.fit(data, labels, batch_size=32, epochs=5, callbacks=callbacks)
-```
+  def call(self, inputs):
+    return tf.matmul(inputs, self.kernel)
 
----
+  def compute_output_shape(self, input_shape):
+    shape = tf.TensorShape(input_shape).as_list()
+    shape[-1] = self.output_dim
+    return tf.TensorShape(shape)
 
-## Saving and serialization
+  def get_config(self):
+    base_config = super(MyLayer, self).get_config()
+    base_config['output_dim'] = self.output_dim
 
-### Weights-only saving
+  @classmethod
+  def from_config(cls, config):
+    return cls(**config)
 
-You can save the weight values of a model via `model.save_weights(filepath)`:
 
-```python
-# Saves weights to a SavedModel file.
-model.save_weights('my_model')
+# Create a model using the custom layer
+model = keras.Sequential([MyLayer(10),
+                          keras.layers.Activation('softmax')])
 
-# Restores the model's state
-# (this requires a model that has the same architecture).
-model.load_weights('my_model')
+# The compile step specifies the training configuration
+model.compile(optimizer=tf.train.RMSPropOptimizer(0.001),
+              loss='categorical_crossentropy',
+              metrics=['accuracy'])
+
+# Trains for 5 epochs.
+model.fit(data, targets, batch_size=32, epochs=5)
 ```
 
-By default, this saves the weight in the TensorFlow
-[`SavedModel`](https://www.tensorflow.org/programmers_guide/saved_model) format.
-You could also save them in the Keras HDF5 format
-(which is the default in the multi-backend implementation of Keras):
 
-```python
-# Saves weights to a HDF5 file.
-model.save_weights('my_model.h5', format='h5')
+## Callbacks
 
-# Restores the model's state.
-model.load_weights('my_model.h5')
-```
+A callback is an object passed to a model to customize and extend its behavior
+during training. You can write your own custom callback, or use the built-in
+`tf.keras.callbacks` that include:
 
-### Configuration-only saving (serialization)
+* `tf.keras.callbacks.ModelCheckpoint`: Save checkpoints of your model at
+  regular intervals.
+* `tf.keras.callbacks.LearningRateScheduler`: Dynamically change the learning
+  rate.
+* `tf.keras.callbacks.EarlyStopping`: Interrupt training when validation
+  performance has stopped improving.
+* `tf.keras.callbacks.TensorBoard`: Monitor the model's behavior using
+  [TensorBoard](/programmers_guide/summaries_and_tensorboard).
 
-You can also save the model's configuration
-(its architecture, without any weight values),
-which allows you to recreate the same model later (freshly initialized) even if
-you don't have the code that defined it anymore.
-Two possible serialization formats are JSON and YAML:
+To use a `tf.keras.callbacks.Callback`, pass it to the model's `fit` method:
 
 ```python
-from tensorflow.keras import models
-
-# Serializes a model to JSON.
-json_string = model.to_json()
-# Recreates the model (freshly initialized).
-fresh_model = models.from_json(json_string)
-
-# Serializes a model to YAML.
-yaml_string = model.to_yaml()
-# Recreates the model.
-fresh_model = models.from_yaml(yaml_string)
+callbacks = [
+  # Interrupt training if `val_loss` stops improving for over 2 epochs
+  keras.callbacks.EarlyStopping(patience=2, monitor='val_loss'),
+  # Write TensorBoard logs to `./logs` directory
+  keras.callbacks.TensorBoard(log_dir='./logs')
+]
+model.fit(data, labels, batch_size=32, epochs=5, callbacks=callbacks,
+          validation_data=(val_data, val_targets))
 ```
 
-Note that this feature is not available with subclassed models,
-because they are simply not serializable:
-their architecture is defined as Python code
-(the body of the `call` method of the model).
 
-### Whole-model saving
+## Save and restore
 
-Finally, you can also save a model wholesale, to a file that will contain both
-the weight values, the model's configuration,
-and even the optimizer's configuration.
-The allows you to checkpoint a model and resume training later --
-from the exact same state -- even if you don't have access to the original code.
+### Weights only
 
-```python
-from tensorflow.keras import models
+Save and load the weights of a model using `tf.keras.Model.save_weights`:
 
-model.save('my_model.h5')
+```python
+# Save weights to a TensorFlow Checkpoint file
+model.save_weights('./my_model')
 
-# Recreates the exact same model, complete with weights and optimizer.
-model = models.load_model('my_model.h5')
+# Restore the model's state,
+# this requires a model with the same architecture.
+model.load_weights('my_model')
 ```
 
----
-
-## Developing custom layers
-
-You can write your own custom layers by subclassing the class
-`tf.keras.layers.Layer`. You will need to implement the following three methods:
-
-- `build`: Creates the weights of the layer.
-    Weights should be added via the `add_weight` method.
-- `call`: Specifies the forward pass.
-- `compute_output_shape`: Specifies how to compute the output shape of the layer 
-    given the input shape.
-
-Optionally, you may also implement the method `get_config()` and the
-class method `from_config()` if you want your layer to be serializable.
-
-Here's a simple example of a custom layer that implements a `matmul`
-of an input with a kernel matrix:
+By default, this saves the model's weights in the
+[TensorFlow checkpoint](/get_started/checkpoints) file format. Weights can also
+be saved to the Keras HDF5 format (the default for the multi-backend
+implementation of Keras):
 
 ```python
-import tensorflow as tf
-from tensorflow.keras import layers
-
-class MyLayer(layers.Layer):
-
-    def __init__(self, output_dim, **kwargs):
-        self.output_dim = output_dim
-        super(MyLayer, self).__init__(**kwargs)
-
-    def build(self, input_shape):
-        # Create a trainable weight variable for this layer.
-        self.kernel = self.add_weight(name='kernel', 
-                                      shape=(input_shape[1], self.output_dim),
-                                      initializer='uniform',
-                                      trainable=True)
-        # Be sure to call this at the end
-        super(MyLayer, self).build(input_shape)
-
-    def call(self, inputs):
-        return tf.matmul(inputs, self.kernel)
-
-    def compute_output_shape(self, input_shape):
-        shape = tf.TensorShape(input_shape).as_list()
-        shape[-1] = self.output_dim
-        return tf.TensorShape(shape)
-
-    def get_config(self):
-        base_config = super(MyLayer, self).get_config()
-        base_config['output_dim'] = self.output_dim
-
-    @classmethod
-    def from_config(cls, config):
-        return cls(**config)
-```
+# Save weights to a HDF5 file
+model.save_weights('my_model.h5', save_format='h5')
 
----
-
-## Eager execution
+# Restore the model's state
+model.load_weights('my_model.h5')
+```
 
-[Eager execution](https://www.tensorflow.org/programmers_guide/eager)
-is a way to write TensorFlow code imperatively.
 
-All three `tf.keras` model-building APIs
-(`Sequential`, the functional API `Model(inputs, outputs)`,
-and the subclassing API `MyModel(Model)`) are compatible with eager execution.
-When using `Sequential` or the functional API, it makes no difference to the
-user experience whether the model is executing eagerly or not.
-Eager execution is most beneficial when used with the `Model` subclassing API,
-or when prototyping a custom layer -- that is to say, in APIs that require you
-to *write a forward pass as code*, rather than in APIs that allow you to create
-models by assembling together existing layers.
+### Configuration only
 
-While the same training and evaluating APIs presented in this guide work
-as usual with eager execution, you can in addition
-write custom training loops using the eager `GradientTape`
-and define-by-run autodifferentiation:
+A model's configuration can be saved—this serializes the model architecture
+without any weights. A saved configuration can recreate and initialize the same
+model, even without the code that defined the original model. Keras supports
+JSON and YAML serialization formats:
 
 ```python
-import tensorflow as tf
-from tensorflow.contrib import eager as tfe
-
-# This call begins the eager execution session.
-tf.enable_eager_execution()
-
-model = ...  # Defines a Keras model (we recommend Model subclassing in this case).
-dataset = ...  # Defines a `tf.data` dataset.
+# Serialize a model to JSON format
+json_string = model.to_json()
 
-optimizer = tf.train.AdamOptimizer(0.01)
+# Recreate the model (freshly initialized)
+fresh_model = keras.models.from_json(json_string)
 
-for data, labels in dataset:
-    # Runs the forward pass and loss computation under a `GradientTape` scope,
-    # which will record all operations in order to prepare for the backward pass.
-    with tfe.GradientTape() as tape:
-      predictions = model(data)
-      loss = loss_function(labels, predictions)
+# Serialize a model to YAML format
+yaml_string = model.to_yaml()
 
-    # Runs the backward pass manually using the operations recorded
-    # by the gradient tape.
-    grads = tape.gradient(loss, model.trainable_weights)
-    optimizer.apply_gradients(zip(grads, model.trainable_weights),
-                              global_step=tf.train.get_or_create_global_step())
+# Recreate the model
+fresh_model = keras.models.from_yaml(yaml_string)
 ```
 
----
+Caution: Subclassed models are not serializable because their architecture is
+defined by the Python code in the body of the `call` method.
 
-## Further reading
 
-### Documentation
+### Entire model
 
-- [tf.keras documentation](https://www.tensorflow.org/api_docs/python/tf/keras)
-- [keras.io](https://keras.io/)
+The entire model can be saved to a file that contains the weight values, the
+model's configuration, and even the optimizer's configuration. This allows you
+to checkpoint a model and resume training later—from the exact same
+state—without access to the original code.
 
-### tf.keras tutorials and examples
-
-- [Fashion-MNIST with tf.Keras](https://medium.com/tensorflow/hello-deep-learning-fashion-mnist-with-keras-50fcff8cd74a)
-- [Predicting the price of wine with the Keras Functional API and TensorFlow](
-    https://medium.com/tensorflow/predicting-the-price-of-wine-with-the-keras-functional-api-and-tensorflow-a95d1c2c1b03)
+```python
+# Create a trivial model
+model = keras.Sequential([
+  keras.layers.Dense(10, activation='softmax', input_shape=(32,)),
+  keras.layers.Dense(10, activation='softmax')
+])
+model.compile(optimizer='rmsprop',
+              loss='categorical_crossentropy',
+              metrics=['accuracy'])
+model.fit(data, targets, batch_size=32, epochs=5)
 
 
----
+# Save entire model to a HDF5 file
+model.save('my_model.h5')
 
-## FAQ
+# Recreate the exact same model, including weights and optimizer.
+model = keras.models.load_model('my_model.h5')
+```
 
-### What are the differences between tf.keras and the multi-backend Keras implementation?
 
-`tf.keras` includes first-class support for important TensorFlow-specific
-functionality not found in other Keras implementations, in particular:
+## Eager execution
 
-- Support for eager execution.
-- Support for the `tf.data` API.
-- Integration with the
-    [`tf.estimator` API](https://www.tensorflow.org/programmers_guide/estimators),
-    via `tf.keras.estimator.model_to_estimator`.
+[Eager execution](/programmers_guide/eager) is an imperative programming
+environment that evaluates operations immediately. This is not required for
+Keras, but is supported by `tf.keras` and useful for inspecting your program and
+debugging.
 
-In terms of API differences: `tf.keras` is a full implementation of the
-Keras API, so any code targeting the Keras API will run on `tf.keras`.
-However, keep in mind that:
+All of the `tf.keras` model-building APIs are compatible with eager execution.
+And while the `Sequential` and functional APIs can be used, eager execution
+especially benefits *model subclassing* and building *custom layers*—the APIs
+that require you to write the forward pass as code (instead of the APIs that
+create models by assembling existing layers).
 
-- The `tf.keras` API version in the latest TensorFlow release might not be the
-    same as the latest `keras` version from PyPI.
-    Check out `tf.keras.__version__` if in doubt.
-- In `tf.keras`, the default file format saved by `model.save_weights` is the
-    TensorFlow `SavedModel` format.
-    To use HDF5, you can pass the `format='h5'` argument.
+See the [eager execution guide](/programmers_guide/eager#build_a_model) for
+examples of using Keras models with custom training loops and `tf.GradientTape`.
 
 
-### What is the relationship between tf.keras and tf.estimator?
+## Distribution
 
-The [`tf.estimator` API](https://www.tensorflow.org/programmers_guide/estimators)
-is a high-level TensorFlow API for training "estimator" models,
-in particular in distributed settings.
-This API targets industry use cases, such as distributed training
-on large datasets with a focus on eventually exporting a production model.
+### Estimators
 
-If you have a `tf.keras` model that would like to train with the `tf.estimator`
-API, you can convert your model to an `Estimator` object via the
-`model_to_estimator` utility](https://www.tensorflow.org/programmers_guide/estimators#creating_estimators_from_keras_models):
+The [Estimators](/programmers_guide/estimators) API is used for training models
+for distributed environments. This targets industry use cases such as
+distributed training on large datasets and exporting a model for production.
 
+A `tf.keras.Model` can be trained with the `tf.estimator` API by converting the
+model to a `tf.estimator.Estimator` object with
+`tf.keras.estimator.model_to_estimator`. See
+[Creating Estimators from Keras models](/programmers_guide/estimators#creating_estimators_from_keras_models).
 
 ```python
-estimator = tf.keras.estimator.model_to_estimator(model)
-```
+model = keras.Sequential([layers.Dense(10,activation='softmax'),
+                          layers.Dense(10,activation='softmax')])
 
-When using `model_to_estimator`, enabling eager execution is helpful for
-developing and debugging your `input_fn`
-(as it allows you to easily print your data).
+model.compile(optimizer=tf.train.RMSPropOptimizer(0.001),
+              loss='categorical_crossentropy',
+              metrics=['accuracy'])
+
+estimator = keras.estimator.model_to_estimator(model)
+```
 
+Note: Enable [eager execution](/programmers_guide/eager) for debugging
+[Estimator input functions](/programmers_guide/premade_estimators#create_input_functions)
+and inspecting data.
 
-### How can I run tf.keras models on multiple GPUs?
+### Multiple GPUs
 
-You can run tf.keras models on multiple GPUs using the
-[`DistributionStrategy API`](https://www.tensorflow.org/versions/master/api_docs/python/tf/contrib/distribute/DistributionStrategy).
-The `DistributionStrategy` API allow you to distribute training on multiple GPUs
-with almost no changes to your existing code.
+`tf.keras` models can run on multiple GPUs using
+`tf.contrib.distribute.DistributionStrategy`. This API provides distributed
+training on multiple GPUs with almost no changes to existing code.
 
-Currently [`MirroredStrategy`](https://www.tensorflow.org/versions/master/api_docs/python/tf/contrib/distribute/MirroredStrategy)
-is the only supported strategy.
-`MirroredStrategy` allows you to do in-graph replication with synchronous
-training using all-reduce on a single machine.
-To use `DistributionStrategy` with a `tf.keras` model,
-you can use the `model_to_estimator` utility to convert a `tf.keras` model to
-an `Estimator` and then train the estimator.
+Currently, `tf.contrib.distribute.MirroredStrategy` is the only supported
+distribution strategy. `MirroredStrategy` does in-graph replication with
+synchronous training using all-reduce on a single machine. To use
+`DistributionStrategy` with Keras, convert the `tf.keras.Model` to a
+`tf.estimator.Estimator` with `tf.keras.estimator.model_to_estimator`, then
+train the estimator.
 
-Here is a simple example of distributing a `tf.keras` model across multiple GPUs
-on a single machine.
+The following example distributes a `tf.keras.Model` across multiple GPUs on a
+single machine.
 
-Let's first define a simple model:
+First, define a simple model:
 
 ```python
-model = tf.keras.Sequential()
-model.add(tf.keras.layers.Dense(16, activation='relu', input_shape=(10,)))
-model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
+model = keras.Sequential()
+model.add(keras.layers.Dense(16, activation='relu', input_shape=(10,)))
+model.add(keras.layers.Dense(1, activation='sigmoid'))
+
 optimizer = tf.train.GradientDescentOptimizer(0.2)
+
 model.compile(loss='binary_crossentropy', optimizer=optimizer)
 model.summary()
 ```
 
-Let's use `model_to_estimator` to create an `Estimator` instance from the
-`tf.keras` model defined above.
+Convert the Keras model to a `tf.estimator.Estimator` instance:
 
 ```python
-keras_estimator = tf.keras.estimator.model_to_estimator(
-    keras_model=model,
-    config=config,
-    model_dir='/tmp/model_dir')
+keras_estimator = keras.estimator.model_to_estimator(
+  keras_model=model,
+  config=config,
+  model_dir='/tmp/model_dir')
 ```
 
-We'll use `tf.data.Datasets` to define our input pipeline.
-Our `input_fn` returns a `tf.data.Dataset` object that we then use to distribute
-the data across multiple devices with each device processing
+Define an *input pipeline*. The `input_fn` returns a `tf.data.Dataset` object
+used to distribute the data across multiple devices—with each device processing
 a slice of the input batch.
 
 ```python
 def input_fn():
-    x = np.random.random((1024, 10))
-    y = np.random.randint(2, size=(1024, 1))
-    x = tf.cast(x, tf.float32)
-    dataset = tf.data.Dataset.from_tensor_slices((x, y))
-    dataset = dataset.repeat(10)
-    dataset = dataset.batch(32)
-    return dataset
+  x = np.random.random((1024, 10))
+  y = np.random.randint(2, size=(1024, 1))
+  x = tf.cast(x, tf.float32)
+  dataset = tf.data.Dataset.from_tensor_slices((x, y))
+  dataset = dataset.repeat(10)
+  dataset = dataset.batch(32)
+  return dataset
 ```
 
-The next step is to create a `RunConfig` and set the train_distribute argument
-to the new `MirroredStrategy` instance.
-You can specify a list of devices or the `num_gpus` argument when creating
-a `MirroredStrategy` instance.
-Not specifying any arguments defaults to using all the available GPUs like we do
-in this example.
+Next, create a `tf.estimator.RunConfig` and set the `train_distribute` argument
+to the `tf.contrib.distribute.MirroredStrategy` instance. When creating
+`MirroredStrategy`, you can specify a list of devices or set the `num_gpus`
+argument. The default uses all available GPUs, like the following:
 
 ```python
 strategy = tf.contrib.distribute.MirroredStrategy()
 config = tf.estimator.RunConfig(train_distribute=strategy)
 ```
 
-Call train on the `Estimator` instance providing the `input_fn` and `steps`
-arguments as input:
+Finally, train the `Estimator` instance by providing the `input_fn` and `steps`
+arguments:
 
 ```python
 keras_estimator.train(input_fn=input_fn, steps=10)
-- 
GitLab


From 6c7e526e74dc3a5ec74cb99395d68a445cb41dbd Mon Sep 17 00:00:00 2001
From: Justin Lebar <jlebar@google.com>
Date: Fri, 8 Jun 2018 16:14:11 -0700
Subject: [PATCH 514/610] [XLA] Add flag to BatchNormExpander pass that lets it
 use explicit broadcasts instead of kMap instructions.

PiperOrigin-RevId: 199867000
---
 .../xla/service/batchnorm_expander.cc         | 111 +++++++++---------
 .../compiler/xla/service/batchnorm_expander.h |   7 +-
 .../compiler/xla/service/cpu/cpu_compiler.cc  |   2 +-
 .../compiler/xla/service/gpu/gpu_compiler.cc  |   2 +-
 4 files changed, 63 insertions(+), 59 deletions(-)

diff --git a/tensorflow/compiler/xla/service/batchnorm_expander.cc b/tensorflow/compiler/xla/service/batchnorm_expander.cc
index 598718c72c..a9f4aead59 100644
--- a/tensorflow/compiler/xla/service/batchnorm_expander.cc
+++ b/tensorflow/compiler/xla/service/batchnorm_expander.cc
@@ -59,7 +59,7 @@ class BatchNormExpanderVisitor : public DfsHloVisitorWithDefault {
   // Runs the visitor on a computation.
   static bool Run(HloComputation* computation, bool rewrite_training_op,
                   bool rewrite_inference_op, bool rewrite_grad_op,
-                  bool use_fusion);
+                  bool use_map_instructions);
 
   // Returns whether any batch norm ops were rewritten.
   const bool changed() const { return changed_; }
@@ -70,12 +70,13 @@ class BatchNormExpanderVisitor : public DfsHloVisitorWithDefault {
   explicit BatchNormExpanderVisitor(HloComputation* computation,
                                     bool rewrite_training_op,
                                     bool rewrite_inference_op,
-                                    bool rewrite_grad_op, bool use_fusion)
+                                    bool rewrite_grad_op,
+                                    bool use_map_instructions)
       : computation_(computation),
         rewrite_training_op_(rewrite_training_op),
         rewrite_inference_op_(rewrite_inference_op),
         rewrite_grad_op_(rewrite_grad_op),
-        use_fusion_(use_fusion) {}
+        use_map_instructions_(use_map_instructions) {}
 
   HloComputation* GetOrCreateScalarAddComputation(
       PrimitiveType primitive_type) {
@@ -122,10 +123,24 @@ class BatchNormExpanderVisitor : public DfsHloVisitorWithDefault {
     return *scalar_rsqrt_computation;
   }
 
-  std::unique_ptr<HloInstruction> Rsqrt(HloInstruction* operand) {
-    return HloInstruction::CreateMap(
-        operand->shape(), {operand},
-        GetOrCreateScalarRsqrtComputation(operand->shape().element_type()));
+  std::unique_ptr<HloInstruction> Rsqrt(
+      HloInstruction* operand,
+      const std::function<HloInstruction*(std::unique_ptr<HloInstruction>)>&
+          add_instruction) {
+    if (use_map_instructions_) {
+      return HloInstruction::CreateMap(
+          operand->shape(), {operand},
+          GetOrCreateScalarRsqrtComputation(operand->shape().element_type()));
+    }
+    HloInstruction* exponent = add_instruction(HloInstruction::CreateBroadcast(
+        operand->shape(),
+        add_instruction(HloInstruction::CreateConvert(
+            ShapeUtil::MakeShape(operand->shape().element_type(), {}),
+            add_instruction(HloInstruction::CreateConstant(
+                Literal::CreateR0<float>(-0.5f))))),
+        {}));
+    return HloInstruction::CreateBinary(operand->shape(), HloOpcode::kPower,
+                                        operand, exponent);
   }
 
   HloComputation* GetOrCreateScalarMeanComputation(PrimitiveType primitive_type,
@@ -152,12 +167,26 @@ class BatchNormExpanderVisitor : public DfsHloVisitorWithDefault {
     return *scalar_mean_computation;
   }
 
-  std::unique_ptr<HloInstruction> Mean(int64 element_count,
-                                       HloInstruction* operand) {
-    return HloInstruction::CreateMap(
-        operand->shape(), {operand},
-        GetOrCreateScalarMeanComputation(operand->shape().element_type(),
-                                         element_count));
+  std::unique_ptr<HloInstruction> Mean(
+      int64 element_count, HloInstruction* operand,
+      const std::function<HloInstruction*(std::unique_ptr<HloInstruction>)>&
+          add_instruction) {
+    if (use_map_instructions_) {
+      return HloInstruction::CreateMap(
+          operand->shape(), {operand},
+          GetOrCreateScalarMeanComputation(operand->shape().element_type(),
+                                           element_count));
+    }
+    HloInstruction* elem_count_recip =
+        add_instruction(HloInstruction::CreateBroadcast(
+            operand->shape(),
+            add_instruction(HloInstruction::CreateConvert(
+                ShapeUtil::MakeShape(operand->shape().element_type(), {}),
+                add_instruction(HloInstruction::CreateConstant(
+                    Literal::CreateR0<float>(1.0 / element_count))))),
+            {}));
+    return HloInstruction::CreateBinary(operand->shape(), HloOpcode::kMultiply,
+                                        operand, elem_count_recip);
   }
 
   // Replaces the existing HLO instruction old_instruction, with
@@ -189,7 +218,7 @@ class BatchNormExpanderVisitor : public DfsHloVisitorWithDefault {
   bool rewrite_training_op_;
   bool rewrite_inference_op_;
   bool rewrite_grad_op_;
-  bool use_fusion_;
+  bool use_map_instructions_;
 
   // Whether rewrite has occurred.
   bool changed_ = false;
@@ -208,13 +237,14 @@ class BatchNormExpanderVisitor : public DfsHloVisitorWithDefault {
 bool BatchNormExpanderVisitor::Run(HloComputation* computation,
                                    bool rewrite_training_op,
                                    bool rewrite_inference_op,
-                                   bool rewrite_grad_op, bool use_fusion) {
+                                   bool rewrite_grad_op,
+                                   bool use_map_instructions) {
   BatchNormExpanderVisitor visitor(
       computation,
       /*rewrite_training_op=*/rewrite_training_op,
       /*rewrite_inference_op=*/rewrite_inference_op,
       /*rewrite_grad_op=*/rewrite_grad_op,
-      /*use_fusion=*/use_fusion);
+      /*use_map_instructions=*/use_map_instructions);
   TF_CHECK_OK(computation->Accept(&visitor));
   return visitor.changed_;
 }
@@ -290,28 +320,14 @@ Status BatchNormExpanderVisitor::HandleBatchNormTraining(
       feature_shape, operand_squared, zero, dimensions_without_feature,
       add_reduce_computation));
 
-  // Fuse two parallel reduces together to improve performance.
-  if (use_fusion_ && !batch_norm->has_sharding()) {
-    auto tuple = add(HloInstruction::CreateTuple({sum, squared_sum}));
-
-    auto fused = computation_->CreateFusionInstruction(
-        {tuple, sum, squared_sum, operand_squared},
-        HloInstruction::FusionKind::kInput);
-
-    sum = add(HloInstruction::CreateGetTupleElement(feature_shape, fused, 0));
-
-    squared_sum =
-        add(HloInstruction::CreateGetTupleElement(feature_shape, fused, 1));
-  }
-
   // E[X].
-  auto mean = add(Mean(elements_per_feature_int64, sum));
+  auto mean = add(Mean(elements_per_feature_int64, sum, add));
 
   auto mean_broadcasted = add(
       HloInstruction::CreateBroadcast(operand_shape, mean, {feature_index}));
 
   // E[X^2].
-  auto square_mean = add(Mean(elements_per_feature_int64, squared_sum));
+  auto square_mean = add(Mean(elements_per_feature_int64, squared_sum, add));
 
   // E^2[X].
   auto mean_square =
@@ -329,7 +345,7 @@ Status BatchNormExpanderVisitor::HandleBatchNormTraining(
       add_binary(operand_shape, HloOpcode::kAdd, var_broadcasted, epsilon);
 
   // 1 / Sqrt[Var[X] + epsilon].
-  auto rsqrt_var_add_epsilon = add(Rsqrt(var_add_epsilon));
+  auto rsqrt_var_add_epsilon = add(Rsqrt(var_add_epsilon, add));
 
   // X - E[X].
   auto operand_minus_mean = add_binary(operand_shape, HloOpcode::kSubtract,
@@ -431,7 +447,7 @@ Status BatchNormExpanderVisitor::HandleBatchNormInference(
       add_binary(operand_shape, HloOpcode::kAdd, var_broadcasted, epsilon);
 
   // 1 / Sqrt[Var[X] + epsilon].
-  auto rsqrt_var_add_epsilon = add(Rsqrt(var_add_epsilon));
+  auto rsqrt_var_add_epsilon = add(Rsqrt(var_add_epsilon, add));
 
   // X - E[X].
   auto operand_minus_mean = add_binary(operand_shape, HloOpcode::kSubtract,
@@ -545,10 +561,12 @@ Status BatchNormExpanderVisitor::HandleBatchNormGrad(
   // rsqrt[Var[X] + epsilon].
   auto rsqrt_var_add_epsilon_broadcasted =
       add(Rsqrt(add_binary(activation_shape, HloOpcode::kAdd,
-                           variance_broadcasted, epsilon_activation)));
+                           variance_broadcasted, epsilon_activation),
+                add));
 
   auto rsqrt_var_add_epsilon = add(Rsqrt(
-      add_binary(feature_shape, HloOpcode::kAdd, variance, epsilon_feature)));
+      add_binary(feature_shape, HloOpcode::kAdd, variance, epsilon_feature),
+      add));
 
   // X - E[X].
   auto activation_minus_mean = add_binary(
@@ -573,21 +591,6 @@ Status BatchNormExpanderVisitor::HandleBatchNormGrad(
       feature_shape, grad_output, zero, dimensions_without_feature,
       add_reduce_computation));
 
-  if (use_fusion_ && !batch_norm->has_sharding()) {
-    auto tuple = add(HloInstruction::CreateTuple(
-        {sum_grad_output_times_activiation_minus_mean, grad_beta}));
-
-    auto fused = computation_->CreateFusionInstruction(
-        {tuple, sum_grad_output_times_activiation_minus_mean, grad_beta},
-        HloInstruction::FusionKind::kInput);
-
-    sum_grad_output_times_activiation_minus_mean =
-        add(HloInstruction::CreateGetTupleElement(feature_shape, fused, 0));
-
-    grad_beta =
-        add(HloInstruction::CreateGetTupleElement(feature_shape, fused, 1));
-  }
-
   // Grad[scale] = Sum(Grad[Y] * (X - E[X]) * rsqrt[Var[X] + epsilon]).
   auto grad_scale = add_binary(feature_shape, HloOpcode::kMultiply,
                                sum_grad_output_times_activiation_minus_mean,
@@ -616,8 +619,8 @@ Status BatchNormExpanderVisitor::HandleBatchNormGrad(
       add_binary(activation_shape, HloOpcode::kMultiply, scale_broadcasted,
                  rsqrt_var_add_epsilon_broadcasted);
 
-  scale_times_rsqrt_var_add_epsilon =
-      add(Mean(elements_per_feature_int64, scale_times_rsqrt_var_add_epsilon));
+  scale_times_rsqrt_var_add_epsilon = add(
+      Mean(elements_per_feature_int64, scale_times_rsqrt_var_add_epsilon, add));
 
   auto elements_per_feature_literal =
       Literal::CreateR0<float>(elements_per_feature_int64);
@@ -666,7 +669,7 @@ StatusOr<bool> BatchNormExpander::Run(HloModule* module) {
   for (auto* comp : module->MakeNonfusionComputations()) {
     if (BatchNormExpanderVisitor::Run(comp, rewrite_training_op_,
                                       rewrite_inference_op_, rewrite_grad_op_,
-                                      use_fusion_)) {
+                                      use_map_instructions_)) {
       changed = true;
     }
   }
diff --git a/tensorflow/compiler/xla/service/batchnorm_expander.h b/tensorflow/compiler/xla/service/batchnorm_expander.h
index 4ad987085d..8826636416 100644
--- a/tensorflow/compiler/xla/service/batchnorm_expander.h
+++ b/tensorflow/compiler/xla/service/batchnorm_expander.h
@@ -31,11 +31,12 @@ class BatchNormExpander : public HloPassInterface {
   // When use_fusion is set, a multi-output fusion node is created.
   BatchNormExpander(bool rewrite_training_op = false,
                     bool rewrite_inference_op = false,
-                    bool rewrite_grad_op = false, bool use_fusion = true)
+                    bool rewrite_grad_op = false,
+                    bool use_map_instructions = false)
       : rewrite_training_op_(rewrite_training_op),
         rewrite_inference_op_(rewrite_inference_op),
         rewrite_grad_op_(rewrite_grad_op),
-        use_fusion_(use_fusion) {}
+        use_map_instructions_(use_map_instructions) {}
   ~BatchNormExpander() = default;
   tensorflow::StringPiece name() const override { return "batchnorm_expander"; }
 
@@ -47,7 +48,7 @@ class BatchNormExpander : public HloPassInterface {
   bool rewrite_training_op_;
   bool rewrite_inference_op_;
   bool rewrite_grad_op_;
-  bool use_fusion_;
+  bool use_map_instructions_;
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
index 25b18eff20..d6b7b7d2d8 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
@@ -265,7 +265,7 @@ Status CpuCompiler::RunHloPasses(HloModule* module, bool is_aot_compile,
         /*rewrite_training_op=*/true,
         /*rewrite_inference_op=*/true,
         /*rewrite_grad_op=*/true,
-        /*use_fusion=*/false);
+        /*use_map_instructions=*/false);
     pass.AddPass<AlgebraicSimplifier>(
         /*is_layout_sensitive=*/false,
         [](const Shape&, const Shape&) { return false; },
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
index c995736af9..cc33847c5c 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
@@ -164,7 +164,7 @@ Status OptimizeHloModule(HloModule* hlo_module, se::StreamExecutor* stream_exec,
           /*rewrite_training_op=*/true,
           /*rewrite_inference_op=*/true,
           /*rewrite_grad_op=*/true,
-          /*use_fusion=*/false);
+          /*use_map_instructions=*/false);
 
       // Rewrite gather ops into smaller ones.
       pass.AddPass<GatherExpander>();
-- 
GitLab


From 00a4d11ac6d60f486b32c317ffddeae9a056cf38 Mon Sep 17 00:00:00 2001
From: Andrew Selle <aselle@google.com>
Date: Fri, 8 Jun 2018 16:32:32 -0700
Subject: [PATCH 515/610] Support reloading tflite models into toco IR.

PiperOrigin-RevId: 199869270
---
 tensorflow/contrib/lite/toco/tflite/import.cc | 27 ++++++++++++++++---
 1 file changed, 23 insertions(+), 4 deletions(-)

diff --git a/tensorflow/contrib/lite/toco/tflite/import.cc b/tensorflow/contrib/lite/toco/tflite/import.cc
index c0e7ab2ef5..1be7cf07a7 100644
--- a/tensorflow/contrib/lite/toco/tflite/import.cc
+++ b/tensorflow/contrib/lite/toco/tflite/import.cc
@@ -113,15 +113,34 @@ void ImportOperators(
                  << operators_table.size();
     }
     string opname = operators_table.at(index);
+
+    // Find and use the appropriate operator deserialization factory.
+    std::unique_ptr<Operator> new_op = nullptr;
     if (ops_by_name.count(opname) == 0) {
-      LOG(FATAL) << "Op '" << opname << "' not supported";
+      string effective_opname = "TENSORFLOW_UNSUPPORTED";
+      if (ops_by_name.count(effective_opname) == 0) {
+        LOG(FATAL) << "Internal logic error: TENSORFLOW_UNSUPPORTED not found.";
+      }
+      new_op = ops_by_name.at(effective_opname)
+                   ->Deserialize(input_op->builtin_options(),
+                                 input_op->custom_options());
+      if (TensorFlowUnsupportedOperator* unsupported_op =
+              dynamic_cast<TensorFlowUnsupportedOperator*>(new_op.get())) {
+        unsupported_op->tensorflow_op = opname;
+        // TODO(b/109932940): Remove this when quantized is removed.
+        // For now, we assume all ops are quantized.
+        unsupported_op->quantized = true;
+      } else {
+        LOG(FATAL) << "Expected a TensorFlowUnsupportedOperator";
+      }
+    } else {
+      new_op = ops_by_name.at(opname)->Deserialize(input_op->builtin_options(),
+                                                   input_op->custom_options());
     }
-
-    auto new_op = ops_by_name.at(opname)->Deserialize(
-        input_op->builtin_options(), input_op->custom_options());
     model->operators.emplace_back(new_op.release());
     auto* op = model->operators.back().get();
 
+    // Make sure all the inputs and outputs are hooked up.
     auto inputs = input_op->inputs();
     for (int i = 0; i < inputs->Length(); i++) {
       auto input_index = inputs->Get(i);
-- 
GitLab


From f5a1a38a831e9db5a822351f3a3b138ab1cb83b3 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 8 Jun 2018 16:46:20 -0700
Subject: [PATCH 516/610] Created a ThreadPoolDevice wrapper to make each op
 run with the number of threads stored in NodeDef.

PiperOrigin-RevId: 199870879
---
 tensorflow/core/framework/device_base.h |  4 ++++
 tensorflow/core/framework/op_kernel.cc  | 16 ++++++++++++++++
 tensorflow/core/framework/op_kernel.h   |  8 +++++---
 3 files changed, 25 insertions(+), 3 deletions(-)

diff --git a/tensorflow/core/framework/device_base.h b/tensorflow/core/framework/device_base.h
index ec26d92a61..b59ced869d 100644
--- a/tensorflow/core/framework/device_base.h
+++ b/tensorflow/core/framework/device_base.h
@@ -186,6 +186,10 @@ class DeviceBase {
 
   virtual ScopedAllocatorMgr* GetScopedAllocatorMgr() const { return nullptr; }
 
+  const bool has_eigen_cpu_device() const {
+    return (eigen_cpu_device_ != nullptr);
+  }
+
   virtual const Eigen::ThreadPoolDevice* eigen_cpu_device() {
     CHECK(eigen_cpu_device_ != nullptr);
     return eigen_cpu_device_;
diff --git a/tensorflow/core/framework/op_kernel.cc b/tensorflow/core/framework/op_kernel.cc
index ce213a63be..a0f449d64f 100644
--- a/tensorflow/core/framework/op_kernel.cc
+++ b/tensorflow/core/framework/op_kernel.cc
@@ -13,12 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#define EIGEN_USE_THREADS
 #include "tensorflow/core/framework/op_kernel.h"
 
 #include <unordered_map>
 #include <utility>
 #include <vector>
 
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/attr_value_util.h"
 #include "tensorflow/core/framework/device_attributes.pb.h"
 #include "tensorflow/core/framework/graph.pb_text.h"
@@ -40,6 +42,7 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/ptr_util.h"
 
 namespace tensorflow {
 
@@ -270,6 +273,19 @@ OpKernelContext::OpKernelContext(Params* params, int num_outputs)
   if (params_->record_tensor_accesses) {
     referenced_tensors_.Init();
   }
+  if (params->device->has_eigen_cpu_device()) {
+    int64 block_size = -1, output_size = -1, num_threads = 1;
+    const Eigen::ThreadPoolDevice* thread_pool =
+        params_->device->eigen_cpu_device();
+    AttrSlice attributes(op_kernel().def());
+    if (GetNodeAttr(attributes, "_block_size", &block_size) == Status::OK() &&
+        GetNodeAttr(attributes, "_output_size", &output_size) == Status::OK()) {
+      num_threads = std::min(Eigen::divup(output_size, block_size),
+                             static_cast<int64>(thread_pool->numThreads()));
+      eigen_cpu_device_ = MakeUnique<Eigen::ThreadPoolDevice>(
+          thread_pool->getPool(), num_threads);
+    }
+  }
 }
 
 OpKernelContext::~OpKernelContext() {
diff --git a/tensorflow/core/framework/op_kernel.h b/tensorflow/core/framework/op_kernel.h
index 5ebe6976fd..d307078e63 100644
--- a/tensorflow/core/framework/op_kernel.h
+++ b/tensorflow/core/framework/op_kernel.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_FRAMEWORK_OP_KERNEL_H_
-#define TENSORFLOW_FRAMEWORK_OP_KERNEL_H_
+#ifndef TENSORFLOW_CORE_FRAMEWORK_OP_KERNEL_H_
+#define TENSORFLOW_CORE_FRAMEWORK_OP_KERNEL_H_
 
 #include <functional>
 
@@ -1004,6 +1004,7 @@ class OpKernelContext {
   // OpKernels can use these eigen devices to carry out their
   // numerical computation.
   const Eigen::ThreadPoolDevice& eigen_cpu_device() const {
+    if (eigen_cpu_device_ != nullptr) return *eigen_cpu_device_;
     return *device()->eigen_cpu_device();
   }
   const Eigen::GpuDevice& eigen_gpu_device() const {
@@ -1139,6 +1140,7 @@ class OpKernelContext {
   mutable mutex mu_;  // mutable so const accessors can acquire the lock
   gtl::InlinedVector<WrappedAllocator, 4> wrapped_allocators_ GUARDED_BY(mu_);
   gtl::InlinedVector<TensorValue, 4> outputs_;
+  std::unique_ptr<Eigen::ThreadPoolDevice> eigen_cpu_device_;
 
   // Constructed only if <params->record_tensor_accesses>.
   ManualConstructor<UniqueTensorReferences> referenced_tensors_ GUARDED_BY(mu_);
@@ -1576,4 +1578,4 @@ inline void OpOutputList::set_ref(int i, mutex* mu, Tensor* tensor_for_ref) {
 
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_FRAMEWORK_OP_KERNEL_H_
+#endif  // TENSORFLOW_CORE_FRAMEWORK_OP_KERNEL_H_
-- 
GitLab


From 0210bd07e6b5a4bce072e13b8f7908f7bc5db951 Mon Sep 17 00:00:00 2001
From: Jiri Simsa <jsimsa@google.com>
Date: Fri, 8 Jun 2018 16:50:00 -0700
Subject: [PATCH 517/610] [tf.data] Adding `drop_remainder` argument to
 `tf.data.Dataset.batch()` and `tf.data.Dataset.padded_batch()`, deprecating
 `tf.contrib.data.batch_and_drop_remainder()` and
 `tf.contrib.data.padded_batch_and_drop_remainder()`.

PiperOrigin-RevId: 199871303
---
 .../contrib/data/python/ops/batching.py       |   9 +
 .../base_api/api_def_BatchDatasetV2.pbtxt     |  18 ++
 .../api_def_PaddedBatchDatasetV2.pbtxt        |  35 +++
 .../optimizers/data/map_and_batch_fusion.cc   |   6 +-
 .../data/map_and_batch_fusion_test.cc         |  89 ++++++++
 .../core/kernels/data/batch_dataset_op.cc     |  46 +++-
 .../kernels/data/padded_batch_dataset_op.cc   |  49 ++++-
 tensorflow/core/ops/dataset_ops.cc            |  57 ++++-
 tensorflow/python/data/kernel_tests/BUILD     |   1 +
 .../kernel_tests/batch_dataset_op_test.py     | 205 +++++++++++-------
 tensorflow/python/data/ops/dataset_ops.py     | 100 ++++++---
 .../api/golden/tensorflow.data.-dataset.pbtxt |   4 +-
 ...ow.data.-fixed-length-record-dataset.pbtxt |   4 +-
 .../tensorflow.data.-t-f-record-dataset.pbtxt |   4 +-
 .../tensorflow.data.-text-line-dataset.pbtxt  |   4 +-
 15 files changed, 489 insertions(+), 142 deletions(-)
 create mode 100644 tensorflow/core/api_def/base_api/api_def_BatchDatasetV2.pbtxt
 create mode 100644 tensorflow/core/api_def/base_api/api_def_PaddedBatchDatasetV2.pbtxt

diff --git a/tensorflow/contrib/data/python/ops/batching.py b/tensorflow/contrib/data/python/ops/batching.py
index 50c2d17592..17256eb972 100644
--- a/tensorflow/contrib/data/python/ops/batching.py
+++ b/tensorflow/contrib/data/python/ops/batching.py
@@ -30,6 +30,7 @@ from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_dataset_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.util import deprecation
 
 
 def dense_to_sparse_batch(batch_size, row_shape):
@@ -219,6 +220,8 @@ def filter_irregular_batches(batch_size):
   return _apply_fn
 
 
+@deprecation.deprecated(
+    None, "Use `tf.data.Dataset.batch(..., drop_remainder=True)`.")
 def batch_and_drop_remainder(batch_size):
   """A batching transformation that omits the final small batch (if present).
 
@@ -251,12 +254,16 @@ def batch_and_drop_remainder(batch_size):
 
   def _apply_fn(dataset):
     """Function from `Dataset` to `Dataset` that applies the transformation."""
+    # TODO(jsimsa): Switch to using `batch(..., drop_remainder=True)` any time
+    # after 6/30/2018.
     batched = dataset.batch(batch_size)
     return filter_irregular_batches(batch_size)(batched)
 
   return _apply_fn
 
 
+@deprecation.deprecated(
+    None, "Use `tf.data.Dataset.padded_batch(..., drop_remainder=True)`.")
 def padded_batch_and_drop_remainder(batch_size,
                                     padded_shapes,
                                     padding_values=None):
@@ -285,6 +292,8 @@ def padded_batch_and_drop_remainder(batch_size,
 
   def _apply_fn(dataset):
     """Function from `Dataset` to `Dataset` that applies the transformation."""
+    # TODO(jsimsa): Switch to using `padded_batch(..., drop_remainder=True)`
+    # any time after 6/30/2018.
     batched = dataset.padded_batch(
         batch_size, padded_shapes=padded_shapes, padding_values=padding_values)
     return filter_irregular_batches(batch_size)(batched)
diff --git a/tensorflow/core/api_def/base_api/api_def_BatchDatasetV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_BatchDatasetV2.pbtxt
new file mode 100644
index 0000000000..0c5b1eb45a
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BatchDatasetV2.pbtxt
@@ -0,0 +1,18 @@
+op {
+  graph_op_name: "BatchDatasetV2"
+  visibility: HIDDEN
+  in_arg {
+    name: "batch_size"
+    description: <<END
+A scalar representing the number of elements to accumulate in a batch.
+END
+  }
+  in_arg {
+    name: "drop_remainder"
+    description: <<END
+A scalar representing whether the last batch should be dropped in case its size
+is smaller than desired.
+END
+  }
+  summary: "Creates a dataset that batches `batch_size` elements from `input_dataset`."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_PaddedBatchDatasetV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_PaddedBatchDatasetV2.pbtxt
new file mode 100644
index 0000000000..9fefc0c418
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_PaddedBatchDatasetV2.pbtxt
@@ -0,0 +1,35 @@
+op {
+  graph_op_name: "PaddedBatchDatasetV2"
+  visibility: HIDDEN
+  in_arg {
+    name: "batch_size"
+    description: <<END
+A scalar representing the number of elements to accumulate in a
+batch.
+END
+  }
+  in_arg {
+    name: "drop_remainder"
+    description: <<END
+A scalar representing whether the last batch should be dropped in case its size
+is smaller than desired.
+END
+  }
+  in_arg {
+    name: "padded_shapes"
+    description: <<END
+A list of int64 tensors representing the desired padded shapes
+of the corresponding output components. These shapes may be partially
+specified, using `-1` to indicate that a particular dimension should be
+padded to the maximum size of all batch elements.
+END
+  }
+  in_arg {
+    name: "padding_values"
+    description: <<END
+A list of scalars containing the padding value to use for
+each of the outputs.
+END
+  }
+  summary: "Creates a dataset that batches and pads `batch_size` elements from the input."
+}
diff --git a/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion.cc b/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion.cc
index a28b21224e..1e8cbb9784 100644
--- a/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion.cc
+++ b/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion.cc
@@ -40,7 +40,7 @@ Status MapAndBatchFusion::Optimize(Cluster* cluster, const GrapplerItem& item,
   GraphView graph(output);
   std::set<string> nodes_to_delete;
   for (const NodeDef& node : item.graph.node()) {
-    if (node.op() != "BatchDataset") {
+    if (node.op() != "BatchDataset" && node.op() != "BatchDatasetV2") {
       continue;
     }
 
@@ -93,7 +93,9 @@ Status MapAndBatchFusion::Optimize(Cluster* cluster, const GrapplerItem& item,
     }
 
     // Set the `drop_remainder` input argument.
-    {
+    if (batch_node.op() == "BatchDatasetV2") {
+      new_node->add_input(batch_node.input(2));
+    } else {
       NodeDef* tmp;
       TF_RETURN_IF_ERROR(
           graph_utils::AddScalarConstNode<bool>(false, output, &tmp));
diff --git a/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion_test.cc b/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion_test.cc
index 76d2f5d537..3c1d8d5359 100644
--- a/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion_test.cc
+++ b/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion_test.cc
@@ -112,6 +112,95 @@ TEST(MapAndBatchFusionTest, FuseMapAndBatchNodesIntoOne) {
                                  batch_node->attr().at("output_types")));
 }
 
+TEST(MapAndBatchFusionTest, FuseMapAndBatchV2NodesIntoOne) {
+  GrapplerItem item;
+  GraphDef *graph = &item.graph;
+  NodeDef *start_node;
+  TF_ASSERT_OK(graph_utils::AddScalarConstNode<int64>(0, graph, &start_node));
+  NodeDef *stop_node;
+  TF_ASSERT_OK(graph_utils::AddScalarConstNode<int64>(10, graph, &stop_node));
+  NodeDef *step_node;
+  TF_ASSERT_OK(graph_utils::AddScalarConstNode<int64>(1, graph, &step_node));
+
+  std::vector<string> range_inputs(3);
+  range_inputs[0] = start_node->name();
+  range_inputs[1] = stop_node->name();
+  range_inputs[2] = step_node->name();
+  std::vector<std::pair<string, AttrValue>> range_attrs;
+  NodeDef *range_node;
+  TF_ASSERT_OK(graph_utils::AddNode("", "RangeDataset", range_inputs,
+                                    range_attrs, graph, &range_node));
+  NodeDef *captured_input_node;
+  TF_ASSERT_OK(graph_utils::AddScalarConstNode<StringPiece>(
+      "hello", graph, &captured_input_node));
+
+  NodeDef *map_node;
+  {
+    std::vector<string> map_inputs(2);
+    map_inputs[0] = range_node->name();
+    map_inputs[1] = captured_input_node->name();
+    std::vector<std::pair<string, AttrValue>> map_attrs(2);
+    AttrValue f_attr;
+    SetAttrValue("f", &f_attr);
+    map_attrs[0] = std::make_pair("f", f_attr);
+    AttrValue args_attr;
+    SetAttrValue("Targuments", &args_attr);
+    map_attrs[1] = std::make_pair("Targuments", args_attr);
+    TF_ASSERT_OK(graph_utils::AddNode("", "MapDataset", map_inputs, map_attrs,
+                                      graph, &map_node));
+  }
+
+  NodeDef *batch_size_node;
+  TF_ASSERT_OK(
+      graph_utils::AddScalarConstNode<int64>(5, graph, &batch_size_node));
+  NodeDef *drop_remainder_node;
+  TF_ASSERT_OK(
+      graph_utils::AddScalarConstNode<bool>(true, graph, &drop_remainder_node));
+  NodeDef *batch_node;
+  {
+    std::vector<string> batch_inputs(3);
+    batch_inputs[0] = map_node->name();
+    batch_inputs[1] = batch_size_node->name();
+    batch_inputs[2] = drop_remainder_node->name();
+    std::vector<std::pair<string, AttrValue>> batch_attrs(2);
+    AttrValue shapes_attr;
+    SetAttrValue("output_shapes", &shapes_attr);
+    batch_attrs[0] = std::make_pair("output_shapes", shapes_attr);
+    AttrValue types_attr;
+    SetAttrValue("output_types", &types_attr);
+    batch_attrs[1] = std::make_pair("output_types", types_attr);
+    TF_ASSERT_OK(graph_utils::AddNode("", "BatchDatasetV2", batch_inputs,
+                                      batch_attrs, graph, &batch_node));
+  }
+
+  MapAndBatchFusion optimizer;
+  GraphDef output;
+  TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
+
+  EXPECT_FALSE(graph_utils::ContainsNodeWithName(map_node->name(), output));
+  EXPECT_FALSE(graph_utils::ContainsNodeWithName(batch_node->name(), output));
+  EXPECT_TRUE(graph_utils::ContainsNodeWithOp("MapAndBatchDatasetV2", output));
+  NodeDef map_and_batch_node =
+      output.node(graph_utils::FindNodeWithOp("MapAndBatchDatasetV2", output));
+  EXPECT_EQ(map_and_batch_node.input_size(), 5);
+  EXPECT_EQ(map_and_batch_node.input(0), map_node->input(0));
+  EXPECT_EQ(map_and_batch_node.input(1), map_node->input(1));
+  EXPECT_EQ(map_and_batch_node.input(2), batch_node->input(1));
+  NodeDef num_parallel_calls_node = output.node(
+      graph_utils::FindNodeWithName(map_and_batch_node.input(3), output));
+  EXPECT_EQ(num_parallel_calls_node.attr().at("value").tensor().int64_val(0),
+            1);
+  EXPECT_EQ(map_and_batch_node.input(4), batch_node->input(2));
+  EXPECT_TRUE(AreAttrValuesEqual(map_and_batch_node.attr().at("f"),
+                                 map_node->attr().at("f")));
+  EXPECT_TRUE(AreAttrValuesEqual(map_and_batch_node.attr().at("Targuments"),
+                                 map_node->attr().at("Targuments")));
+  EXPECT_TRUE(AreAttrValuesEqual(map_and_batch_node.attr().at("output_shapes"),
+                                 batch_node->attr().at("output_shapes")));
+  EXPECT_TRUE(AreAttrValuesEqual(map_and_batch_node.attr().at("output_types"),
+                                 batch_node->attr().at("output_types")));
+}
+
 TEST(MapAndBatchFusionTest, FuseParallelMapAndBatchNodesIntoOne) {
   GrapplerItem item;
   GraphDef *graph = &item.graph;
diff --git a/tensorflow/core/kernels/data/batch_dataset_op.cc b/tensorflow/core/kernels/data/batch_dataset_op.cc
index 9a83c16f33..58b86f2a08 100644
--- a/tensorflow/core/kernels/data/batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/batch_dataset_op.cc
@@ -27,7 +27,8 @@ namespace {
 class BatchDatasetOp : public UnaryDatasetOpKernel {
  public:
   explicit BatchDatasetOp(OpKernelConstruction* ctx)
-      : UnaryDatasetOpKernel(ctx) {}
+      : UnaryDatasetOpKernel(ctx),
+        op_version_(ctx->def().op() == "BatchDataset" ? 1 : 2) {}
 
   void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
                    DatasetBase** output) override {
@@ -38,14 +39,24 @@ class BatchDatasetOp : public UnaryDatasetOpKernel {
         ctx, batch_size > 0,
         errors::InvalidArgument("Batch size must be greater than zero."));
 
-    *output = new Dataset(ctx, batch_size, input);
+    bool drop_remainder = false;
+    if (op_version_ > 1) {
+      OP_REQUIRES_OK(ctx, ParseScalarArgument<bool>(ctx, "drop_remainder",
+                                                    &drop_remainder));
+    }
+
+    *output = new Dataset(ctx, batch_size, drop_remainder, input);
   }
 
  private:
   class Dataset : public GraphDatasetBase {
    public:
-    Dataset(OpKernelContext* ctx, int64 batch_size, const DatasetBase* input)
-        : GraphDatasetBase(ctx), batch_size_(batch_size), input_(input) {
+    Dataset(OpKernelContext* ctx, int64 batch_size, bool drop_remainder,
+            const DatasetBase* input)
+        : GraphDatasetBase(ctx),
+          batch_size_(batch_size),
+          drop_remainder_(drop_remainder),
+          input_(input) {
       input_->Ref();
 
       // NOTE(mrry): Currently we implement "batch up to" semantics. If
@@ -54,8 +65,13 @@ class BatchDatasetOp : public UnaryDatasetOpKernel {
       const auto& input_shapes = input_->output_shapes();
       output_shapes_.reserve(input_shapes.size());
       for (const auto& input_shape : input_shapes) {
-        output_shapes_.emplace_back(
-            PartialTensorShape({-1}).Concatenate(input_shape));
+        if (drop_remainder_) {
+          output_shapes_.emplace_back(
+              PartialTensorShape({batch_size_}).Concatenate(input_shape));
+        } else {
+          output_shapes_.emplace_back(
+              PartialTensorShape({-1}).Concatenate(input_shape));
+        }
       }
     }
 
@@ -86,8 +102,10 @@ class BatchDatasetOp : public UnaryDatasetOpKernel {
       TF_RETURN_IF_ERROR(b->AddParentDataset(ctx, input_, &input_graph_node));
       Node* batch_size = nullptr;
       TF_RETURN_IF_ERROR(b->AddScalar(batch_size_, &batch_size));
-      TF_RETURN_IF_ERROR(
-          b->AddDataset(this, {input_graph_node, batch_size}, output));
+      Node* drop_remainder = nullptr;
+      TF_RETURN_IF_ERROR(b->AddScalar(drop_remainder_, &drop_remainder));
+      TF_RETURN_IF_ERROR(b->AddDataset(
+          this, {input_graph_node, batch_size, drop_remainder}, output));
       return Status::OK();
     }
 
@@ -133,6 +151,12 @@ class BatchDatasetOp : public UnaryDatasetOpKernel {
           return Status::OK();
         }
 
+        if (dataset()->drop_remainder_ &&
+            batch_elements.size() < dataset()->batch_size_) {
+          *end_of_sequence = true;
+          return Status::OK();
+        }
+
         // Copy the retrieved batch elements into one output tensor
         // per tuple component.
         // NOTE(mrry): If the input or output sizes are statically
@@ -201,14 +225,20 @@ class BatchDatasetOp : public UnaryDatasetOpKernel {
     };
 
     const int64 batch_size_;
+    const bool drop_remainder_;
     const DatasetBase* const input_;
     std::vector<PartialTensorShape> output_shapes_;
   };
+
+  const int op_version_;
 };
 
 REGISTER_KERNEL_BUILDER(Name("BatchDataset").Device(DEVICE_CPU),
                         BatchDatasetOp);
 
+REGISTER_KERNEL_BUILDER(Name("BatchDatasetV2").Device(DEVICE_CPU),
+                        BatchDatasetOp);
+
 }  // namespace
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/padded_batch_dataset_op.cc b/tensorflow/core/kernels/data/padded_batch_dataset_op.cc
index d9e43ace39..59cbdb655d 100644
--- a/tensorflow/core/kernels/data/padded_batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/padded_batch_dataset_op.cc
@@ -28,7 +28,8 @@ namespace {
 class PaddedBatchDatasetOp : public UnaryDatasetOpKernel {
  public:
   explicit PaddedBatchDatasetOp(OpKernelConstruction* ctx)
-      : UnaryDatasetOpKernel(ctx) {}
+      : UnaryDatasetOpKernel(ctx),
+        op_version_(ctx->def().op() == "PaddedBatchDataset" ? 1 : 2) {}
 
   void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
                    DatasetBase** output) override {
@@ -39,6 +40,12 @@ class PaddedBatchDatasetOp : public UnaryDatasetOpKernel {
         ctx, batch_size > 0,
         errors::InvalidArgument("Batch size must be greater than zero."));
 
+    bool drop_remainder = false;
+    if (op_version_ > 1) {
+      OP_REQUIRES_OK(ctx, ParseScalarArgument<bool>(ctx, "drop_remainder",
+                                                    &drop_remainder));
+    }
+
     OpInputList padded_shape_tensors;
     OP_REQUIRES_OK(ctx,
                    ctx->input_list("padded_shapes", &padded_shape_tensors));
@@ -85,18 +92,20 @@ class PaddedBatchDatasetOp : public UnaryDatasetOpKernel {
       padding_values.push_back(tensor::DeepCopy(padding_value_t));
     }
 
-    *output = new Dataset(ctx, batch_size, std::move(padded_shapes),
-                          std::move(padding_values), input);
+    *output =
+        new Dataset(ctx, batch_size, drop_remainder, std::move(padded_shapes),
+                    std::move(padding_values), input);
   }
 
  private:
   class Dataset : public GraphDatasetBase {
    public:
-    Dataset(OpKernelContext* ctx, int64 batch_size,
+    Dataset(OpKernelContext* ctx, int64 batch_size, bool drop_remainder,
             std::vector<PartialTensorShape> padded_shapes,
             std::vector<Tensor> padding_values, const DatasetBase* input)
         : GraphDatasetBase(ctx),
           batch_size_(batch_size),
+          drop_remainder_(drop_remainder),
           padded_shapes_(std::move(padded_shapes)),
           padding_values_(std::move(padding_values)),
           input_(input) {
@@ -112,8 +121,13 @@ class PaddedBatchDatasetOp : public UnaryDatasetOpKernel {
       const auto& input_shapes = input_->output_shapes();
       output_shapes_.reserve(input_shapes.size());
       for (size_t i = 0; i < input_shapes.size(); ++i) {
-        output_shapes_.push_back(
-            PartialTensorShape({-1}).Concatenate(padded_shapes_[i]));
+        if (drop_remainder_) {
+          output_shapes_.push_back(
+              PartialTensorShape({batch_size_}).Concatenate(padded_shapes_[i]));
+        } else {
+          output_shapes_.push_back(
+              PartialTensorShape({-1}).Concatenate(padded_shapes_[i]));
+        }
       }
     }
 
@@ -166,16 +180,19 @@ class PaddedBatchDatasetOp : public UnaryDatasetOpKernel {
         padding_values.emplace_back(node);
       }
 
+      Node* drop_remainder = nullptr;
+      TF_RETURN_IF_ERROR(b->AddScalar(drop_remainder_, &drop_remainder));
+
       AttrValue output_types;
       b->BuildAttrValue(output_dtypes(), &output_types);
 
       AttrValue N;
       b->BuildAttrValue<int64>(padded_shapes_.size(), &N);
 
-      TF_RETURN_IF_ERROR(
-          b->AddDataset(this, {{0, input_graph_node}, {1, batch_size}},
-                        {{2, padded_shapes}, {3, padding_values}},
-                        {{"Toutput_types", output_types}, {"N", N}}, output));
+      TF_RETURN_IF_ERROR(b->AddDataset(
+          this, {{0, input_graph_node}, {1, batch_size}, {4, drop_remainder}},
+          {{2, padded_shapes}, {3, padding_values}},
+          {{"Toutput_types", output_types}, {"N", N}}, output));
       return Status::OK();
     }
 
@@ -226,6 +243,12 @@ class PaddedBatchDatasetOp : public UnaryDatasetOpKernel {
           return Status::OK();
         }
 
+        if (dataset()->drop_remainder_ &&
+            batch_elements.size() < dataset()->batch_size_) {
+          *end_of_sequence = true;
+          return Status::OK();
+        }
+
         // Copy the retrieved batch elements into one output tensor
         // per tuple component.
         // NOTE(mrry): If the input or output sizes are statically
@@ -341,16 +364,22 @@ class PaddedBatchDatasetOp : public UnaryDatasetOpKernel {
     };
 
     const int64 batch_size_;
+    const bool drop_remainder_;
     const std::vector<PartialTensorShape> padded_shapes_;
     const std::vector<Tensor> padding_values_;
     const DatasetBase* const input_;
     std::vector<PartialTensorShape> output_shapes_;
   };
+
+  const int op_version_;
 };
 
 REGISTER_KERNEL_BUILDER(Name("PaddedBatchDataset").Device(DEVICE_CPU),
                         PaddedBatchDatasetOp);
 
+REGISTER_KERNEL_BUILDER(Name("PaddedBatchDatasetV2").Device(DEVICE_CPU),
+                        PaddedBatchDatasetOp);
+
 }  // namespace
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/ops/dataset_ops.cc b/tensorflow/core/ops/dataset_ops.cc
index 0e13d41977..15e0ca8af9 100644
--- a/tensorflow/core/ops/dataset_ops.cc
+++ b/tensorflow/core/ops/dataset_ops.cc
@@ -355,6 +355,22 @@ REGISTER_OP("BatchDataset")
       return shape_inference::ScalarShape(c);
     });
 
+REGISTER_OP("BatchDatasetV2")
+    .Input("input_dataset: variant")
+    .Input("batch_size: int64")
+    .Input("drop_remainder: bool")
+    .Output("handle: variant")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused;
+      // batch_size should be a scalar.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      // drop_remainder should be a scalar.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
+      return shape_inference::ScalarShape(c);
+    });
+
 // TODO(mrry): move SlideDataset to contrib in the future.
 REGISTER_OP("SlideDataset")
     .Input("input_dataset: variant")
@@ -371,6 +387,10 @@ REGISTER_OP("SlideDataset")
       return shape_inference::ScalarShape(c);
     });
 
+// TODO(mrry): Validate that `padded_shapes` are all vectors, the lengths of
+// `output_types` and `output_shapes` are `N`, the `output_shapes` are (as far
+// as possible to tell statically) compatible with `padded_shapes`, and that
+// `padding_values` are all scalars.
 REGISTER_OP("PaddedBatchDataset")
     .Input("input_dataset: variant")
     .Input("batch_size: int64")
@@ -380,17 +400,32 @@ REGISTER_OP("PaddedBatchDataset")
     .Attr("Toutput_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
     .Attr("N: int >= 1")
-    .SetShapeFn(shape_inference::ScalarShape);  // TODO(mrry): Validate that
-                                                // `padded_shapes` are all
-                                                // vectors, the lengths of
-                                                // `output_types` and
-                                                // `output_shapes` are `N`,
-                                                // the `output_shapes` are (as
-                                                // far as possible to tell
-                                                // statically) compatible with
-                                                // `padded_shapes`, and
-                                                // that `padding_values` are
-                                                // all scalars.
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused;
+      // batch_size should be a scalar.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      return shape_inference::ScalarShape(c);
+    });
+
+REGISTER_OP("PaddedBatchDatasetV2")
+    .Input("input_dataset: variant")
+    .Input("batch_size: int64")
+    .Input("padded_shapes: N * int64")
+    .Input("padding_values: Toutput_types")
+    .Input("drop_remainder: bool")
+    .Output("handle: variant")
+    .Attr("Toutput_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .Attr("N: int >= 1")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused;
+      // batch_size should be a scalar.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      // drop_remainder should be a scalar.
+      TF_RETURN_IF_ERROR(
+          c->WithRank(c->input(c->num_inputs() - 1), 0, &unused));
+      return shape_inference::ScalarShape(c);
+    });
 
 REGISTER_OP("DenseToSparseBatchDataset")
     .Input("input_dataset: variant")
diff --git a/tensorflow/python/data/kernel_tests/BUILD b/tensorflow/python/data/kernel_tests/BUILD
index c8fabc4363..e86c2f6993 100644
--- a/tensorflow/python/data/kernel_tests/BUILD
+++ b/tensorflow/python/data/kernel_tests/BUILD
@@ -15,6 +15,7 @@ tf_py_test(
     size = "small",
     srcs = ["batch_dataset_op_test.py"],
     additional_deps = [
+        "@absl_py//absl/testing:parameterized",
         "//third_party/py/numpy",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
diff --git a/tensorflow/python/data/kernel_tests/batch_dataset_op_test.py b/tensorflow/python/data/kernel_tests/batch_dataset_op_test.py
index dba108a531..50bb0837b7 100644
--- a/tensorflow/python/data/kernel_tests/batch_dataset_op_test.py
+++ b/tensorflow/python/data/kernel_tests/batch_dataset_op_test.py
@@ -18,8 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import math
-
+from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.python.data.ops import dataset_ops
@@ -35,73 +34,83 @@ from tensorflow.python.platform import test
 from tensorflow.python.util import compat
 
 
-class BatchDatasetTest(test.TestCase):
+class BatchDatasetTest(test.TestCase, parameterized.TestCase):
+
+  @parameterized.named_parameters(
+      ('even', 28, 14, False),
+      ('uneven_with_remainder', 28, 15, False),
+      ('uneven_without_remainder', 28, 15, True),
+      ('empty', 0, 14, False),
+  )
+  def testBatchDataset(self, count, batch_size, drop_remainder):
+    """Tests the batch dataset logic for various input configurations.
+
+    Args:
+      count: the number of input elements
+      batch_size: the batch size
+      drop_remainder: whether the final, smaller batch should be dropped if
+        batch size does not divide number of inputs evenly
+    """
 
-  def testBatchDataset(self):
-    """Test an dataset that maps a TF function across its input elements."""
     # The pipeline is TensorSliceDataset -> MapDataset(square_3) ->
     # RepeatDataset(count) -> BatchDataset(batch_size).
     components = (np.arange(7),
                   np.array([[1, 2, 3]]) * np.arange(7)[:, np.newaxis],
                   np.array(37.0) * np.arange(7))
 
-    count = array_ops.placeholder(dtypes.int64, shape=[])
-    batch_size = array_ops.placeholder(dtypes.int64, shape=[])
+    count_t = array_ops.placeholder(dtypes.int64, shape=[])
+    batch_size_t = array_ops.placeholder(dtypes.int64, shape=[])
+    drop_remainder_t = array_ops.placeholder(dtypes.bool, shape=[])
 
     def _map_fn(x, y, z):
       return math_ops.square(x), math_ops.square(y), math_ops.square(z)
 
     iterator = (
         dataset_ops.Dataset.from_tensor_slices(components).map(_map_fn)
-        .repeat(count).batch(batch_size).make_initializable_iterator())
+        .repeat(count).batch(batch_size,
+                             drop_remainder).make_initializable_iterator())
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
-    self.assertEqual([[None] + list(c.shape[1:]) for c in components],
+    if drop_remainder:
+      dim0 = batch_size
+    else:
+      dim0 = None
+    self.assertEqual([[dim0] + list(c.shape[1:]) for c in components],
                      [t.shape.as_list() for t in get_next])
 
     with self.test_session() as sess:
-      # Batch of a finite input, where the batch_size divides the
-      # total number of elements.
-      sess.run(init_op, feed_dict={count: 28, batch_size: 14})
-      num_batches = (28 * 7) // 14
-      for i in range(num_batches):
+      sess.run(
+          init_op,
+          feed_dict={
+              count_t: count,
+              batch_size_t: batch_size,
+              drop_remainder_t: drop_remainder
+          })
+      num_full_batches = (count * 7) // batch_size
+      for i in range(num_full_batches):
         result = sess.run(get_next)
         for component, result_component in zip(components, result):
-          for j in range(14):
-            self.assertAllEqual(component[(i * 14 + j) % 7]**2,
+          for j in range(batch_size):
+            self.assertAllEqual(component[(i * batch_size + j) % 7]**2,
                                 result_component[j])
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Batch of a finite input, where the batch_size does not
-      # divide the total number of elements.
-      sess.run(init_op, feed_dict={count: 14, batch_size: 8})
-
-      # We expect (num_batches - 1) full-sized batches.
-      num_batches = int(math.ceil((14 * 7) / 8))
-      for i in range(num_batches - 1):
+      if not drop_remainder and (count * 7) % batch_size > 0:
         result = sess.run(get_next)
         for component, result_component in zip(components, result):
-          for j in range(8):
-            self.assertAllEqual(component[(i * 8 + j) % 7]**2,
-                                result_component[j])
-      result = sess.run(get_next)
-      for component, result_component in zip(components, result):
-        for j in range((14 * 7) % 8):
-          self.assertAllEqual(component[((num_batches - 1) * 8 + j) % 7]**2,
-                              result_component[j])
+          for j in range((count * 7) % batch_size):
+            self.assertAllEqual(
+                component[(num_full_batches * batch_size + j) % 7]**2,
+                result_component[j])
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
-      # Batch of an empty input should fail straight away.
-      sess.run(init_op, feed_dict={count: 0, batch_size: 8})
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+  def testBatchDatasetInvalidBatchSize(self):
+    iterator = (dataset_ops.Dataset.range(10).batch(0).make_one_shot_iterator())
+    get_next = iterator.get_next()
 
-      # Empty batch should be an initialization time error.
+    with self.test_session() as sess:
       with self.assertRaises(errors.InvalidArgumentError):
-        sess.run(init_op, feed_dict={count: 14, batch_size: 0})
+        sess.run(get_next)
 
   def assertSparseValuesEqual(self, a, b):
     self.assertAllEqual(a.indices, b.indices)
@@ -210,66 +219,108 @@ class BatchDatasetTest(test.TestCase):
           r'First element had shape \[3\] and element 2 had shape \[4\].'):
         sess.run(next_element)
 
-  def testPaddedBatchDataset(self):
-    seq_lens = array_ops.placeholder(dtypes.int32, shape=[None])
-    padded_shape = array_ops.placeholder(dtypes.int64, shape=[1])
+
+def _random_seq_lens(count):
+  return np.random.randint(20, size=(count,)).astype(np.int32)
+
+
+class PaddedBatchDatasetTest(test.TestCase, parameterized.TestCase):
+
+  @parameterized.named_parameters(
+      ('default_padding', _random_seq_lens(32), 4, [-1], False),
+      ('constant_padding', _random_seq_lens(32), 4, [25], False),
+      ('uneven_with_remainder', _random_seq_lens(34), 4, [-1], False),
+      ('uneven_without_remainder', _random_seq_lens(34), 4, [-1], True),
+  )
+  def testPaddedBatchDataset(self, seq_lens, batch_size, padded_shapes,
+                             drop_remainder):
+    """Tests the padded batch dataset logic for various input configurations.
+
+    Args:
+      seq_lens: the input sequence lengths
+      batch_size: the batch size
+      padded_shapes: the padded shapes to use
+      drop_remainder: whether the final, smaller batch should be dropped if
+        batch size does not divide number of inputs evenly
+    """
+
+    seq_lens_t = array_ops.placeholder(dtypes.int32, shape=[None])
+    batch_size_t = array_ops.placeholder(dtypes.int64, shape=[])
+    padded_shapes_t = array_ops.placeholder(dtypes.int64, shape=[1])
+    drop_remainder_t = array_ops.placeholder(dtypes.bool, shape=[])
 
     iterator = (
-        dataset_ops.Dataset.from_tensor_slices(seq_lens)
+        dataset_ops.Dataset.from_tensor_slices(seq_lens_t)
         .map(lambda x: array_ops.fill([x], x)).padded_batch(
-            4, padded_shapes=padded_shape).make_initializable_iterator())
+            batch_size=batch_size_t,
+            drop_remainder=drop_remainder_t,
+            padded_shapes=padded_shapes_t).make_initializable_iterator())
 
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
     with self.test_session() as sess:
-      # Test with random sequence lengths, and max padding.
-      random_seq_lens = np.random.randint(20, size=(32,)).astype(np.int32)
       sess.run(
-          init_op, feed_dict={
-              padded_shape: [-1],
-              seq_lens: random_seq_lens
+          init_op,
+          feed_dict={
+              seq_lens_t: seq_lens,
+              batch_size_t: batch_size,
+              padded_shapes_t: padded_shapes,
+              drop_remainder_t: drop_remainder,
           })
-      for i in range(8):
+
+      num_full_batches = len(seq_lens) // batch_size
+
+      for i in range(num_full_batches):
         result = sess.run(get_next)
-        padded_len = np.max(result)
-        self.assertEqual((4, padded_len), result.shape)
-        for j in range(4):
-          seq_len = random_seq_lens[(i * 4) + j]
+        padded_len = padded_shapes[0]
+        if padded_len is None or padded_len == -1:
+          padded_len = np.max(result)
+        self.assertEqual((batch_size, padded_len), result.shape)
+        for j in range(batch_size):
+          seq_len = seq_lens[(i * batch_size) + j]
           self.assertAllEqual(result[j, :seq_len], [seq_len] * seq_len)
-          self.assertAllEqual(result[j, seq_len:], [0] * (padded_len - seq_len))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+          self.assertAllEqual(result[j, seq_len:],
+                              [0] * (padded_len - seq_len))
 
-      # Test with random sequence lengths, and constant padding.
-      sess.run(
-          init_op, feed_dict={
-              padded_shape: [25],
-              seq_lens: random_seq_lens
-          })
-      for i in range(8):
+      if not drop_remainder and len(seq_lens) % batch_size > 0:
         result = sess.run(get_next)
-        self.assertEqual((4, 25), result.shape)
-        for j in range(4):
-          seq_len = random_seq_lens[(i * 4) + j]
+        padded_len = np.max(result)
+        self.assertEqual((len(seq_lens) % batch_size, padded_len),
+                         result.shape)
+        for j in range(len(seq_lens) % batch_size):
+          seq_len = seq_lens[num_full_batches * batch_size + j]
           self.assertAllEqual(result[j, :seq_len], [seq_len] * seq_len)
-          self.assertAllEqual(result[j, seq_len:], [0] * (25 - seq_len))
+          self.assertAllEqual(result[j, seq_len:],
+                              [0] * (padded_len - seq_len))
+
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
-      # Test correct handling of empty tensors.
-      sess.run(init_op, feed_dict={padded_shape: [-1], seq_lens: [0, 0, 0, 0]})
+  def testPaddedBatchShortPadding(self):
+    iterator = (
+        dataset_ops.Dataset.from_tensor_slices([6, 5, 5, 5, 5])
+        .map(lambda x: array_ops.fill([x], x)).padded_batch(
+            batch_size=4, padded_shapes=[5]).make_one_shot_iterator())
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      with self.assertRaises(errors.DataLossError):
+        sess.run(get_next)
+
+  def testPaddedBatchEmptyTensors(self):
+    iterator = (
+        dataset_ops.Dataset.from_tensor_slices([0, 0, 0, 0])
+        .map(lambda x: array_ops.fill([x], x)).padded_batch(
+            batch_size=4, padded_shapes=[-1]).make_one_shot_iterator())
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
       result = sess.run(get_next)
       self.assertAllEqual([[], [], [], []], result)
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
-      # Test error handling with constant sequence lengths, and
-      # too-short padding.
-      sess.run(init_op, feed_dict={padded_shape: [5], seq_lens: [6, 5, 5, 5]})
-      with self.assertRaises(errors.DataLossError):
-        result = sess.run(get_next)
-
   def testPaddedBatchDatasetNonDefaultPadding(self):
     seq_lens = array_ops.placeholder(dtypes.int32, shape=[None])
     padded_shape = array_ops.placeholder(dtypes.int64, shape=[1])
diff --git a/tensorflow/python/data/ops/dataset_ops.py b/tensorflow/python/data/ops/dataset_ops.py
index 2ec6c6f154..672ce014f6 100644
--- a/tensorflow/python/data/ops/dataset_ops.py
+++ b/tensorflow/python/data/ops/dataset_ops.py
@@ -33,6 +33,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import smart_cond
 from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
@@ -791,7 +792,7 @@ class Dataset(object):
 
     return self._enumerate().filter(filter_fn).map(lambda _, elem: elem)
 
-  def batch(self, batch_size):
+  def batch(self, batch_size, drop_remainder=False):
     """Combines consecutive elements of this dataset into batches.
 
     NOTE: If the number of elements (`N`) in this dataset is not an exact
@@ -803,13 +804,21 @@ class Dataset(object):
     Args:
       batch_size: A `tf.int64` scalar `tf.Tensor`, representing the number of
         consecutive elements of this dataset to combine in a single batch.
+      drop_remainder: (Optional.) A `tf.bool` scalar `tf.Tensor`, representing
+        whether the last batch should be dropped in the case it has fewer than
+        `batch_size` elements; the default behavior is not to drop the smaller
+        batch.
 
     Returns:
       Dataset: A `Dataset`.
     """
-    return BatchDataset(self, batch_size)
+    return BatchDataset(self, batch_size, drop_remainder)
 
-  def padded_batch(self, batch_size, padded_shapes, padding_values=None):
+  def padded_batch(self,
+                   batch_size,
+                   padded_shapes,
+                   padding_values=None,
+                   drop_remainder=False):
     """Combines consecutive elements of this dataset into padded batches.
 
     This transformation combines multiple consecutive elements of the input
@@ -852,11 +861,16 @@ class Dataset(object):
         `tf.Tensor`, representing the padding values to use for the
         respective components.  Defaults are `0` for numeric types and
         the empty string for string types.
+      drop_remainder: (Optional.) A `tf.bool` scalar `tf.Tensor`, representing
+        whether the last batch should be dropped in the case it has fewer than
+        `batch_size` elements; the default behavior is not to drop the smaller
+        batch.
 
     Returns:
       Dataset: A `Dataset`.
     """
-    return PaddedBatchDataset(self, batch_size, padded_shapes, padding_values)
+    return PaddedBatchDataset(self, batch_size, padded_shapes, padding_values,
+                              drop_remainder)
 
   def map(self, map_func, num_parallel_calls=None):
     """Maps `map_func` across this dataset.
@@ -1655,21 +1669,34 @@ class SkipDataset(Dataset):
 class BatchDataset(Dataset):
   """A `Dataset` that batches contiguous elements from its input."""
 
-  def __init__(self, input_dataset, batch_size):
+  def __init__(self, input_dataset, batch_size, drop_remainder):
     """See `Dataset.batch()` for details."""
     super(BatchDataset, self).__init__()
     self._input_dataset = input_dataset
     self._batch_size = ops.convert_to_tensor(
         batch_size, dtype=dtypes.int64, name="batch_size")
+    self._drop_remainder = ops.convert_to_tensor(
+        drop_remainder, dtype=dtypes.bool, name="drop_remainder")
 
   def _as_variant_tensor(self):
-    return gen_dataset_ops.batch_dataset(
-        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
-        batch_size=self._batch_size,
-        output_shapes=nest.flatten(
-            sparse.as_dense_shapes(self.output_shapes, self.output_classes)),
-        output_types=nest.flatten(
-            sparse.as_dense_types(self.output_types, self.output_classes)))
+    # TODO(jsimsa): Switch to using v2 only any time after 6/30/2018.
+    if smart_cond.smart_constant_value(self._drop_remainder) is False:
+      return gen_dataset_ops.batch_dataset(
+          self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
+          batch_size=self._batch_size,
+          output_shapes=nest.flatten(
+              sparse.as_dense_shapes(self.output_shapes, self.output_classes)),
+          output_types=nest.flatten(
+              sparse.as_dense_types(self.output_types, self.output_classes)))
+    else:
+      return gen_dataset_ops.batch_dataset_v2(
+          self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
+          batch_size=self._batch_size,
+          drop_remainder=self._drop_remainder,
+          output_shapes=nest.flatten(
+              sparse.as_dense_shapes(self.output_shapes, self.output_classes)),
+          output_types=nest.flatten(
+              sparse.as_dense_types(self.output_types, self.output_classes)))
 
   @property
   def output_classes(self):
@@ -1679,7 +1706,9 @@ class BatchDataset(Dataset):
   def output_shapes(self):
     input_shapes = self._input_dataset.output_shapes
     return nest.pack_sequence_as(input_shapes, [
-        tensor_shape.vector(None).concatenate(s)
+        tensor_shape.vector(
+            tensor_util.constant_value(self._batch_size) if smart_cond.
+            smart_constant_value(self._drop_remainder) else None).concatenate(s)
         for s in nest.flatten(self._input_dataset.output_shapes)
     ])
 
@@ -1800,7 +1829,8 @@ def _default_padding(input_dataset):
 class PaddedBatchDataset(Dataset):
   """A `Dataset` that batches and pads contiguous elements from its input."""
 
-  def __init__(self, input_dataset, batch_size, padded_shapes, padding_values):
+  def __init__(self, input_dataset, batch_size, padded_shapes, padding_values,
+               drop_remainder):
     """See `Dataset.batch()` for details."""
     super(PaddedBatchDataset, self).__init__()
     if sparse.any_sparse(input_dataset.output_classes):
@@ -1830,18 +1860,34 @@ class PaddedBatchDataset(Dataset):
     self._padding_values = nest.map_structure_up_to(
         input_dataset.output_shapes, _padding_value_to_tensor, padding_values,
         input_dataset.output_types)
+    self._drop_remainder = ops.convert_to_tensor(
+        drop_remainder, dtype=dtypes.bool, name="drop_remainder")
 
   def _as_variant_tensor(self):
-    return gen_dataset_ops.padded_batch_dataset(
-        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
-        batch_size=self._batch_size,
-        padded_shapes=[
-            ops.convert_to_tensor(s, dtype=dtypes.int64)
-            for s in nest.flatten(self._padded_shapes)
-        ],
-        padding_values=nest.flatten(self._padding_values),
-        output_shapes=nest.flatten(
-            sparse.as_dense_shapes(self.output_shapes, self.output_classes)))
+    # TODO(jsimsa): Switch to using v2 only any time after 6/30/2018.
+    if smart_cond.smart_constant_value(self._drop_remainder) is False:
+      return gen_dataset_ops.padded_batch_dataset(
+          self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
+          batch_size=self._batch_size,
+          padded_shapes=[
+              ops.convert_to_tensor(s, dtype=dtypes.int64)
+              for s in nest.flatten(self._padded_shapes)
+          ],
+          padding_values=nest.flatten(self._padding_values),
+          output_shapes=nest.flatten(
+              sparse.as_dense_shapes(self.output_shapes, self.output_classes)))
+    else:
+      return gen_dataset_ops.padded_batch_dataset_v2(
+          self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
+          batch_size=self._batch_size,
+          padded_shapes=[
+              ops.convert_to_tensor(s, dtype=dtypes.int64)
+              for s in nest.flatten(self._padded_shapes)
+          ],
+          padding_values=nest.flatten(self._padding_values),
+          drop_remainder=self._drop_remainder,
+          output_shapes=nest.flatten(
+              sparse.as_dense_shapes(self.output_shapes, self.output_classes)))
 
   @property
   def output_classes(self):
@@ -1851,8 +1897,10 @@ class PaddedBatchDataset(Dataset):
   def output_shapes(self):
 
     def _padded_shape_to_batch_shape(s):
-      return tensor_shape.vector(None).concatenate(
-          tensor_util.constant_value_as_shape(s))
+      return tensor_shape.vector(
+          tensor_util.constant_value(self._batch_size) if smart_cond.
+          smart_constant_value(self._drop_remainder) else None).concatenate(
+              tensor_util.constant_value_as_shape(s))
 
     return nest.map_structure(_padded_shape_to_batch_shape, self._padded_shapes)
 
diff --git a/tensorflow/tools/api/golden/tensorflow.data.-dataset.pbtxt b/tensorflow/tools/api/golden/tensorflow.data.-dataset.pbtxt
index 8e7e945ed1..834f0954d5 100644
--- a/tensorflow/tools/api/golden/tensorflow.data.-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.data.-dataset.pbtxt
@@ -24,7 +24,7 @@ tf_class {
   }
   member_method {
     name: "batch"
-    argspec: "args=[\'self\', \'batch_size\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'batch_size\', \'drop_remainder\'], varargs=None, keywords=None, defaults=[\'False\'], "
   }
   member_method {
     name: "cache"
@@ -80,7 +80,7 @@ tf_class {
   }
   member_method {
     name: "padded_batch"
-    argspec: "args=[\'self\', \'batch_size\', \'padded_shapes\', \'padding_values\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'batch_size\', \'padded_shapes\', \'padding_values\', \'drop_remainder\'], varargs=None, keywords=None, defaults=[\'None\', \'False\'], "
   }
   member_method {
     name: "prefetch"
diff --git a/tensorflow/tools/api/golden/tensorflow.data.-fixed-length-record-dataset.pbtxt b/tensorflow/tools/api/golden/tensorflow.data.-fixed-length-record-dataset.pbtxt
index 5cfb2fd2f0..4d854a4cee 100644
--- a/tensorflow/tools/api/golden/tensorflow.data.-fixed-length-record-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.data.-fixed-length-record-dataset.pbtxt
@@ -25,7 +25,7 @@ tf_class {
   }
   member_method {
     name: "batch"
-    argspec: "args=[\'self\', \'batch_size\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'batch_size\', \'drop_remainder\'], varargs=None, keywords=None, defaults=[\'False\'], "
   }
   member_method {
     name: "cache"
@@ -81,7 +81,7 @@ tf_class {
   }
   member_method {
     name: "padded_batch"
-    argspec: "args=[\'self\', \'batch_size\', \'padded_shapes\', \'padding_values\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'batch_size\', \'padded_shapes\', \'padding_values\', \'drop_remainder\'], varargs=None, keywords=None, defaults=[\'None\', \'False\'], "
   }
   member_method {
     name: "prefetch"
diff --git a/tensorflow/tools/api/golden/tensorflow.data.-t-f-record-dataset.pbtxt b/tensorflow/tools/api/golden/tensorflow.data.-t-f-record-dataset.pbtxt
index 3327e5b274..601f095a60 100644
--- a/tensorflow/tools/api/golden/tensorflow.data.-t-f-record-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.data.-t-f-record-dataset.pbtxt
@@ -25,7 +25,7 @@ tf_class {
   }
   member_method {
     name: "batch"
-    argspec: "args=[\'self\', \'batch_size\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'batch_size\', \'drop_remainder\'], varargs=None, keywords=None, defaults=[\'False\'], "
   }
   member_method {
     name: "cache"
@@ -81,7 +81,7 @@ tf_class {
   }
   member_method {
     name: "padded_batch"
-    argspec: "args=[\'self\', \'batch_size\', \'padded_shapes\', \'padding_values\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'batch_size\', \'padded_shapes\', \'padding_values\', \'drop_remainder\'], varargs=None, keywords=None, defaults=[\'None\', \'False\'], "
   }
   member_method {
     name: "prefetch"
diff --git a/tensorflow/tools/api/golden/tensorflow.data.-text-line-dataset.pbtxt b/tensorflow/tools/api/golden/tensorflow.data.-text-line-dataset.pbtxt
index 9d59375282..587829a4c0 100644
--- a/tensorflow/tools/api/golden/tensorflow.data.-text-line-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.data.-text-line-dataset.pbtxt
@@ -25,7 +25,7 @@ tf_class {
   }
   member_method {
     name: "batch"
-    argspec: "args=[\'self\', \'batch_size\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'batch_size\', \'drop_remainder\'], varargs=None, keywords=None, defaults=[\'False\'], "
   }
   member_method {
     name: "cache"
@@ -81,7 +81,7 @@ tf_class {
   }
   member_method {
     name: "padded_batch"
-    argspec: "args=[\'self\', \'batch_size\', \'padded_shapes\', \'padding_values\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\', \'batch_size\', \'padded_shapes\', \'padding_values\', \'drop_remainder\'], varargs=None, keywords=None, defaults=[\'None\', \'False\'], "
   }
   member_method {
     name: "prefetch"
-- 
GitLab


From f8f70a84c12ab432094f762082e82f5decfe3414 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 8 Jun 2018 16:55:40 -0700
Subject: [PATCH 518/610] Internal change.

PiperOrigin-RevId: 199871863
---
 tensorflow/contrib/lite/kernels/internal/kernel_utils.cc | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/lite/kernels/internal/kernel_utils.cc b/tensorflow/contrib/lite/kernels/internal/kernel_utils.cc
index 09044193c1..36c25388e8 100644
--- a/tensorflow/contrib/lite/kernels/internal/kernel_utils.cc
+++ b/tensorflow/contrib/lite/kernels/internal/kernel_utils.cc
@@ -409,7 +409,7 @@ void LstmStep(
   }
 
   // Save quantization and matmul computation for all zero input.
-  const bool is_cell_state_all_zeros =
+  bool is_cell_state_all_zeros =
       tensor_utils::IsZeroVector(cell_state_ptr, n_batch * n_cell);
 
   // For each batch and cell: update input gate.
@@ -455,6 +455,8 @@ void LstmStep(
                              params->cell_clip, cell_state_ptr);
   }
 
+  is_cell_state_all_zeros =
+      tensor_utils::IsZeroVector(cell_state_ptr, n_batch * n_cell);
   // For each batch and cell: update the output gate.
   if (use_peephole && !is_cell_state_all_zeros) {
     VectorMultiply(cell_to_output_weights_ptr, n_cell,
-- 
GitLab


From 245651f9dce1e787ceb55a3155b26ab45552fc4f Mon Sep 17 00:00:00 2001
From: Ruoxin Sang <rxsang@google.com>
Date: Fri, 8 Jun 2018 17:14:48 -0700
Subject: [PATCH 519/610] Remove logic in RandomAccessInputStream to check for
 out of range read, as it has been done in RandomAccessFile::Read().

PiperOrigin-RevId: 199873976
---
 tensorflow/core/lib/io/random_inputstream.cc | 10 +---------
 1 file changed, 1 insertion(+), 9 deletions(-)

diff --git a/tensorflow/core/lib/io/random_inputstream.cc b/tensorflow/core/lib/io/random_inputstream.cc
index 09336e79cd..e85367df9c 100644
--- a/tensorflow/core/lib/io/random_inputstream.cc
+++ b/tensorflow/core/lib/io/random_inputstream.cc
@@ -45,16 +45,8 @@ Status RandomAccessInputStream::ReadNBytes(int64 bytes_to_read,
   result->resize(data.size());
   if (s.ok() || errors::IsOutOfRange(s)) {
     pos_ += data.size();
-  } else {
-    return s;
   }
-  // If the amount of data we read is less than what we wanted, we return an
-  // out of range error. We need to catch this explicitly since file_->Read()
-  // would not do so if at least 1 byte is read (b/30839063).
-  if (data.size() < bytes_to_read) {
-    return errors::OutOfRange("reached end of file");
-  }
-  return Status::OK();
+  return s;
 }
 
 // To limit memory usage, the default implementation of SkipNBytes() only reads
-- 
GitLab


From cf042e7e90c00d639904e2a5fad8a9cd9d6962da Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 8 Jun 2018 17:18:22 -0700
Subject: [PATCH 520/610] Update ops-related pbtxt files.

PiperOrigin-RevId: 199874337
---
 .../core/ops/compat/ops_history.v1.pbtxt      | 105 ++++++++++++++++++
 tensorflow/core/ops/ops.pbtxt                 | 105 ++++++++++++++++++
 2 files changed, 210 insertions(+)

diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index 71f34b3abe..8f8c90ee97 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -8720,6 +8720,37 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "BatchDatasetV2"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "batch_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "drop_remainder"
+    type: DT_BOOL
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "BatchFFT"
   input_arg {
@@ -35817,6 +35848,52 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "PaddedBatchDatasetV2"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "batch_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "padded_shapes"
+    type: DT_INT64
+    number_attr: "N"
+  }
+  input_arg {
+    name: "padding_values"
+    type_list_attr: "Toutput_types"
+  }
+  input_arg {
+    name: "drop_remainder"
+    type: DT_BOOL
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "Toutput_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "PaddingFIFOQueue"
   output_arg {
@@ -69521,6 +69598,34 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "TensorArrayGradWithShape"
+  input_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "flow_in"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "shape_to_prepend"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "grad_handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "flow_out"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "source"
+    type: "string"
+  }
+  is_stateful: true
+}
 op {
   name: "TensorArrayPack"
   input_arg {
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index 718c1510ed..d3f3e87dfd 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -3004,6 +3004,37 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "BatchDatasetV2"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "batch_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "drop_remainder"
+    type: DT_BOOL
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "BatchFFT"
   input_arg {
@@ -17489,6 +17520,52 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "PaddedBatchDatasetV2"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "batch_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "padded_shapes"
+    type: DT_INT64
+    number_attr: "N"
+  }
+  input_arg {
+    name: "padding_values"
+    type_list_attr: "Toutput_types"
+  }
+  input_arg {
+    name: "drop_remainder"
+    type: DT_BOOL
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "Toutput_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "N"
+    type: "int"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "PaddingFIFOQueue"
   output_arg {
@@ -32439,6 +32516,34 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "TensorArrayGradWithShape"
+  input_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "flow_in"
+    type: DT_FLOAT
+  }
+  input_arg {
+    name: "shape_to_prepend"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "grad_handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "flow_out"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "source"
+    type: "string"
+  }
+  is_stateful: true
+}
 op {
   name: "TensorArrayPack"
   input_arg {
-- 
GitLab


From 49a729901484a413fd605be735da9a563c24336a Mon Sep 17 00:00:00 2001
From: Alan Chiao <alanchiao@google.com>
Date: Fri, 8 Jun 2018 17:19:46 -0700
Subject: [PATCH 521/610] Hybrid embedding lookup op

PiperOrigin-RevId: 199874482
---
 .../contrib/lite/kernels/embedding_lookup.cc  |  57 ++++++++-
 .../lite/kernels/embedding_lookup_test.cc     | 110 +++++++++++++++---
 2 files changed, 147 insertions(+), 20 deletions(-)

diff --git a/tensorflow/contrib/lite/kernels/embedding_lookup.cc b/tensorflow/contrib/lite/kernels/embedding_lookup.cc
index 7539c0b30d..9410bead5e 100644
--- a/tensorflow/contrib/lite/kernels/embedding_lookup.cc
+++ b/tensorflow/contrib/lite/kernels/embedding_lookup.cc
@@ -24,7 +24,8 @@ limitations under the License.
 // Output:
 //   Output.dim[0] == Tensor[0].dim[0], num of lookups
 //   Output.dim[1] == Tensor[1].dim[1],  num of items per row
-//   Each item in output is a raw bytes copy of corresponding item in input.
+//   Each item in output is a raw bytes copy of the corresponding item in input,
+//   or a dequantized value in the case of a uint8 input.
 //   When indices are out of bound, the ops will not succeed.
 //
 
@@ -69,11 +70,9 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   return context->ResizeTensor(context, output, outputSize);
 }
 
-TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
-  TfLiteTensor* output = GetOutput(context, node, 0);
-  const TfLiteTensor* lookup = GetInput(context, node, 0);
-  const TfLiteTensor* value = GetInput(context, node, 1);
-
+TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node,
+                       const TfLiteTensor* lookup, const TfLiteTensor* value,
+                       TfLiteTensor* output) {
   const int row_size = SizeOfDimension(value, 0);
   const int row_bytes = value->bytes / row_size;
 
@@ -91,6 +90,52 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   return kTfLiteOk;
 }
 
+TfLiteStatus EvalHybrid(TfLiteContext* context, TfLiteNode* node,
+                        const TfLiteTensor* lookup, const TfLiteTensor* value,
+                        TfLiteTensor* output) {
+  const int row_size = SizeOfDimension(value, 0);
+  const double scaling_factor = 1.0 / value->params.scale;
+
+  // col_size after we flatten tensor into 2D.
+  int col_size = 1;
+  for (int i = 1; i < NumDimensions(value); i++) {
+    col_size *= SizeOfDimension(value, i);
+  }
+
+  for (int i = 0; i < SizeOfDimension(lookup, 0); i++) {
+    int idx = lookup->data.i32[i];
+    if (idx >= row_size || idx < 0) {
+      context->ReportError(context, "Embedding Lookup: index out of bounds.");
+      return kTfLiteError;
+    } else {
+      // Dequantize embedding values.
+      // TODO(alanchiao): refactor scalar multiply into separate function
+      // for ease of adding a neon equivalent if ever necessary.
+      for (int j = 0; j < col_size; j++) {
+        output->data.f[j + i * col_size] =
+            value->data.uint8[j + idx * col_size] * scaling_factor;
+      }
+    }
+  }
+
+  return kTfLiteOk;
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  const TfLiteTensor* lookup = GetInput(context, node, 0);
+  const TfLiteTensor* value = GetInput(context, node, 1);
+  TfLiteTensor* output = GetOutput(context, node, 0);
+  switch (value->type) {
+    case kTfLiteFloat32:
+      return EvalFloat(context, node, lookup, value, output);
+    case kTfLiteUInt8:
+      return EvalHybrid(context, node, lookup, value, output);
+    default:
+      context->ReportError(context, "Type not currently supported.");
+      return kTfLiteError;
+  }
+}
+
 }  // namespace embedding_lookup
 
 TfLiteRegistration* Register_EMBEDDING_LOOKUP() {
diff --git a/tensorflow/contrib/lite/kernels/embedding_lookup_test.cc b/tensorflow/contrib/lite/kernels/embedding_lookup_test.cc
index 9b501878f1..04657fd863 100644
--- a/tensorflow/contrib/lite/kernels/embedding_lookup_test.cc
+++ b/tensorflow/contrib/lite/kernels/embedding_lookup_test.cc
@@ -7,13 +7,14 @@ You may obtain a copy of the License at
     http://www.apache.org/licenses/LICENSE-2.0
 
 Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
+distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License
+for the specific language governing permissions and limitations under the
+License.
 ==============================================================================*/
 // Unit test for TFLite Lookup op.
 
+#include <initializer_list>
 #include <iomanip>
 #include <vector>
 
@@ -29,12 +30,13 @@ namespace {
 
 using ::testing::ElementsAreArray;
 
-class EmbeddingLookupOpModel : public SingleOpModel {
+class BaseEmbeddingLookupOpModel : public SingleOpModel {
  public:
-  EmbeddingLookupOpModel(std::initializer_list<int> index_shape,
-                         std::initializer_list<int> weight_shape) {
+  BaseEmbeddingLookupOpModel(std::initializer_list<int> index_shape,
+                             std::initializer_list<int> weight_shape,
+                             TensorType weight_type = TensorType_FLOAT32) {
     input_ = AddInput(TensorType_INT32);
-    weight_ = AddInput(TensorType_FLOAT32);
+    weight_ = AddInput(weight_type);
     output_ = AddOutput(TensorType_FLOAT32);
     SetBuiltinOp(BuiltinOperator_EMBEDDING_LOOKUP, BuiltinOptions_NONE, 0);
     BuildInterpreter({index_shape, weight_shape});
@@ -44,6 +46,18 @@ class EmbeddingLookupOpModel : public SingleOpModel {
     PopulateTensor(input_, data);
   }
 
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+
+ protected:
+  int input_;
+  int weight_;
+  int output_;
+};
+
+class EmbeddingLookupOpModel : public BaseEmbeddingLookupOpModel {
+ public:
+  using BaseEmbeddingLookupOpModel::BaseEmbeddingLookupOpModel;
+
   void Set3DWeightMatrix(const std::function<float(int, int, int)>& function) {
     TfLiteTensor* tensor = interpreter_->tensor(weight_);
     int rows = tensor->dims->data[0];
@@ -57,20 +71,25 @@ class EmbeddingLookupOpModel : public SingleOpModel {
       }
     }
   }
+};
 
-  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+class HybridEmbeddingLookupOpModel : public BaseEmbeddingLookupOpModel {
+ public:
+  HybridEmbeddingLookupOpModel(std::initializer_list<int> index_shape,
+                               std::initializer_list<int> weight_shape)
+      : BaseEmbeddingLookupOpModel(index_shape, weight_shape,
+                                   TensorType_UINT8) {}
 
- private:
-  int input_;
-  int weight_;
-  int output_;
+  void SetWeight(std::initializer_list<float> data) {
+    SymmetricQuantizeAndPopulate(weight_, data);
+  }
 };
 
 // TODO(ahentz): write more tests that exercise the details of the op, such as
 // lookup errors and variable input shapes.
 TEST(EmbeddingLookupOpTest, SimpleTest) {
   EmbeddingLookupOpModel m({3}, {3, 2, 4});
-  m.PopulateTensor<int>(0, {1, 0, 2});
+  m.SetInput({1, 0, 2});
   m.Set3DWeightMatrix(
       [](int i, int j, int k) { return i + j / 10.0f + k / 100.0f; });
 
@@ -84,6 +103,69 @@ TEST(EmbeddingLookupOpTest, SimpleTest) {
               })));
 }
 
+TEST(HybridEmbeddingLookupHybridOpTest, Simple2DTest) {
+  HybridEmbeddingLookupOpModel m({3}, {3, 8});
+  m.SetInput({1, 0, 2});
+  m.SetWeight({
+      0.00, 0.01, 0.02, 0.03, 0.10, 0.11, 0.12, 0.13,  // Row 0
+      1.00, 1.01, 1.02, 1.03, 1.10, 1.11, 1.12, 1.13,  // Row 1
+      2.00, 2.01, 2.02, 2.03, 2.10, 2.11, 2.12, 2.13,  // Row 2
+  });
+
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray(ArrayFloatNear(
+                  {
+                      1.00, 1.01, 1.02, 1.03, 1.10, 1.11, 1.12, 1.13,  // Row 1
+                      0.00, 0.01, 0.02, 0.03, 0.10, 0.11, 0.12, 0.13,  // Row 0
+                      2.00, 2.01, 2.02, 2.03, 2.10, 2.11, 2.12, 2.13,  // Row 2
+                  },
+                  7.41e-03)));
+}
+
+TEST(HybridEmbeddingLookupHybridOpTest, Simple3DTest) {
+  HybridEmbeddingLookupOpModel m({3}, {3, 2, 4});
+  m.SetInput({1, 0, 2});
+  m.SetWeight({
+      0.00, 0.01, 0.02, 0.03, 0.10, 0.11, 0.12, 0.13,  // Row 0
+      1.00, 1.01, 1.02, 1.03, 1.10, 1.11, 1.12, 1.13,  // Row 1
+      2.00, 2.01, 2.02, 2.03, 2.10, 2.11, 2.12, 2.13,  // Row 2
+  });
+
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray(ArrayFloatNear(
+                  {
+                      1.00, 1.01, 1.02, 1.03, 1.10, 1.11, 1.12, 1.13,  // Row 1
+                      0.00, 0.01, 0.02, 0.03, 0.10, 0.11, 0.12, 0.13,  // Row 0
+                      2.00, 2.01, 2.02, 2.03, 2.10, 2.11, 2.12, 2.13,  // Row 2
+                  },
+                  7.41e-03)));
+}
+
+TEST(HybridEmbeddingLookupHybridOpTest, Simple4DTest) {
+  HybridEmbeddingLookupOpModel m({3}, {3, 2, 2, 2});
+  m.SetInput({1, 0, 2});
+  m.SetWeight({
+      0.00, 0.01, 0.02, 0.03, 0.10, 0.11, 0.12, 0.13,  // Row 0
+      1.00, 1.01, 1.02, 1.03, 1.10, 1.11, 1.12, 1.13,  // Row 1
+      2.00, 2.01, 2.02, 2.03, 2.10, 2.11, 2.12, 2.13,  // Row 2
+  });
+
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray(ArrayFloatNear(
+                  {
+                      1.00, 1.01, 1.02, 1.03, 1.10, 1.11, 1.12, 1.13,  // Row 1
+                      0.00, 0.01, 0.02, 0.03, 0.10, 0.11, 0.12, 0.13,  // Row 0
+                      2.00, 2.01, 2.02, 2.03, 2.10, 2.11, 2.12, 2.13,  // Row 2
+                  },
+                  7.41e-03)));
+}
+
 }  // namespace
 }  // namespace tflite
 
-- 
GitLab


From f81f62a0d35ccf7c4e83e09510447d93933ef87e Mon Sep 17 00:00:00 2001
From: Yu-Cheng Ling <ycling@google.com>
Date: Fri, 8 Jun 2018 17:21:47 -0700
Subject: [PATCH 522/610] Document TFLite Ops Versioning

PiperOrigin-RevId: 199874647
---
 .../contrib/lite/g3doc/ops_versioning.md      | 206 ++++++++++++++++++
 1 file changed, 206 insertions(+)
 create mode 100644 tensorflow/contrib/lite/g3doc/ops_versioning.md

diff --git a/tensorflow/contrib/lite/g3doc/ops_versioning.md b/tensorflow/contrib/lite/g3doc/ops_versioning.md
new file mode 100644
index 0000000000..bd2f797e6c
--- /dev/null
+++ b/tensorflow/contrib/lite/g3doc/ops_versioning.md
@@ -0,0 +1,206 @@
+# TensorFlow Lite Ops Versioning
+
+This document describes TensorFlow Lite's op versioning schema. Op
+versioning enables developers to add new functionalities and parameters into
+existing ops. In addition, it guarantees the following:
+
+*   Backward compatibility: A new TensorFlow Lite implementation should
+    handle an old model file.
+*   Forward compatibility: An old TensorFlow Lite implementation should
+    handle a new model file produced by a new version of TOCO, as long as no
+    new features are used.
+*   Forward incompatibility detection: If an old TensorFlow Lite implementation
+    reads a new model that contains a new version of an op which isn't
+    supported, it should report an error.
+
+## Example: Adding Dilation into Convolution
+
+The remainder of this document explains op versioning in TFLite by showing how
+to add dilation parameters to the convolution operation.
+
+Knowledge of dilation is not required to understand this document. Note that:
+
+*   2 new integer parameters will be added: `dilation_width_factor` and
+    `dilation_height_factor`.
+*   Old convolution kernels that don't support dilation are equivalent to
+    setting the dilation factors to 1.
+
+### Change FlatBuffer Schema
+
+To add new parameters into an op, change the options table in
+`lite/schema/schema.fbs`.
+
+For example, the options table of convolution looks like this:
+
+```
+table Conv2DOptions {
+  padding:Padding;
+  stride_w:int;
+  stride_h:int;
+  fused_activation_function:ActivationFunctionType;
+}
+```
+
+When adding new parameters:
+
+*   Add comments indicating which parameters are supported by which version.
+*   When the new implementation gets the default values for newly added
+    parameters, it should work exactly the same as the old implementation.
+
+The table will be like this after the new parameters are added:
+
+```
+table Conv2DOptions {
+  // Parameters supported by version 1:
+  padding:Padding;
+  stride_w:int;
+  stride_h:int;
+  fused_activation_function:ActivationFunctionType;
+
+  // Parameters supported by version 2:
+  dilation_width_factor:int = 1;
+  dilation_height_factor:int = 1;
+}
+```
+
+### Change C Structures and Kernel Implementation
+
+In TensorFlow Lite, the kernel implementation is decoupled from the
+FlatBuffer definition. The kernels read their parameters from C structures
+defined in `lite/builtin_op_data.h`.
+
+The original convolution parameter structure is as follows:
+
+```
+typedef struct {
+  TfLitePadding padding;
+  int stride_width;
+  int stride_height;
+  TfLiteFusedActivation activation;
+} TfLiteConvParams;
+```
+
+As with the FlatBuffer schema, add comments indicating which parameters are
+supported starting from which version. The result is seen below:
+
+```
+typedef struct {
+  // Parameters supported by version 1:
+  TfLitePadding padding;
+  int stride_width;
+  int stride_height;
+  TfLiteFusedActivation activation;
+  // Parameters supported by version 2:
+  int dilation_width_factor;
+  int dilation_height_factor;
+} TfLiteConvParams;
+```
+
+Please also change the kernel implementation to read the newly added parameters
+from the C structures. The details are omitted here.
+
+### Change the FlatBuffer Reading Code
+
+The logic to read the FlatBuffer and produce C structures is in `lite/model.cc`.
+
+Update the file to handle the new parameters, as shown below:
+
+```
+case BuiltinOperator_CONV_2D: {
+  TfLiteConvParams* params = MallocPOD<TfLiteConvParams>();
+  if (auto* conv_params = op->builtin_options_as_Conv2DOptions()) {
+    params->padding = parse_padding(conv_params->padding());
+    params->stride_width = conv_params->stride_w();
+    params->stride_height = conv_params->stride_h();
+    params->activation =
+        parse_activation(conv_params->fused_activation_function());
+    params->dilation_width_factor = conv_params->dilation_width_factor();
+    params->dilation_height_factor = conv_params->dilation_height_factor();
+  }
+  *builtin_data = reinterpret_cast<void*>(params);
+  break;
+}
+```
+
+It's not required to check the op version here. When the new implementation
+reads an old model file where dilation factors are missing, it will use 1 as
+the default value, and the new kernel will work consistently with the old
+kernel.
+
+### Change Kernel Registration
+
+The MutableOpResolver (defined in `lite/op_resolver.h`) provides a few functions
+to register op kernels. The minimum and maximum versions are 1 by default:
+
+```
+void AddBuiltin(tflite::BuiltinOperator op, TfLiteRegistration* registration,
+                int min_version = 1, int max_version = 1);
+void AddCustom(const char* name, TfLiteRegistration* registration,
+               int min_version = 1, int max_version = 1);
+```
+
+The built-in ops are registered in `lite/kernels/register.cc`. In this example,
+we implemented a new op kernel which can handle `Conv2D` version 1 and 2, so we
+need to change this line:
+
+```
+AddBuiltin(BuiltinOperator_CONV_2D, Register_CONV_2D());
+```
+
+to:
+
+```
+AddBuiltin(BuiltinOperator_CONV_2D, Register_CONV_2D(), 1, 2);
+```
+
+### Change TOCO TFLite exporter
+
+The last step is to make TOCO populate the minimum version that's required to
+execute the op. In this example, it means:
+
+*   Populate version=1 when dilation factors are all 1.
+*   Populate version=2 otherwise.
+
+To do this, override the `GetVersion` function for the operator class in
+`lite/toco/tflite/operator.cc`.
+
+For ops with only one version, the `GetVersion` function is defined as:
+
+```
+int GetVersion(const Operator& op) const override { return 1; }
+```
+
+When supporting multiple versions, check the parameters and determine the
+version for the op, as shown in the following example:
+
+```
+int GetVersion(const Operator& op) const override {
+  const auto& conv_op = static_cast<const ConvOperator&>(op);
+  if (conv_op.dilation_width_factor != 1 ||
+      conv_op.dilation_height_factor != 1) {
+    return 2;
+  }
+  return 1;
+}
+```
+
+### Delegation Implementation
+
+TensorFlow Lite provides a delegation API which enables delegating ops to
+hardware backends. In the delegate's `Prepare` function, check whether the
+op version of every node is supported by the delegate.
+
+```
+const int kMaxVersion = 1;
+TfLiteNode* node;
+TfLiteRegistration* registration;
+context->GetNodeAndRegistration(context, node_index, &node, &registration);
+
+if (registration->version > kMaxVersion) {
+  // Reject the node if the version isn't supported.
+}
+```
+
+This is required even if the delegate only supports version 1 ops, so that it
+can detect the incompatibility when it is given a higher-version op.
+
-- 
GitLab


From 80459fe0fdcb86b286311559c65a7ec43525e278 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 8 Jun 2018 17:39:58 -0700
Subject: [PATCH 523/610] Cleanup shape_inference.

PiperOrigin-RevId: 199876297
---
 .../compiler/xla/service/shape_inference.cc   | 346 +++++-------------
 .../compiler/xla/service/shape_inference.h    |  18 +-
 .../xla/service/shape_inference_test.cc       | 144 ++++----
 .../xla/tests/broadcast_simple_test.cc        |   4 +-
 tensorflow/compiler/xla/tests/map_test.cc     |   7 +-
 tensorflow/compiler/xla/xla_data.proto        | 126 -------
 6 files changed, 177 insertions(+), 468 deletions(-)

diff --git a/tensorflow/compiler/xla/service/shape_inference.cc b/tensorflow/compiler/xla/service/shape_inference.cc
index fdc7f41759..bd98e86b08 100644
--- a/tensorflow/compiler/xla/service/shape_inference.cc
+++ b/tensorflow/compiler/xla/service/shape_inference.cc
@@ -44,129 +44,6 @@ namespace xla {
 
 namespace {
 
-// Return the UnaryOperation proto enum value associated with the given HLO
-// opcode.
-UnaryOperation OpcodeToUnaryOperation(HloOpcode opcode) {
-  switch (opcode) {
-    case HloOpcode::kAbs:
-      return UNOP_ABS;
-    case HloOpcode::kCeil:
-      return UNOP_CEIL;
-    case HloOpcode::kClz:
-      return UNOP_CLZ;
-    case HloOpcode::kCos:
-      return UNOP_COS;
-    case HloOpcode::kExp:
-      return UNOP_EXP;
-    case HloOpcode::kExpm1:
-      return UNOP_EXPM1;
-    case HloOpcode::kFloor:
-      return UNOP_FLOOR;
-    case HloOpcode::kImag:
-      return UNOP_IMAG;
-    case HloOpcode::kIsFinite:
-      return UNOP_IS_FINITE;
-    case HloOpcode::kLog:
-      return UNOP_LOG;
-    case HloOpcode::kLog1p:
-      return UNOP_LOG1P;
-    case HloOpcode::kNot:
-      return UNOP_NOT;
-    case HloOpcode::kNegate:
-      return UNOP_NEGATE;
-    case HloOpcode::kReal:
-      return UNOP_REAL;
-    case HloOpcode::kRoundNearestAfz:
-      return UNOP_ROUND_NEAREST_AFZ;
-    case HloOpcode::kSign:
-      return UNOP_SIGN;
-    case HloOpcode::kSin:
-      return UNOP_SIN;
-    case HloOpcode::kSort:
-      return UNOP_SORT;
-    case HloOpcode::kTanh:
-      return UNOP_TANH;
-    default:
-      LOG(FATAL) << "Unhandled opcode for conversion to unary operation: "
-                 << opcode;
-  }
-}
-
-// Return the BinaryOperation proto enum value associated with the given HLO
-// opcode.
-BinaryOperation OpcodeToBinaryOperation(HloOpcode opcode) {
-  switch (opcode) {
-    case HloOpcode::kAtan2:
-      return BINOP_ATAN2;
-    case HloOpcode::kComplex:
-      return BINOP_COMPLEX;
-    case HloOpcode::kMultiply:
-      return BINOP_MUL;
-    case HloOpcode::kAdd:
-      return BINOP_ADD;
-    case HloOpcode::kSubtract:
-      return BINOP_SUB;
-    case HloOpcode::kDivide:
-      return BINOP_DIV;
-    case HloOpcode::kEq:
-      return BINOP_EQ;
-    case HloOpcode::kGe:
-      return BINOP_GE;
-    case HloOpcode::kGt:
-      return BINOP_GT;
-    case HloOpcode::kLe:
-      return BINOP_LE;
-    case HloOpcode::kLt:
-      return BINOP_LT;
-    case HloOpcode::kNe:
-      return BINOP_NE;
-    case HloOpcode::kMaximum:
-      return BINOP_MAX;
-    case HloOpcode::kMinimum:
-      return BINOP_MIN;
-    case HloOpcode::kPower:
-      return BINOP_POW;
-    case HloOpcode::kRemainder:
-      return BINOP_REM;
-    case HloOpcode::kOr:
-      return BINOP_OR;
-    case HloOpcode::kAnd:
-      return BINOP_AND;
-    case HloOpcode::kShiftLeft:
-      return BINOP_SHIFT_LEFT;
-    case HloOpcode::kShiftRightArithmetic:
-      return BINOP_SHIFT_RIGHT_ARITHMETIC;
-    case HloOpcode::kShiftRightLogical:
-      return BINOP_SHIFT_RIGHT_LOGICAL;
-    default:
-      LOG(FATAL) << "unhandled opcode " << opcode;
-  }
-}
-
-// Return the TernaryOperation proto enum value associated with the given HLO
-// opcode.
-TernaryOperation OpcodeToTernaryOperation(HloOpcode opcode) {
-  switch (opcode) {
-    case HloOpcode::kClamp:
-      return TRIOP_CLAMP;
-    case HloOpcode::kSelect:
-      return TRIOP_SELECT;
-    default:
-      LOG(FATAL) << "unhandled opcode " << opcode;
-  }
-}
-
-// Return the VariadicOperation proto enum value associated with the given HLO
-// opcode.
-VariadicOperation OpcodeToVariadicOperation(HloOpcode opcode) {
-  switch (opcode) {
-    case HloOpcode::kTuple:
-      return VAROP_TUPLE;
-    default:
-      LOG(FATAL) << "unhandled opcode " << opcode;
-  }
-}
-
 // Returns true if no element is present in slice more than once.
 bool AllUnique(tensorflow::gtl::ArraySlice<int64> slice) {
   return std::set<int64>(slice.begin(), slice.end()).size() == slice.size();
@@ -321,84 +198,81 @@ StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
     return shape;
   }
 
-  return InferUnaryOpShape(OpcodeToUnaryOperation(opcode), shape);
-}
+  TF_RETURN_IF_ERROR(
+      ExpectNotTupleOrOpaque(shape, "operand of unary operation"));
 
-/* static */ StatusOr<Shape> ShapeInference::InferUnaryOpShape(
-    UnaryOperation operation, const Shape& arg) {
-  TF_RETURN_IF_ERROR(ExpectNotTupleOrOpaque(arg, "operand of unary operation"));
-
-  TF_DCHECK_OK(ShapeUtil::ValidateShapeWithOptionalLayout(arg));
-  switch (operation) {
-    case UNOP_FLOOR:
-    case UNOP_CEIL:
-      if (!ShapeUtil::ElementIsFloating(arg)) {
+  TF_DCHECK_OK(ShapeUtil::ValidateShapeWithOptionalLayout(shape));
+  switch (opcode) {
+    case HloOpcode::kFloor:
+    case HloOpcode::kCeil:
+      if (!ShapeUtil::ElementIsFloating(shape)) {
         return InvalidArgument(
             "Expected element type in shape to be floating for floor/ceil "
             "operation; got %s.",
-            PrimitiveType_Name(arg.element_type()).c_str());
+            PrimitiveType_Name(shape.element_type()).c_str());
       }
-      return arg;
-    case UNOP_COS:
-    case UNOP_SIN:
-    case UNOP_EXP:
-    case UNOP_EXPM1:
-    case UNOP_LOG:
-    case UNOP_LOG1P:
-    case UNOP_TANH:
-      if (!ShapeUtil::ElementIsFloating(arg) &&
-          !ShapeUtil::ElementIsComplex(arg)) {
+      return shape;
+    case HloOpcode::kCos:
+    case HloOpcode::kSin:
+    case HloOpcode::kExp:
+    case HloOpcode::kExpm1:
+    case HloOpcode::kLog:
+    case HloOpcode::kLog1p:
+    case HloOpcode::kTanh:
+      if (!ShapeUtil::ElementIsFloating(shape) &&
+          !ShapeUtil::ElementIsComplex(shape)) {
         return InvalidArgument(
             "Expected element type in shape to be floating or complex for "
             "sin/cos/exp/log/tanh operation; got %s.",
-            PrimitiveType_Name(arg.element_type()).c_str());
+            PrimitiveType_Name(shape.element_type()).c_str());
       }
-      return arg;
-    case UNOP_REAL:
-    case UNOP_IMAG:
-      if (!ShapeUtil::ElementIsComplex(arg)) {
+      return shape;
+    case HloOpcode::kReal:
+    case HloOpcode::kImag:
+      if (!ShapeUtil::ElementIsComplex(shape)) {
         return InvalidArgument(
             "Expected element type in shape to be complex for real/imag "
             "operation; got %s.",
-            PrimitiveType_Name(arg.element_type()).c_str());
+            PrimitiveType_Name(shape.element_type()).c_str());
       }
-      return ShapeUtil::ChangeElementType(arg, F32);
-    case UNOP_ABS:
-      if (ShapeUtil::ElementIsComplex(arg)) {
+      return ShapeUtil::ChangeElementType(shape, F32);
+    case HloOpcode::kAbs:
+      if (ShapeUtil::ElementIsComplex(shape)) {
         return ShapeUtil::ChangeElementType(
-            arg, primitive_util::ComplexComponentType(arg.element_type()));
+            shape, primitive_util::ComplexComponentType(shape.element_type()));
       }
-      return arg;
-    case UNOP_CLZ:
-    case UNOP_NEGATE:
-    case UNOP_ROUND_NEAREST_AFZ:
-    case UNOP_SIGN:
-    case UNOP_SORT:
-      return arg;
-
-    case UNOP_NOT:
-      if (arg.element_type() != PRED &&
-          !primitive_util::IsIntegralType(arg.element_type())) {
+      return shape;
+    case HloOpcode::kClz:
+    case HloOpcode::kNegate:
+    case HloOpcode::kRoundNearestAfz:
+    case HloOpcode::kSign:
+    case HloOpcode::kSort:
+      return shape;
+
+    case HloOpcode::kNot:
+      if (shape.element_type() != PRED &&
+          !primitive_util::IsIntegralType(shape.element_type())) {
         return InvalidArgument(
             "Expected pred or an integral element type in argument to Not "
             "operation; got %s.",
-            PrimitiveType_Name(arg.element_type()).c_str());
+            PrimitiveType_Name(shape.element_type()).c_str());
       }
-      return arg;
+      return shape;
 
-    case UNOP_IS_FINITE:
-      if (!ShapeUtil::ElementIsFloating(arg)) {
+    case HloOpcode::kIsFinite:
+      if (!ShapeUtil::ElementIsFloating(shape)) {
         return InvalidArgument(
-            "Expected element type in shape to be floating point for IsFinite "
+            "Expected element type in shape to be floating "
+            "point for IsFinite "
             "operation; got %s.",
-            PrimitiveType_Name(arg.element_type()).c_str());
+            PrimitiveType_Name(shape.element_type()).c_str());
       }
-      return ShapeUtil::ChangeElementType(arg, PRED);
+      return ShapeUtil::ChangeElementType(shape, PRED);
 
     default:
       return InvalidArgument(
           "Unknown operation for unary shape inference: \"%s\".",
-          UnaryOperation_Name(operation).c_str());
+          HloOpcodeString(opcode).c_str());
   }
 }
 
@@ -779,8 +653,9 @@ Status ValidateDotDimensionNumbers(
 }
 
 /* static */ StatusOr<Shape>
-ShapeInference::InferDegenerateDimensionBroadcastShape(
-    BinaryOperation operation, const Shape& lhs, const Shape& rhs) {
+ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
+                                                       const Shape& lhs,
+                                                       const Shape& rhs) {
   TF_RET_CHECK(ShapeUtil::Rank(lhs) == ShapeUtil::Rank(rhs));
 
   // The shapes have to be compatible. That is, if some dimension d has a
@@ -798,7 +673,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
     } else {
       return InvalidArgument(
           "Binary op %s with incompatible shapes: %s and %s.",
-          BinaryOperation_Name(operation).c_str(),
+          HloOpcodeString(operation).c_str(),
           ShapeUtil::HumanString(lhs).c_str(),
           ShapeUtil::HumanString(rhs).c_str());
     }
@@ -808,8 +683,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
 }
 
 /* static */ StatusOr<Shape> ShapeInference::InferInDimBroadcastShape(
-    BinaryOperation operation, const Shape& smaller_shape,
-    const Shape& larger_shape,
+    const Shape& smaller_shape, const Shape& larger_shape,
     tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
   if (broadcast_dimensions.empty() && !ShapeUtil::IsScalar(smaller_shape)) {
     // Reject "magic" inference for binops on different shapes, requiring
@@ -910,7 +784,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
 }
 
 /* static */ StatusOr<Shape> ShapeInference::InferElementwiseBinaryOpShape(
-    BinaryOperation operation, const Shape& lhs, const Shape& rhs,
+    HloOpcode operation, const Shape& lhs, const Shape& rhs,
     tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
   TF_RETURN_IF_ERROR(
       ExpectNotTupleOrOpaque(lhs, "lhs of elementwise binary operation"));
@@ -920,8 +794,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
   if (!ShapeUtil::SameElementTypeIgnoringFpPrecision(lhs, rhs)) {
     return InvalidArgument(
         "Binary op %s with different element types: %s and %s.",
-        BinaryOperation_Name(operation).c_str(),
-        ShapeUtil::HumanString(lhs).c_str(),
+        HloOpcodeString(operation).c_str(), ShapeUtil::HumanString(lhs).c_str(),
         ShapeUtil::HumanString(rhs).c_str());
   }
 
@@ -954,10 +827,9 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
         ShapeUtil::Rank(lhs) > ShapeUtil::Rank(rhs) ? rhs : lhs;
 
     // After InDim broadcasting, perform degenerate dimensions broadcasting.
-    TF_ASSIGN_OR_RETURN(
-        Shape indim_broadcast_shape,
-        InferInDimBroadcastShape(operation, smaller_shape, larger_shape,
-                                 broadcast_dimensions));
+    TF_ASSIGN_OR_RETURN(Shape indim_broadcast_shape,
+                        InferInDimBroadcastShape(smaller_shape, larger_shape,
+                                                 broadcast_dimensions));
 
     return InferDegenerateDimensionBroadcastShape(
         operation, indim_broadcast_shape, larger_shape);
@@ -966,51 +838,44 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
 
 /* static */ StatusOr<Shape> ShapeInference::InferBinaryOpShape(
     HloOpcode opcode, const HloInstruction* lhs, const HloInstruction* rhs) {
-  return InferBinaryOpShape(OpcodeToBinaryOperation(opcode), lhs->shape(),
-                            rhs->shape(), /*broadcast_dimensions=*/{});
+  return InferBinaryOpShape(opcode, lhs->shape(), rhs->shape(),
+                            /*broadcast_dimensions=*/{});
 }
 
 /* static */ StatusOr<Shape> ShapeInference::InferBinaryOpShape(
     HloOpcode opcode, const Shape& lhs, const Shape& rhs,
     tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
-  return InferBinaryOpShape(OpcodeToBinaryOperation(opcode), lhs, rhs,
-                            broadcast_dimensions);
-}
-
-/* static */ StatusOr<Shape> ShapeInference::InferBinaryOpShape(
-    BinaryOperation operation, const Shape& lhs, const Shape& rhs,
-    tensorflow::gtl::ArraySlice<int64> broadcast_dimensions) {
   VLOG(2) << tensorflow::strings::Printf(
       "inferring shape for <%s>(%s, %s) with broadcast_dimensions={%s}",
-      BinaryOperation_Name(operation).c_str(),
-      ShapeUtil::HumanString(lhs).c_str(), ShapeUtil::HumanString(rhs).c_str(),
+      HloOpcodeString(opcode).c_str(), ShapeUtil::HumanString(lhs).c_str(),
+      ShapeUtil::HumanString(rhs).c_str(),
       Join(broadcast_dimensions, ", ").c_str());
   TF_DCHECK_OK(ShapeUtil::ValidateShapeWithOptionalLayout(lhs));
   TF_DCHECK_OK(ShapeUtil::ValidateShapeWithOptionalLayout(rhs));
 
   TF_RETURN_IF_ERROR(ExpectNotTupleOrOpaque(
       lhs, tensorflow::strings::StrCat("lhs of binary operation ",
-                                       BinaryOperation_Name(operation))));
+                                       HloOpcodeString(opcode))));
   TF_RETURN_IF_ERROR(ExpectNotTupleOrOpaque(
       rhs, tensorflow::strings::StrCat("rhs of binary operation ",
-                                       BinaryOperation_Name(operation))));
-  switch (operation) {
-    case BINOP_MAX:
-    case BINOP_MIN:
-    case BINOP_SUB:
-    case BINOP_ADD:
-    case BINOP_ATAN2:
-    case BINOP_POW:
-    case BINOP_DIV:
-    case BINOP_REM:
-    case BINOP_MUL:
-    case BINOP_SHIFT_LEFT:
-    case BINOP_SHIFT_RIGHT_ARITHMETIC:
-    case BINOP_SHIFT_RIGHT_LOGICAL:
-      return InferElementwiseBinaryOpShape(operation, lhs, rhs,
+                                       HloOpcodeString(opcode))));
+  switch (opcode) {
+    case HloOpcode::kMaximum:
+    case HloOpcode::kMinimum:
+    case HloOpcode::kSubtract:
+    case HloOpcode::kAdd:
+    case HloOpcode::kAtan2:
+    case HloOpcode::kPower:
+    case HloOpcode::kDivide:
+    case HloOpcode::kRemainder:
+    case HloOpcode::kMultiply:
+    case HloOpcode::kShiftLeft:
+    case HloOpcode::kShiftRightArithmetic:
+    case HloOpcode::kShiftRightLogical:
+      return InferElementwiseBinaryOpShape(opcode, lhs, rhs,
                                            broadcast_dimensions);
 
-    case BINOP_COMPLEX: {
+    case HloOpcode::kComplex: {
       if (!ShapeUtil::ElementIsFloating(lhs)) {
         return InvalidArgument(
             "Expected element type in shape to be floating for complex compose "
@@ -1018,7 +883,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
             PrimitiveType_Name(lhs.element_type()).c_str());
       }
       TF_ASSIGN_OR_RETURN(const Shape& shape,
-                          InferElementwiseBinaryOpShape(operation, lhs, rhs,
+                          InferElementwiseBinaryOpShape(opcode, lhs, rhs,
                                                         broadcast_dimensions));
       if (lhs.element_type() == F32 && rhs.element_type() == F32) {
         return ShapeUtil::ChangeElementType(shape, C64);
@@ -1026,8 +891,8 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
         return Unimplemented("Complex component type is not implemented.");
       }
     }
-    case BINOP_AND:
-    case BINOP_OR:
+    case HloOpcode::kAnd:
+    case HloOpcode::kOr:
       if (lhs.element_type() != PRED &&
           !primitive_util::IsIntegralType(lhs.element_type())) {
         return InvalidArgument(
@@ -1035,24 +900,24 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
             "got %s.",
             PrimitiveType_Name(lhs.element_type()).c_str());
       }
-      return InferElementwiseBinaryOpShape(operation, lhs, rhs,
+      return InferElementwiseBinaryOpShape(opcode, lhs, rhs,
                                            broadcast_dimensions);
-    case BINOP_EQ:
-    case BINOP_GE:
-    case BINOP_GT:
-    case BINOP_LE:
-    case BINOP_LT:
-    case BINOP_NE: {
+    case HloOpcode::kEq:
+    case HloOpcode::kGe:
+    case HloOpcode::kGt:
+    case HloOpcode::kLe:
+    case HloOpcode::kLt:
+    case HloOpcode::kNe: {
       TF_ASSIGN_OR_RETURN(const Shape& shape,
-                          InferElementwiseBinaryOpShape(operation, lhs, rhs,
+                          InferElementwiseBinaryOpShape(opcode, lhs, rhs,
                                                         broadcast_dimensions));
       return ShapeUtil::ChangeElementType(shape, PRED);
     }
     default:
       return Unimplemented(
           "Binary op shape inference: %s; lhs: %s; rhs: %s is not implemented.",
-          BinaryOperation_Name(operation).c_str(),
-          lhs.ShortDebugString().c_str(), rhs.ShortDebugString().c_str());
+          HloOpcodeString(opcode).c_str(), lhs.ShortDebugString().c_str(),
+          rhs.ShortDebugString().c_str());
   }
 }
 
@@ -1064,23 +929,17 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
 
 /* static */ StatusOr<Shape> ShapeInference::InferTernaryOpShape(
     HloOpcode opcode, const Shape& lhs, const Shape& rhs, const Shape& ehs) {
-  return InferTernaryOpShape(OpcodeToTernaryOperation(opcode), lhs, rhs, ehs);
-}
-
-/* static */ StatusOr<Shape> ShapeInference::InferTernaryOpShape(
-    TernaryOperation operation, const Shape& lhs, const Shape& rhs,
-    const Shape& ehs) {
   TF_DCHECK_OK(ShapeUtil::ValidateShapeWithOptionalLayout(lhs));
   TF_DCHECK_OK(ShapeUtil::ValidateShapeWithOptionalLayout(rhs));
   TF_DCHECK_OK(ShapeUtil::ValidateShapeWithOptionalLayout(ehs));
-  switch (operation) {
-    case TRIOP_CLAMP:
+  switch (opcode) {
+    case HloOpcode::kClamp:
       return InferClampShape(lhs, rhs, ehs);
-    case TRIOP_SELECT:
+    case HloOpcode::kSelect:
       return InferSelectShape(lhs, rhs, ehs);
     default:
       return InvalidArgument("Unknown operation %s.",
-                             TernaryOperation_Name(operation).c_str());
+                             HloOpcodeString(opcode).c_str());
   }
 }
 
@@ -1097,18 +956,11 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
 /* static */ StatusOr<Shape> ShapeInference::InferVariadicOpShape(
     HloOpcode opcode,
     tensorflow::gtl::ArraySlice<const Shape*> operand_shapes) {
-  return InferVariadicOpShape(OpcodeToVariadicOperation(opcode),
-                              operand_shapes);
-}
-
-/* static */ StatusOr<Shape> ShapeInference::InferVariadicOpShape(
-    VariadicOperation operation,
-    tensorflow::gtl::ArraySlice<const Shape*> operand_shapes) {
   for (const Shape* shape : operand_shapes) {
     TF_DCHECK_OK(ShapeUtil::ValidateShapeWithOptionalLayout(*shape));
   }
-  switch (operation) {
-    case VAROP_TUPLE: {
+  switch (opcode) {
+    case HloOpcode::kTuple: {
       Shape result = ShapeUtil::MakeTupleShape({});
       for (const Shape* shape : operand_shapes) {
         ShapeUtil::AppendShapeToTuple(*shape, &result);
@@ -1117,7 +969,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(
     }
     default:
       return InvalidArgument("Unknown operation %s.",
-                             VariadicOperation_Name(operation).c_str());
+                             HloOpcodeString(opcode).c_str());
   }
 }
 
diff --git a/tensorflow/compiler/xla/service/shape_inference.h b/tensorflow/compiler/xla/service/shape_inference.h
index 6100e2cd33..f1f7b50902 100644
--- a/tensorflow/compiler/xla/service/shape_inference.h
+++ b/tensorflow/compiler/xla/service/shape_inference.h
@@ -46,8 +46,6 @@ class ShapeInference {
  public:
   // Infers the shape produced by applying the given unary operation to the
   // given input shape.
-  static StatusOr<Shape> InferUnaryOpShape(UnaryOperation operation,
-                                           const Shape& arg);
   static StatusOr<Shape> InferUnaryOpShape(HloOpcode opcode,
                                            const Shape& shape);
   static StatusOr<Shape> InferUnaryOpShape(HloOpcode opcode,
@@ -55,9 +53,6 @@ class ShapeInference {
 
   // Infers the shape produced by applying the given binary operation to the
   // given input shapes.
-  static StatusOr<Shape> InferBinaryOpShape(
-      BinaryOperation operation, const Shape& lhs, const Shape& rhs,
-      tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
   static StatusOr<Shape> InferBinaryOpShape(
       HloOpcode opcode, const Shape& lhs, const Shape& rhs,
       tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
@@ -67,9 +62,6 @@ class ShapeInference {
 
   // Infers the shape produced by applying the given ternary operation to the
   // given input shapes.
-  static StatusOr<Shape> InferTernaryOpShape(TernaryOperation operation,
-                                             const Shape& lhs, const Shape& rhs,
-                                             const Shape& ehs);
   static StatusOr<Shape> InferTernaryOpShape(HloOpcode opcode, const Shape& lhs,
                                              const Shape& rhs,
                                              const Shape& ehs);
@@ -80,9 +72,6 @@ class ShapeInference {
 
   // Infers the shape produced by applying the given variadic operation to the
   // given input operand shapes.
-  static StatusOr<Shape> InferVariadicOpShape(
-      VariadicOperation operation,
-      tensorflow::gtl::ArraySlice<const Shape*> operand_shapes);
   static StatusOr<Shape> InferVariadicOpShape(
       HloOpcode opcode,
       tensorflow::gtl::ArraySlice<const Shape*> operand_shapes);
@@ -286,7 +275,7 @@ class ShapeInference {
   // the LHS and a single element in the RHS to produce a single output element,
   // even in the presence of broadcasting of one of the operands over the other.
   static StatusOr<Shape> InferElementwiseBinaryOpShape(
-      BinaryOperation operation, const Shape& lhs, const Shape& rhs,
+      HloOpcode operation, const Shape& lhs, const Shape& rhs,
       tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
 
   // Helper for inferring the shape of Clamp ops.
@@ -302,7 +291,7 @@ class ShapeInference {
   // dimension broadcasting (a dimension of size 1 in one operand is broadcast
   // up to match the size of the dimension in the other operand).
   static StatusOr<Shape> InferDegenerateDimensionBroadcastShape(
-      BinaryOperation operation, const Shape& lhs, const Shape& rhs);
+      HloOpcode operation, const Shape& lhs, const Shape& rhs);
 
   // Helper for inferring shapes of binary operations using "InDim"
   // broadcasting. This is the broadcasting used in the *InDim binary operations
@@ -310,8 +299,7 @@ class ShapeInference {
   // lower-rank shape than larger_shape. Returns the shape that the
   // smaller_shape is broadcast to.
   static StatusOr<Shape> InferInDimBroadcastShape(
-      BinaryOperation operation, const Shape& smaller_shape,
-      const Shape& larger_shape,
+      const Shape& smaller_shape, const Shape& larger_shape,
       tensorflow::gtl::ArraySlice<int64> broadcast_dimensions);
 
   TF_DISALLOW_COPY_AND_ASSIGN(ShapeInference);
diff --git a/tensorflow/compiler/xla/service/shape_inference_test.cc b/tensorflow/compiler/xla/service/shape_inference_test.cc
index 0e61994a78..6d017dffe2 100644
--- a/tensorflow/compiler/xla/service/shape_inference_test.cc
+++ b/tensorflow/compiler/xla/service/shape_inference_test.cc
@@ -101,8 +101,8 @@ class SelectAndScatterShapeInferenceTest : public ShapeInferenceTest {
 
 TEST_F(ShapeInferenceTest, UnaryNegateMatrix) {
   Shape matrix_shape = ShapeUtil::MakeShape(F32, {128, 64});
-  auto inferred_status = ShapeInference::InferUnaryOpShape(
-      UnaryOperation::UNOP_NEGATE, matrix_shape);
+  auto inferred_status =
+      ShapeInference::InferUnaryOpShape(HloOpcode::kNegate, matrix_shape);
   ASSERT_IS_OK(inferred_status.status());
   ASSERT_TRUE(ShapeUtil::Equal(matrix_shape, inferred_status.ValueOrDie()));
 }
@@ -110,14 +110,14 @@ TEST_F(ShapeInferenceTest, UnaryNegateMatrix) {
 TEST_F(ShapeInferenceTest, SelectScalarPredBetweenTuples) {
   Shape tuple = ShapeUtil::MakeTupleShape({s32_, f32_});
   auto inferred_status = ShapeInference::InferTernaryOpShape(
-      TernaryOperation::TRIOP_SELECT, pred_, tuple, tuple);
+      HloOpcode::kSelect, pred_, tuple, tuple);
   ASSERT_IS_OK(inferred_status.status());
   ASSERT_TRUE(ShapeUtil::Equal(tuple, inferred_status.ValueOrDie()));
 }
 
 TEST_F(ShapeInferenceTest, SelectScalarPredBetweenArrays) {
   auto inferred_status = ShapeInference::InferTernaryOpShape(
-      TernaryOperation::TRIOP_SELECT, pred_, matrix_64_48_, matrix_64_48_);
+      HloOpcode::kSelect, pred_, matrix_64_48_, matrix_64_48_);
   ASSERT_IS_OK(inferred_status.status());
   ASSERT_TRUE(ShapeUtil::Equal(matrix_64_48_, inferred_status.ValueOrDie()));
 }
@@ -125,34 +125,34 @@ TEST_F(ShapeInferenceTest, SelectScalarPredBetweenArrays) {
 TEST_F(ShapeInferenceTest, SelectArrayPredBetweenArrays) {
   auto predarray = ShapeUtil::MakeShape(PRED, {64, 48});
   auto inferred_status = ShapeInference::InferTernaryOpShape(
-      TernaryOperation::TRIOP_SELECT, predarray, matrix_64_48_, matrix_64_48_);
+      HloOpcode::kSelect, predarray, matrix_64_48_, matrix_64_48_);
   ASSERT_IS_OK(inferred_status.status());
   ASSERT_TRUE(ShapeUtil::Equal(matrix_64_48_, inferred_status.ValueOrDie()));
 }
 
 TEST_F(ShapeInferenceTest, SelectBadShapes) {
   auto inferred_status_error1 = ShapeInference::InferTernaryOpShape(
-      TernaryOperation::TRIOP_SELECT, pred_, matrix_64_48_, matrix_32_64_);
+      HloOpcode::kSelect, pred_, matrix_64_48_, matrix_32_64_);
   ASSERT_FALSE(inferred_status_error1.ok());
   ASSERT_THAT(inferred_status_error1.status().error_message(),
               HasSubstr("Operands to select must be the same shape"));
 
   auto inferred_status_error2 = ShapeInference::InferTernaryOpShape(
-      TernaryOperation::TRIOP_SELECT, s32_, matrix_64_48_, matrix_64_48_);
+      HloOpcode::kSelect, s32_, matrix_64_48_, matrix_64_48_);
   ASSERT_FALSE(inferred_status_error2.ok());
   ASSERT_THAT(inferred_status_error2.status().error_message(),
               HasSubstr("pred operand must have PRED"));
 
   auto inferred_status_error3 = ShapeInference::InferTernaryOpShape(
-      TernaryOperation::TRIOP_SELECT, ShapeUtil::MakeShape(PRED, {64}),
-      matrix_64_48_, matrix_64_48_);
+      HloOpcode::kSelect, ShapeUtil::MakeShape(PRED, {64}), matrix_64_48_,
+      matrix_64_48_);
   ASSERT_FALSE(inferred_status_error3.ok());
   ASSERT_THAT(inferred_status_error3.status().error_message(),
               HasSubstr("with non-scalar predicate with dimensionality"));
 
   // Tuples have a TUPLE element type and cannot be the pred of a select.
   auto inferred_status_error4 = ShapeInference::InferTernaryOpShape(
-      TernaryOperation::TRIOP_SELECT, ShapeUtil::MakeTupleShape({pred_, pred_}),
+      HloOpcode::kSelect, ShapeUtil::MakeTupleShape({pred_, pred_}),
       ShapeUtil::MakeTupleShape({f32_, f32_}),
       ShapeUtil::MakeTupleShape({f32_, f32_}));
   ASSERT_FALSE(inferred_status_error4.ok());
@@ -162,102 +162,98 @@ TEST_F(ShapeInferenceTest, SelectBadShapes) {
 
 TEST_F(ShapeInferenceTest, ClampAllMatrix) {
   auto inferred_status = ShapeInference::InferTernaryOpShape(
-      TernaryOperation::TRIOP_CLAMP, matrix_64_48_, matrix_64_48_,
-      matrix_64_48_);
+      HloOpcode::kClamp, matrix_64_48_, matrix_64_48_, matrix_64_48_);
   ASSERT_IS_OK(inferred_status.status());
   ASSERT_TRUE(ShapeUtil::Equal(matrix_64_48_, inferred_status.ValueOrDie()));
 }
 
 TEST_F(ShapeInferenceTest, ClampAllScalar) {
-  auto inferred_status = ShapeInference::InferTernaryOpShape(
-      TernaryOperation::TRIOP_CLAMP, f32_, f32_, f32_);
+  auto inferred_status =
+      ShapeInference::InferTernaryOpShape(HloOpcode::kClamp, f32_, f32_, f32_);
   ASSERT_IS_OK(inferred_status.status());
   ASSERT_TRUE(ShapeUtil::Equal(f32_, inferred_status.ValueOrDie()));
 }
 
 TEST_F(ShapeInferenceTest, ClampMinScalar) {
   auto inferred_status = ShapeInference::InferTernaryOpShape(
-      TernaryOperation::TRIOP_CLAMP, f32_, matrix_64_48_, matrix_64_48_);
+      HloOpcode::kClamp, f32_, matrix_64_48_, matrix_64_48_);
   ASSERT_IS_OK(inferred_status.status());
   ASSERT_TRUE(ShapeUtil::Equal(matrix_64_48_, inferred_status.ValueOrDie()));
 }
 
 TEST_F(ShapeInferenceTest, ClampMaxScalar) {
   auto inferred_status = ShapeInference::InferTernaryOpShape(
-      TernaryOperation::TRIOP_CLAMP, matrix_64_48_, matrix_64_48_, f32_);
+      HloOpcode::kClamp, matrix_64_48_, matrix_64_48_, f32_);
   ASSERT_IS_OK(inferred_status.status());
   ASSERT_TRUE(ShapeUtil::Equal(matrix_64_48_, inferred_status.ValueOrDie()));
 }
 
 TEST_F(ShapeInferenceTest, ClampOperandScalar) {
   auto inferred_status = ShapeInference::InferTernaryOpShape(
-      TernaryOperation::TRIOP_CLAMP, matrix_64_48_, f32_, matrix_64_48_);
+      HloOpcode::kClamp, matrix_64_48_, f32_, matrix_64_48_);
   ASSERT_IS_OK(inferred_status.status());
   ASSERT_TRUE(ShapeUtil::Equal(matrix_64_48_, inferred_status.ValueOrDie()));
 }
 
 TEST_F(ShapeInferenceTest, ClampMinMatrix) {
   auto inferred_status = ShapeInference::InferTernaryOpShape(
-      TernaryOperation::TRIOP_CLAMP, matrix_64_48_, f32_, f32_);
+      HloOpcode::kClamp, matrix_64_48_, f32_, f32_);
   ASSERT_IS_OK(inferred_status.status());
   ASSERT_TRUE(ShapeUtil::Equal(matrix_64_48_, inferred_status.ValueOrDie()));
 }
 
 TEST_F(ShapeInferenceTest, ClampMaxMatrix) {
   auto inferred_status = ShapeInference::InferTernaryOpShape(
-      TernaryOperation::TRIOP_CLAMP, f32_, f32_, matrix_64_48_);
+      HloOpcode::kClamp, f32_, f32_, matrix_64_48_);
   ASSERT_IS_OK(inferred_status.status());
   ASSERT_TRUE(ShapeUtil::Equal(matrix_64_48_, inferred_status.ValueOrDie()));
 }
 
 TEST_F(ShapeInferenceTest, ClampOperandMatrix) {
   auto inferred_status = ShapeInference::InferTernaryOpShape(
-      TernaryOperation::TRIOP_CLAMP, f32_, matrix_64_48_, f32_);
+      HloOpcode::kClamp, f32_, matrix_64_48_, f32_);
   ASSERT_IS_OK(inferred_status.status());
   ASSERT_TRUE(ShapeUtil::Equal(matrix_64_48_, inferred_status.ValueOrDie()));
 }
 
 TEST_F(ShapeInferenceTest, ClampBadShapes) {
   // Type mismatch
-  ASSERT_FALSE(ShapeInference::InferTernaryOpShape(
-                   TernaryOperation::TRIOP_CLAMP, s32_, f32_, f32_)
-                   .ok());
-  ASSERT_FALSE(ShapeInference::InferTernaryOpShape(
-                   TernaryOperation::TRIOP_CLAMP, f32_, s32_, f32_)
-                   .ok());
-  ASSERT_FALSE(ShapeInference::InferTernaryOpShape(
-                   TernaryOperation::TRIOP_CLAMP, f32_, f32_, s32_)
-                   .ok());
-  // Dimension mismatch
   ASSERT_FALSE(
-      ShapeInference::InferTernaryOpShape(TernaryOperation::TRIOP_CLAMP,
-                                          vector_64_, vector_32_, vector_32_)
+      ShapeInference::InferTernaryOpShape(HloOpcode::kClamp, s32_, f32_, f32_)
           .ok());
   ASSERT_FALSE(
-      ShapeInference::InferTernaryOpShape(TernaryOperation::TRIOP_CLAMP,
-                                          vector_32_, vector_64_, vector_32_)
+      ShapeInference::InferTernaryOpShape(HloOpcode::kClamp, f32_, s32_, f32_)
           .ok());
   ASSERT_FALSE(
-      ShapeInference::InferTernaryOpShape(TernaryOperation::TRIOP_CLAMP,
-                                          vector_32_, vector_32_, vector_64_)
+      ShapeInference::InferTernaryOpShape(HloOpcode::kClamp, f32_, f32_, s32_)
           .ok());
-  // Dimension mismatch, where one operand is a scalar
+  // Dimension mismatch
   ASSERT_FALSE(ShapeInference::InferTernaryOpShape(
-                   TernaryOperation::TRIOP_CLAMP, vector_64_, vector_32_, f32_)
+                   HloOpcode::kClamp, vector_64_, vector_32_, vector_32_)
                    .ok());
   ASSERT_FALSE(ShapeInference::InferTernaryOpShape(
-                   TernaryOperation::TRIOP_CLAMP, vector_64_, f32_, vector_32_)
+                   HloOpcode::kClamp, vector_32_, vector_64_, vector_32_)
                    .ok());
   ASSERT_FALSE(ShapeInference::InferTernaryOpShape(
-                   TernaryOperation::TRIOP_CLAMP, f32_, vector_64_, vector_32_)
+                   HloOpcode::kClamp, vector_32_, vector_32_, vector_64_)
+                   .ok());
+  // Dimension mismatch, where one operand is a scalar
+  ASSERT_FALSE(ShapeInference::InferTernaryOpShape(HloOpcode::kClamp,
+                                                   vector_64_, vector_32_, f32_)
+                   .ok());
+  ASSERT_FALSE(ShapeInference::InferTernaryOpShape(HloOpcode::kClamp,
+                                                   vector_64_, f32_, vector_32_)
+                   .ok());
+  ASSERT_FALSE(ShapeInference::InferTernaryOpShape(HloOpcode::kClamp, f32_,
+                                                   vector_64_, vector_32_)
                    .ok());
 }
 
 TEST_F(ShapeInferenceTest, Complex) {
   auto complex_shape = [&](const Shape& lhs, const Shape& rhs,
                            const tensorflow::gtl::ArraySlice<int64>& bcast) {
-    return ShapeInference::InferBinaryOpShape(BinaryOperation::BINOP_COMPLEX,
-                                              lhs, rhs, bcast);
+    return ShapeInference::InferBinaryOpShape(HloOpcode::kComplex, lhs, rhs,
+                                              bcast);
   };
   // Inputs must be FP.
   ASSERT_FALSE(complex_shape(s32_, s32_, {}).ok());
@@ -292,8 +288,8 @@ TEST_F(ShapeInferenceTest, Complex) {
 }
 
 TEST_F(ShapeInferenceTest, VariadicOpTuplify) {
-  StatusOr<Shape> result = ShapeInference::InferVariadicOpShape(
-      VariadicOperation::VAROP_TUPLE, {&s32_, &f32_});
+  StatusOr<Shape> result =
+      ShapeInference::InferVariadicOpShape(HloOpcode::kTuple, {&s32_, &f32_});
   ASSERT_IS_OK(result.status());
   ASSERT_TRUE(ShapeUtil::Equal(result.ValueOrDie(),
                                ShapeUtil::MakeTupleShape({s32_, f32_})));
@@ -804,8 +800,8 @@ TEST_F(ShapeInferenceTest, InferConstIndexShape) {
 
 TEST_F(ShapeInferenceTest, InferPowShape) {
   auto ten_floats = ShapeUtil::MakeShape(F32, {10});
-  auto inferred_status =
-      ShapeInference::InferBinaryOpShape(BINOP_POW, ten_floats, f32_, {});
+  auto inferred_status = ShapeInference::InferBinaryOpShape(
+      HloOpcode::kPower, ten_floats, f32_, {});
   ASSERT_IS_OK(inferred_status.status());
   ASSERT_TRUE(ShapeUtil::Equal(ten_floats, inferred_status.ValueOrDie()));
 }
@@ -813,7 +809,7 @@ TEST_F(ShapeInferenceTest, InferPowShape) {
 TEST_F(ShapeInferenceTest, InferCompareShapeEq) {
   auto ten_floats = ShapeUtil::MakeShape(F32, {10});
   auto inferred_status =
-      ShapeInference::InferBinaryOpShape(BINOP_EQ, ten_floats, f32_, {});
+      ShapeInference::InferBinaryOpShape(HloOpcode::kEq, ten_floats, f32_, {});
   ASSERT_IS_OK(inferred_status.status());
   ASSERT_TRUE(ShapeUtil::Equal(ShapeUtil::MakeShape(PRED, {10}),
                                inferred_status.ValueOrDie()));
@@ -822,7 +818,7 @@ TEST_F(ShapeInferenceTest, InferCompareShapeEq) {
 TEST_F(ShapeInferenceTest, InferCompareShapeGe) {
   auto ten_floats = ShapeUtil::MakeShape(F32, {10});
   auto inferred_status =
-      ShapeInference::InferBinaryOpShape(BINOP_GE, ten_floats, f32_, {});
+      ShapeInference::InferBinaryOpShape(HloOpcode::kGe, ten_floats, f32_, {});
   ASSERT_IS_OK(inferred_status.status());
   ASSERT_TRUE(ShapeUtil::Equal(ShapeUtil::MakeShape(PRED, {10}),
                                inferred_status.ValueOrDie()));
@@ -831,7 +827,7 @@ TEST_F(ShapeInferenceTest, InferCompareShapeGe) {
 TEST_F(ShapeInferenceTest, InferCompareShapeGt) {
   auto ten_floats = ShapeUtil::MakeShape(F32, {10});
   auto inferred_status =
-      ShapeInference::InferBinaryOpShape(BINOP_GT, ten_floats, f32_, {});
+      ShapeInference::InferBinaryOpShape(HloOpcode::kGt, ten_floats, f32_, {});
   ASSERT_IS_OK(inferred_status.status());
   ASSERT_TRUE(ShapeUtil::Equal(ShapeUtil::MakeShape(PRED, {10}),
                                inferred_status.ValueOrDie()));
@@ -840,7 +836,7 @@ TEST_F(ShapeInferenceTest, InferCompareShapeGt) {
 TEST_F(ShapeInferenceTest, InferCompareShapeLe) {
   auto ten_floats = ShapeUtil::MakeShape(F32, {10});
   auto inferred_status =
-      ShapeInference::InferBinaryOpShape(BINOP_LE, ten_floats, f32_, {});
+      ShapeInference::InferBinaryOpShape(HloOpcode::kLe, ten_floats, f32_, {});
   ASSERT_IS_OK(inferred_status.status());
   ASSERT_TRUE(ShapeUtil::Equal(ShapeUtil::MakeShape(PRED, {10}),
                                inferred_status.ValueOrDie()));
@@ -849,7 +845,7 @@ TEST_F(ShapeInferenceTest, InferCompareShapeLe) {
 TEST_F(ShapeInferenceTest, InferCompareShapeLt) {
   auto ten_floats = ShapeUtil::MakeShape(F32, {10});
   auto inferred_status =
-      ShapeInference::InferBinaryOpShape(BINOP_LT, ten_floats, f32_, {});
+      ShapeInference::InferBinaryOpShape(HloOpcode::kLt, ten_floats, f32_, {});
   ASSERT_IS_OK(inferred_status.status());
   ASSERT_TRUE(ShapeUtil::Equal(ShapeUtil::MakeShape(PRED, {10}),
                                inferred_status.ValueOrDie()));
@@ -858,7 +854,7 @@ TEST_F(ShapeInferenceTest, InferCompareShapeLt) {
 TEST_F(ShapeInferenceTest, InferCompareShapeNe) {
   auto ten_floats = ShapeUtil::MakeShape(F32, {10});
   auto inferred_status =
-      ShapeInference::InferBinaryOpShape(BINOP_NE, ten_floats, f32_, {});
+      ShapeInference::InferBinaryOpShape(HloOpcode::kNe, ten_floats, f32_, {});
   ASSERT_IS_OK(inferred_status.status());
   ASSERT_TRUE(ShapeUtil::Equal(ShapeUtil::MakeShape(PRED, {10}),
                                inferred_status.ValueOrDie()));
@@ -1111,22 +1107,22 @@ TEST_F(ShapeInferenceTest, BinOpBroadcastMatrixVector) {
   const Shape vec8 = ShapeUtil::MakeShape(F32, {8});
   const Shape vec16 = ShapeUtil::MakeShape(F32, {16});
 
-  auto inferred_status_match = ShapeInference::InferBinaryOpShape(
-      BinaryOperation::BINOP_ADD, mat, vec8, {1});
+  auto inferred_status_match =
+      ShapeInference::InferBinaryOpShape(HloOpcode::kAdd, mat, vec8, {1});
   ASSERT_IS_OK(inferred_status_match.status());
   ASSERT_TRUE(ShapeUtil::Equal(inferred_status_match.ValueOrDie(), mat));
 
-  auto inferred_status_mismatch = ShapeInference::InferBinaryOpShape(
-      BinaryOperation::BINOP_ADD, mat, vec8, {0});
+  auto inferred_status_mismatch =
+      ShapeInference::InferBinaryOpShape(HloOpcode::kAdd, mat, vec8, {0});
   ASSERT_FALSE(inferred_status_mismatch.ok());
 
-  inferred_status_match = ShapeInference::InferBinaryOpShape(
-      BinaryOperation::BINOP_ADD, mat, vec16, {0});
+  inferred_status_match =
+      ShapeInference::InferBinaryOpShape(HloOpcode::kAdd, mat, vec16, {0});
   ASSERT_IS_OK(inferred_status_match.status());
   ASSERT_TRUE(ShapeUtil::Equal(inferred_status_match.ValueOrDie(), mat));
 
-  inferred_status_mismatch = ShapeInference::InferBinaryOpShape(
-      BinaryOperation::BINOP_ADD, mat, vec16, {1});
+  inferred_status_mismatch =
+      ShapeInference::InferBinaryOpShape(HloOpcode::kAdd, mat, vec16, {1});
   ASSERT_FALSE(inferred_status_mismatch.ok());
 }
 
@@ -1138,17 +1134,17 @@ TEST_F(ShapeInferenceTest, BinOpBroadcastCubeMatrix) {
   const Shape matrix16_8 = ShapeUtil::MakeShape(F32, {16, 8});
 
   auto inferred_status_match = ShapeInference::InferBinaryOpShape(
-      BinaryOperation::BINOP_ADD, cube, matrix8_4, {1, 2});
+      HloOpcode::kAdd, cube, matrix8_4, {1, 2});
   ASSERT_IS_OK(inferred_status_match.status());
   ASSERT_TRUE(ShapeUtil::Equal(inferred_status_match.ValueOrDie(), cube));
 
   inferred_status_match = ShapeInference::InferBinaryOpShape(
-      BinaryOperation::BINOP_ADD, cube, matrix16_4, {0, 2});
+      HloOpcode::kAdd, cube, matrix16_4, {0, 2});
   ASSERT_IS_OK(inferred_status_match.status());
   ASSERT_TRUE(ShapeUtil::Equal(inferred_status_match.ValueOrDie(), cube));
 
   inferred_status_match = ShapeInference::InferBinaryOpShape(
-      BinaryOperation::BINOP_ADD, cube, matrix16_8, {0, 1});
+      HloOpcode::kAdd, cube, matrix16_8, {0, 1});
   ASSERT_IS_OK(inferred_status_match.status());
   ASSERT_TRUE(ShapeUtil::Equal(inferred_status_match.ValueOrDie(), cube));
 }
@@ -1162,43 +1158,43 @@ TEST_F(ShapeInferenceTest, BinOpBroadcastBadDimension) {
   const Shape matrix8_8 = ShapeUtil::MakeShape(F32, {8, 8});
 
   // "magical" broadcast rejected
-  auto inferred_status_error1 = ShapeInference::InferBinaryOpShape(
-      BinaryOperation::BINOP_ADD, tensor, vec8, {});
+  auto inferred_status_error1 =
+      ShapeInference::InferBinaryOpShape(HloOpcode::kAdd, tensor, vec8, {});
   ASSERT_FALSE(inferred_status_error1.ok());
   ASSERT_THAT(inferred_status_error1.status().error_message(),
               HasSubstr("Automatic"));
 
   // broadcast_dimension out of bounds for tensor's rank
-  auto inferred_status_error2 = ShapeInference::InferBinaryOpShape(
-      BinaryOperation::BINOP_ADD, tensor, vec8, {3});
+  auto inferred_status_error2 =
+      ShapeInference::InferBinaryOpShape(HloOpcode::kAdd, tensor, vec8, {3});
   ASSERT_FALSE(inferred_status_error2.ok());
   ASSERT_THAT(inferred_status_error2.status().error_message(),
               ContainsRegex("Broadcast dimension number .* too large"));
 
   // broadcast_dimension doesn't match corresponding dimension
-  auto inferred_status_error3 = ShapeInference::InferBinaryOpShape(
-      BinaryOperation::BINOP_ADD, tensor, vec8, {0});
+  auto inferred_status_error3 =
+      ShapeInference::InferBinaryOpShape(HloOpcode::kAdd, tensor, vec8, {0});
   ASSERT_FALSE(inferred_status_error3.ok());
   ASSERT_THAT(inferred_status_error3.status().error_message(),
               HasSubstr("Broadcast dimension 0 mismatch"));
 
   // broadcast_dimensions list too long
   auto inferred_status_error4 = ShapeInference::InferBinaryOpShape(
-      BinaryOperation::BINOP_ADD, tensor, matrix8_4, {0, 1, 2});
+      HloOpcode::kAdd, tensor, matrix8_4, {0, 1, 2});
   ASSERT_FALSE(inferred_status_error4.ok());
   ASSERT_THAT(inferred_status_error4.status().error_message(),
               HasSubstr("broadcast_dimensions has to match"));
 
   // there's a dimension above the rank of the tensor
   auto inferred_status_error5 = ShapeInference::InferBinaryOpShape(
-      BinaryOperation::BINOP_ADD, tensor, matrix8_4, {3, 0});
+      HloOpcode::kAdd, tensor, matrix8_4, {3, 0});
   ASSERT_FALSE(inferred_status_error5.ok());
   ASSERT_THAT(inferred_status_error5.status().error_message(),
               ContainsRegex("dimension number .* too large"));
 
   // broadcasting dimensions don't match in this order
   auto inferred_status_error6 = ShapeInference::InferBinaryOpShape(
-      BinaryOperation::BINOP_ADD, tensor, matrix8_4, {2, 1});
+      HloOpcode::kAdd, tensor, matrix8_4, {2, 1});
   ASSERT_FALSE(inferred_status_error6.ok());
   ASSERT_THAT(inferred_status_error6.status().error_message(),
               HasSubstr("dimension 0 mismatch"));
@@ -1207,13 +1203,13 @@ TEST_F(ShapeInferenceTest, BinOpBroadcastBadDimension) {
   // in a proper (strictly increasing) order, even if the lower-rank array
   // matches the higher-rank array in many different ways.
   auto inferred_status_error7 = ShapeInference::InferBinaryOpShape(
-      BinaryOperation::BINOP_ADD, tensor8_8_8, matrix8_8, {0, 0});
+      HloOpcode::kAdd, tensor8_8_8, matrix8_8, {0, 0});
   ASSERT_FALSE(inferred_status_error7.ok());
   ASSERT_THAT(inferred_status_error7.status().error_message(),
               HasSubstr("dimensions order is wrong"));
 
   auto inferred_status_error8 = ShapeInference::InferBinaryOpShape(
-      BinaryOperation::BINOP_ADD, tensor8_8_8, matrix8_8, {1, 0});
+      HloOpcode::kAdd, tensor8_8_8, matrix8_8, {1, 0});
   ASSERT_FALSE(inferred_status_error8.ok());
   ASSERT_THAT(inferred_status_error8.status().error_message(),
               HasSubstr("dimensions order is wrong"));
diff --git a/tensorflow/compiler/xla/tests/broadcast_simple_test.cc b/tensorflow/compiler/xla/tests/broadcast_simple_test.cc
index 34c86e007b..3a0f51fc66 100644
--- a/tensorflow/compiler/xla/tests/broadcast_simple_test.cc
+++ b/tensorflow/compiler/xla/tests/broadcast_simple_test.cc
@@ -671,7 +671,7 @@ XLA_TEST_F(BroadcastSimpleTest, InvalidInDimensionBroadcasting) {
   auto result_status = Execute(&b, {});
   EXPECT_FALSE(result_status.ok());
   EXPECT_THAT(result_status.status().error_message(),
-              HasSubstr("op BINOP_ADD with incompatible shapes"));
+              HasSubstr("op add with incompatible shapes"));
 }
 
 XLA_TEST_F(BroadcastSimpleTest, InvalidDegenerateBroadcasting) {
@@ -684,7 +684,7 @@ XLA_TEST_F(BroadcastSimpleTest, InvalidDegenerateBroadcasting) {
   auto result_status = Execute(&b, {});
   EXPECT_FALSE(result_status.ok());
   EXPECT_THAT(result_status.status().error_message(),
-              HasSubstr("op BINOP_ADD with incompatible shapes"));
+              HasSubstr("op add with incompatible shapes"));
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/xla/tests/map_test.cc b/tensorflow/compiler/xla/tests/map_test.cc
index 7df45bebeb..3975e91257 100644
--- a/tensorflow/compiler/xla/tests/map_test.cc
+++ b/tensorflow/compiler/xla/tests/map_test.cc
@@ -488,10 +488,9 @@ TEST_F(MapTest, MapOperantionWithBuildError) {
 
   StatusOr<XlaComputation> computation_status = builder.Build();
   ASSERT_TRUE(!computation_status.ok());
-  EXPECT_THAT(
-      computation_status.status().ToString(),
-      ::testing::HasSubstr("error from: ErrorAdd: Binary op BINOP_ADD with "
-                           "different element types: f32[] and u16[]"));
+  EXPECT_THAT(computation_status.status().ToString(),
+              ::testing::HasSubstr("error from: ErrorAdd: Binary op add with "
+                                   "different element types: f32[] and u16[]"));
 }
 
 // MapTest disables inline and algsimp. MapTestWithFullOpt runs all
diff --git a/tensorflow/compiler/xla/xla_data.proto b/tensorflow/compiler/xla/xla_data.proto
index 963d3836ed..0af73e8a93 100644
--- a/tensorflow/compiler/xla/xla_data.proto
+++ b/tensorflow/compiler/xla/xla_data.proto
@@ -484,112 +484,6 @@ message DotDimensionNumbers {
   repeated int64 rhs_batch_dimensions = 4;
 };
 
-enum UnaryOperation {
-  UNOP_INVALID = 0;
-
-  // Elementwise, logical negation on booleans and bitwise negation on ints.
-  UNOP_NOT = 1;
-
-  // Elementwise, computes e^x.
-  UNOP_EXP = 2;
-
-  // Elementwise, computes -x.
-  UNOP_NEGATE = 3;
-
-  // Puts the elements in the operand into sorted order.
-  UNOP_SORT = 4;
-
-  // Elementwise, computes tanh(x).
-  UNOP_TANH = 5;
-
-  // Elementwise, computes the natural logarithm of x.
-  UNOP_LOG = 6;
-
-  // Elementwise, computes the floor of x.
-  UNOP_FLOOR = 7;
-
-  // Elementwise, computes the ceil of x.
-  UNOP_CEIL = 8;
-
-  // Elementwise, computes the abs of x.
-  UNOP_ABS = 9;
-
-  // Elementwise, computes the sign of x.
-  UNOP_SIGN = 10;
-
-  // Elementwise, tests if values are finite (not NaN or inf)
-  UNOP_IS_FINITE = 11;
-
-  // Elementwise, computes the cosine of x.
-  UNOP_COS = 12;
-
-  // Elementwise, computes the sine of x.
-  UNOP_SIN = 13;
-
-  // Elementwise, rounds x to nearest integral value, rounding half-way cases
-  // away from zero.
-  UNOP_ROUND_NEAREST_AFZ = 14;
-
-  // Elementwise, extract real component of complex x.
-  UNOP_REAL = 15;
-
-  // Elementwise, extract real component of complex x.
-  UNOP_IMAG = 16;
-
-  // Elementwise, computes clz(x).
-  UNOP_CLZ = 17;
-
-  // Elementwise, computes exp(x)-1.
-  UNOP_EXPM1 = 18;
-
-  // Elementwise, computes log(x+1).
-  UNOP_LOG1P = 19;
-}
-
-enum BinaryOperation {
-  BINOP_INVALID = 0;
-
-  // Arithmetic operations.
-  BINOP_ADD = 1;
-  BINOP_DIV = 2;
-  BINOP_MUL = 3;
-  BINOP_SUB = 4;
-
-  // Comparison operators.
-  BINOP_EQ = 5;
-  BINOP_GE = 6;
-  BINOP_GT = 7;
-  BINOP_LE = 8;
-  BINOP_LT = 9;
-  BINOP_NE = 10;
-
-  // Element-wise maximum.
-  BINOP_MAX = 14;
-
-  // Element-wise minimum.
-  BINOP_MIN = 15;
-
-  // Raises the left-hand-side to the right-hand-side power.
-  BINOP_POW = 16;
-
-  // Remainder operation.
-  BINOP_REM = 17;
-
-  // Element-wise, logical operators on booleans and bitwise operators on ints.
-  BINOP_AND = 18;
-  BINOP_OR = 19;
-
-  BINOP_SHIFT_LEFT = 20;
-  BINOP_SHIFT_RIGHT_ARITHMETIC = 21;
-  BINOP_SHIFT_RIGHT_LOGICAL = 22;
-
-  // Complex from real, imag.
-  BINOP_COMPLEX = 23;
-
-  // Computes the 4-quadrant arctangent of the y, x input arguments.
-  BINOP_ATAN2 = 24;
-}
-
 enum RandomDistribution {
   RNG_INVALID = 0;
 
@@ -604,26 +498,6 @@ enum RandomDistribution {
   // Next: 4
 }
 
-enum TernaryOperation {
-  TRIOP_INVALID = 0;
-
-  // Given a predicate and two operands, selects operand0 if the predicate is
-  // true and operand1 if the predicate is false.
-  TRIOP_SELECT = 1;
-
-  // Given a min, max and an operand returns the operand if between min and max,
-  // else returns min if operand is less than min or max if operand is greater
-  // than max.
-  TRIOP_CLAMP = 3;
-}
-
-enum VariadicOperation {
-  VAROP_INVALID = 0;
-
-  // Creates a tuple from its operands.
-  VAROP_TUPLE = 1;
-}
-
 message OpSharding {
   enum Type {
     // This sharding is replicated across all devices (implies maximal,
-- 
GitLab


From 53901f9bb9a3965ed5dce65284053b0eb387b0c4 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 8 Jun 2018 17:45:48 -0700
Subject: [PATCH 524/610] Go: Update generated wrapper functions for TensorFlow
 ops. PiperOrigin-RevId: 199876803

---
 tensorflow/go/op/wrappers.go | 152 +++++++++++++++++------------------
 1 file changed, 76 insertions(+), 76 deletions(-)

diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go
index cdfd4b30e6..76db602902 100644
--- a/tensorflow/go/op/wrappers.go
+++ b/tensorflow/go/op/wrappers.go
@@ -2914,6 +2914,82 @@ func GuaranteeConst(scope *Scope, input tf.Output) (output tf.Output) {
 	return op.Output(0)
 }
 
+// Splits a tensor into `num_split` tensors along one dimension.
+//
+// Arguments:
+//	value: The tensor to split.
+//	size_splits: list containing the sizes of each output tensor along the split
+// dimension. Must sum to the dimension of value along split_dim.
+// Can contain one -1 indicating that dimension is to be inferred.
+//	axis: 0-D.  The dimension along which to split.  Must be in the range
+// `[-rank(value), rank(value))`.
+//
+//
+// Returns Tensors whose shape matches that of `value`
+// except along `axis`, where their sizes are
+// `size_splits[i]`.
+func SplitV(scope *Scope, value tf.Output, size_splits tf.Output, axis tf.Output, num_split int64) (output []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_split": num_split}
+	opspec := tf.OpSpec{
+		Type: "SplitV",
+		Input: []tf.Input{
+			value, size_splits, axis,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
+		scope.UpdateErr("SplitV", err)
+		return
+	}
+	return output
+}
+
+// Splits a tensor into `num_split` tensors along one dimension.
+//
+// Arguments:
+//	axis: 0-D.  The dimension along which to split.  Must be in the range
+// `[-rank(value), rank(value))`.
+//	value: The tensor to split.
+//	num_split: The number of ways to split.  Must evenly divide
+// `value.shape[split_dim]`.
+//
+// Returns They are identically shaped tensors, whose shape matches that of `value`
+// except along `axis`, where their sizes are
+// `values.shape[split_dim] / num_split`.
+func Split(scope *Scope, axis tf.Output, value tf.Output, num_split int64) (output []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"num_split": num_split}
+	opspec := tf.OpSpec{
+		Type: "Split",
+		Input: []tf.Input{
+			axis, value,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
+		scope.UpdateErr("Split", err)
+		return
+	}
+	return output
+}
+
 // Creates a sequence of numbers.
 //
 // This operation creates a sequence of numbers that begins at `start` and
@@ -30634,79 +30710,3 @@ func ConcatOffset(scope *Scope, concat_dim tf.Output, shape []tf.Output) (offset
 	}
 	return offset
 }
-
-// Splits a tensor into `num_split` tensors along one dimension.
-//
-// Arguments:
-//	axis: 0-D.  The dimension along which to split.  Must be in the range
-// `[-rank(value), rank(value))`.
-//	value: The tensor to split.
-//	num_split: The number of ways to split.  Must evenly divide
-// `value.shape[split_dim]`.
-//
-// Returns They are identically shaped tensors, whose shape matches that of `value`
-// except along `axis`, where their sizes are
-// `values.shape[split_dim] / num_split`.
-func Split(scope *Scope, axis tf.Output, value tf.Output, num_split int64) (output []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"num_split": num_split}
-	opspec := tf.OpSpec{
-		Type: "Split",
-		Input: []tf.Input{
-			axis, value,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
-		scope.UpdateErr("Split", err)
-		return
-	}
-	return output
-}
-
-// Splits a tensor into `num_split` tensors along one dimension.
-//
-// Arguments:
-//	value: The tensor to split.
-//	size_splits: list containing the sizes of each output tensor along the split
-// dimension. Must sum to the dimension of value along split_dim.
-// Can contain one -1 indicating that dimension is to be inferred.
-//	axis: 0-D.  The dimension along which to split.  Must be in the range
-// `[-rank(value), rank(value))`.
-//
-//
-// Returns Tensors whose shape matches that of `value`
-// except along `axis`, where their sizes are
-// `size_splits[i]`.
-func SplitV(scope *Scope, value tf.Output, size_splits tf.Output, axis tf.Output, num_split int64) (output []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"num_split": num_split}
-	opspec := tf.OpSpec{
-		Type: "SplitV",
-		Input: []tf.Input{
-			value, size_splits, axis,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
-		scope.UpdateErr("SplitV", err)
-		return
-	}
-	return output
-}
-- 
GitLab


From 9070f24ae15a4f589219d4cb9c962b14612c2d8c Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 8 Jun 2018 18:12:16 -0700
Subject: [PATCH 525/610] Collective Ops Part 8

Enable collective op execution in distributed mode:

Pass collective_graph_key into graph building and
step execution contexts (MasterSession) where it triggers
allocation of an RpcCollectiveExecutorMgr that becomes
accessible via the WorkerEnv and MasterEnv.

The collective_graph_key is used to synchronize step_ids
(which are otherwise random) between otherwise independent
graph executions that contain collective ops that need
to rendezvous.

All APIs for using collectives are still non-public and
experimental.

PiperOrigin-RevId: 199879087
---
 .../common_runtime/build_graph_options.cc     |   3 +
 .../core/common_runtime/build_graph_options.h |   3 +
 .../common_runtime/collective_executor_mgr.cc |  18 ++-
 .../common_runtime/collective_executor_mgr.h  |   9 +-
 .../collective_executor_mgr_test.cc           |  11 +-
 .../collective_param_resolver_local.h         |   2 +-
 .../core/common_runtime/direct_session.cc     |  10 +-
 tensorflow/core/distributed_runtime/BUILD     |  50 ++++++
 .../distributed_runtime/cancellable_call.h    |  65 ++++++++
 .../collective_param_resolver_distributed.cc  |  48 +-----
 ...lective_param_resolver_distributed_test.cc |   7 +-
 .../collective_rma_distributed.cc             |  42 +-----
 .../core/distributed_runtime/graph_mgr.cc     |  26 +++-
 .../core/distributed_runtime/graph_mgr.h      |   8 +-
 .../core/distributed_runtime/master_env.h     |   5 +
 .../distributed_runtime/master_session.cc     |  78 ++++++++--
 .../core/distributed_runtime/master_session.h |   3 +
 tensorflow/core/distributed_runtime/rpc/BUILD |   3 +
 .../rpc/eager/eager_grpc_server_lib.h         |   2 +-
 .../rpc/grpc_server_lib.cc                    |  39 ++++-
 .../distributed_runtime/rpc/grpc_server_lib.h |  11 +-
 .../rpc_collective_executor_mgr.cc            | 142 ++++++++++++++++++
 .../rpc_collective_executor_mgr.h             |  79 ++++++++++
 .../rpc_collective_executor_mgr_test.cc       | 124 +++++++++++++++
 tensorflow/core/distributed_runtime/worker.cc |  10 +-
 25 files changed, 659 insertions(+), 139 deletions(-)
 create mode 100644 tensorflow/core/distributed_runtime/cancellable_call.h
 create mode 100644 tensorflow/core/distributed_runtime/rpc_collective_executor_mgr.cc
 create mode 100644 tensorflow/core/distributed_runtime/rpc_collective_executor_mgr.h
 create mode 100644 tensorflow/core/distributed_runtime/rpc_collective_executor_mgr_test.cc

diff --git a/tensorflow/core/common_runtime/build_graph_options.cc b/tensorflow/core/common_runtime/build_graph_options.cc
index a9dc6ca6cd..00f7a8e645 100644
--- a/tensorflow/core/common_runtime/build_graph_options.cc
+++ b/tensorflow/core/common_runtime/build_graph_options.cc
@@ -32,6 +32,9 @@ string BuildGraphOptions::DebugString() const {
   for (auto& s : callable_options.target()) {
     strings::StrAppend(&rv, s, ", ");
   }
+  if (collective_graph_key != kNoCollectiveGraphKey) {
+    strings::StrAppend(&rv, "\ncollective_graph_key: ", collective_graph_key);
+  }
   return rv;
 }
 
diff --git a/tensorflow/core/common_runtime/build_graph_options.h b/tensorflow/core/common_runtime/build_graph_options.h
index 5ca170e922..3d0f242ea5 100644
--- a/tensorflow/core/common_runtime/build_graph_options.h
+++ b/tensorflow/core/common_runtime/build_graph_options.h
@@ -31,6 +31,9 @@ struct BuildGraphOptions {
   // TODO(mrry): Remove this when the distributed runtime supports Arg/Retval.
   bool use_function_convention = false;
 
+  static const int64 kNoCollectiveGraphKey = 0;
+  int64 collective_graph_key = kNoCollectiveGraphKey;
+
   string DebugString() const;
 };
 
diff --git a/tensorflow/core/common_runtime/collective_executor_mgr.cc b/tensorflow/core/common_runtime/collective_executor_mgr.cc
index e07829b286..4f03a5e13a 100644
--- a/tensorflow/core/common_runtime/collective_executor_mgr.cc
+++ b/tensorflow/core/common_runtime/collective_executor_mgr.cc
@@ -25,11 +25,11 @@ namespace tensorflow {
 
 CollectiveExecutorMgr::CollectiveExecutorMgr(
     const ConfigProto& config, const DeviceMgr* dev_mgr,
-    DeviceResolverInterface* dev_resolver,
-    ParamResolverInterface* param_resolver)
+    std::unique_ptr<DeviceResolverInterface> dev_resolver,
+    std::unique_ptr<ParamResolverInterface> param_resolver)
     : dev_mgr_(dev_mgr),
-      dev_resolver_(dev_resolver),
-      param_resolver_(param_resolver) {}
+      dev_resolver_(std::move(dev_resolver)),
+      param_resolver_(std::move(param_resolver)) {}
 
 CollectiveExecutorMgr::~CollectiveExecutorMgr() {
   for (auto iter : executor_table_) {
@@ -45,9 +45,7 @@ CollectiveExecutor* CollectiveExecutorMgr::FindOrCreate(int64 step_id) {
     if (it != executor_table_.end()) {
       ce = it->second;
     } else {
-      CollectiveRemoteAccessLocal* rma = new CollectiveRemoteAccessLocal(
-          dev_mgr_, dev_resolver_.get(), step_id);
-      ce = new BaseCollectiveExecutor(this, rma, step_id, dev_mgr_);
+      ce = Create(step_id);
       executor_table_[step_id] = ce;
     }
     ce->Ref();
@@ -55,6 +53,12 @@ CollectiveExecutor* CollectiveExecutorMgr::FindOrCreate(int64 step_id) {
   return ce;
 }
 
+CollectiveExecutor* CollectiveExecutorMgr::Create(int64 step_id) {
+  CollectiveRemoteAccessLocal* rma =
+      new CollectiveRemoteAccessLocal(dev_mgr_, dev_resolver_.get(), step_id);
+  return new BaseCollectiveExecutor(this, rma, step_id, dev_mgr_);
+}
+
 void CollectiveExecutorMgr::Cleanup(int64 step_id) {
   CollectiveExecutor* ce = nullptr;
   {
diff --git a/tensorflow/core/common_runtime/collective_executor_mgr.h b/tensorflow/core/common_runtime/collective_executor_mgr.h
index 4b42e2b4d1..9de6ab8968 100644
--- a/tensorflow/core/common_runtime/collective_executor_mgr.h
+++ b/tensorflow/core/common_runtime/collective_executor_mgr.h
@@ -25,8 +25,8 @@ class DeviceMgr;
 class CollectiveExecutorMgr : public CollectiveExecutorMgrInterface {
  public:
   CollectiveExecutorMgr(const ConfigProto& config, const DeviceMgr* dev_mgr,
-                        DeviceResolverInterface* dev_resolver,
-                        ParamResolverInterface* param_resolver);
+                        std::unique_ptr<DeviceResolverInterface> dev_resolver,
+                        std::unique_ptr<ParamResolverInterface> param_resolver);
 
   virtual ~CollectiveExecutorMgr();
 
@@ -56,11 +56,16 @@ class CollectiveExecutorMgr : public CollectiveExecutorMgrInterface {
   void RetireStepId(int64 graph_key, int64 step_id) override {}
 
  protected:
+  // Called by FindOrCreate when table entry does not yet exist.
+  virtual CollectiveExecutor* Create(int64 step_id);
+
   const DeviceMgr* dev_mgr_;
   std::unique_ptr<DeviceResolverInterface> dev_resolver_;
   std::unique_ptr<ParamResolverInterface> param_resolver_;
   CollectiveRemoteAccess* remote_access_;
   string task_name_;
+
+ private:
   mutex exec_mu_;
   // Map from step_id to CollectiveExecutor
   gtl::FlatMap<int64, CollectiveExecutor*> executor_table_ GUARDED_BY(exec_mu_);
diff --git a/tensorflow/core/common_runtime/collective_executor_mgr_test.cc b/tensorflow/core/common_runtime/collective_executor_mgr_test.cc
index 34c9163d6a..91994c5731 100644
--- a/tensorflow/core/common_runtime/collective_executor_mgr_test.cc
+++ b/tensorflow/core/common_runtime/collective_executor_mgr_test.cc
@@ -40,10 +40,13 @@ class CollectiveExecutorMgrTest : public ::testing::Test {
     device_count->insert({"CPU", NUM_DEVS});
     TF_CHECK_OK(DeviceFactory::AddDevices(options, task_name, &devices_));
     device_mgr_.reset(new DeviceMgr(devices_));
-    DeviceResolverLocal* drl = new DeviceResolverLocal(device_mgr_.get());
-    cme_.reset(new CollectiveExecutorMgr(
-        cp, device_mgr_.get(), drl,
-        new CollectiveParamResolverLocal(device_mgr_.get(), drl, task_name)));
+    std::unique_ptr<DeviceResolverInterface> drl(
+        new DeviceResolverLocal(device_mgr_.get()));
+    std::unique_ptr<ParamResolverInterface> prl(
+        new CollectiveParamResolverLocal(device_mgr_.get(), drl.get(),
+                                         task_name));
+    cme_.reset(new CollectiveExecutorMgr(cp, device_mgr_.get(), std::move(drl),
+                                         std::move(prl)));
   }
 
   std::unique_ptr<CollectiveExecutorMgr> cme_;
diff --git a/tensorflow/core/common_runtime/collective_param_resolver_local.h b/tensorflow/core/common_runtime/collective_param_resolver_local.h
index 3a871f962d..43c404f2ec 100644
--- a/tensorflow/core/common_runtime/collective_param_resolver_local.h
+++ b/tensorflow/core/common_runtime/collective_param_resolver_local.h
@@ -201,7 +201,7 @@ class CollectiveParamResolverLocal : public ParamResolverInterface {
       LOCKS_EXCLUDED(irec->out_mu);
 
   const DeviceMgr* dev_mgr_;
-  DeviceResolverInterface* dev_resolver_;
+  DeviceResolverInterface* dev_resolver_;  // Not owned.
   string task_name_;
   mutex group_mu_;
   gtl::FlatMap<int32, std::unique_ptr<GroupRec>> group_table_
diff --git a/tensorflow/core/common_runtime/direct_session.cc b/tensorflow/core/common_runtime/direct_session.cc
index 07c1eafedc..5cef93c605 100644
--- a/tensorflow/core/common_runtime/direct_session.cc
+++ b/tensorflow/core/common_runtime/direct_session.cc
@@ -450,11 +450,13 @@ Status DirectSession::RunInternal(int64 step_id, const RunOptions& run_options,
   // Set up for collectives if the RunOption declares a key.
   if (run_options.experimental().collective_graph_key() > 0) {
     if (!collective_executor_mgr_) {
-      DeviceResolverLocal* drl = new DeviceResolverLocal(device_mgr_.get());
+      std::unique_ptr<DeviceResolverInterface> drl(
+          new DeviceResolverLocal(device_mgr_.get()));
+      std::unique_ptr<ParamResolverInterface> cprl(
+          new CollectiveParamResolverLocal(device_mgr_.get(), drl.get(),
+                                           "/job:localhost/replica:0/task:0"));
       collective_executor_mgr_.reset(new CollectiveExecutorMgr(
-          options_.config, device_mgr_.get(), drl,
-          new CollectiveParamResolverLocal(device_mgr_.get(), drl,
-                                           "/job:localhost/replica:0/task:0")));
+          options_.config, device_mgr_.get(), std::move(drl), std::move(cprl)));
     }
     run_state.collective_executor.reset(new CollectiveExecutor::Handle(
         collective_executor_mgr_->FindOrCreate(step_id), true /*inherit_ref*/));
diff --git a/tensorflow/core/distributed_runtime/BUILD b/tensorflow/core/distributed_runtime/BUILD
index ead698d787..9032823e17 100644
--- a/tensorflow/core/distributed_runtime/BUILD
+++ b/tensorflow/core/distributed_runtime/BUILD
@@ -145,9 +145,11 @@ tf_cc_test(
     deps = [
         ":session_mgr",
         ":worker_env",
+        "//tensorflow/core:framework",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
         "//tensorflow/core/distributed_runtime/rpc:rpc_rendezvous_mgr",
     ],
 )
@@ -226,6 +228,17 @@ tf_cc_test(
     ],
 )
 
+cc_library(
+    name = "cancellable_call",
+    hdrs = ["cancellable_call.h"],
+    deps = [
+        ":call_options",
+        ":worker_cache",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+    ],
+)
+
 tf_cc_test(
     name = "tensor_coding_test",
     size = "small",
@@ -392,6 +405,7 @@ cc_library(
     hdrs = ["master_env.h"],
     deps = [
         ":worker_cache",
+        "//tensorflow/core:framework",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:session_options",
     ],
@@ -452,11 +466,46 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "rpc_collective_executor_mgr",
+    srcs = ["rpc_collective_executor_mgr.cc"],
+    hdrs = ["rpc_collective_executor_mgr.h"],
+    deps = [
+        ":base_rendezvous_mgr",
+        ":collective_param_resolver_distributed",
+        ":collective_rma_distributed",
+        ":device_resolver_distributed",
+        ":worker_cache",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:worker_proto_cc",
+    ],
+)
+
+tf_cc_test(
+    name = "rpc_collective_executor_mgr_test",
+    srcs = ["rpc_collective_executor_mgr_test.cc"],
+    deps = [
+        ":collective_param_resolver_distributed",
+        ":device_resolver_distributed",
+        ":rpc_collective_executor_mgr",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:session_options",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
 cc_library(
     name = "collective_rma_distributed",
     srcs = ["collective_rma_distributed.cc"],
     hdrs = ["collective_rma_distributed.h"],
     deps = [
+        ":cancellable_call",
         ":worker_cache",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
@@ -492,6 +541,7 @@ cc_library(
     hdrs = ["collective_param_resolver_distributed.h"],
     deps = [
         ":call_options",
+        ":cancellable_call",
         ":device_resolver_distributed",
         ":worker_cache",
         "//tensorflow/core:core_cpu_internal",
diff --git a/tensorflow/core/distributed_runtime/cancellable_call.h b/tensorflow/core/distributed_runtime/cancellable_call.h
new file mode 100644
index 0000000000..05089c7d15
--- /dev/null
+++ b/tensorflow/core/distributed_runtime/cancellable_call.h
@@ -0,0 +1,65 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_CANCELLABLE_CALL_H_
+#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_CANCELLABLE_CALL_H_
+
+#include <string>
+#include "tensorflow/core/distributed_runtime/call_options.h"
+#include "tensorflow/core/distributed_runtime/worker_cache.h"
+#include "tensorflow/core/framework/cancellation.h"
+#include "tensorflow/core/platform/mutex.h"
+
+namespace tensorflow {
+
+// Supports client side cancellation of WorkerInterface calls via
+// registration with a CancellationManager.
+class CancellableCall {
+ public:
+  CancellableCall(CancellationManager* cancel_mgr, const string& remote_worker,
+                  WorkerCacheInterface* wc)
+      : cancel_mgr_(cancel_mgr),
+        remote_worker_(remote_worker),
+        wc_(wc),
+        wi_(wc_->CreateWorker(remote_worker_)) {}
+
+  virtual ~CancellableCall() { wc_->ReleaseWorker(remote_worker_, wi_); }
+
+  virtual void IssueCall(const StatusCallback& done) = 0;
+
+  void Start(const StatusCallback& done) {
+    CancellationToken token = cancel_mgr_->get_cancellation_token();
+    const bool not_yet_cancelled = cancel_mgr_->RegisterCallback(
+        token, [this, token]() { opts_.StartCancel(); });
+    if (not_yet_cancelled) {
+      IssueCall([this, token, done](const Status& s) {
+        cancel_mgr_->DeregisterCallback(token);
+        done(s);
+      });
+    } else {
+      done(errors::Cancelled("RPC Request was cancelled"));
+    }
+  }
+
+ protected:
+  mutable mutex mu_;
+  CancellationManager* const cancel_mgr_;  // Not owned
+  const string remote_worker_;
+  WorkerCacheInterface* const wc_;  // Not owned
+  WorkerInterface* const wi_;       // Owned by wc_, must be released.
+  CallOptions opts_;
+};
+
+}  // namespace tensorflow
+#endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_CANCELLABLE_CALL_H_
diff --git a/tensorflow/core/distributed_runtime/collective_param_resolver_distributed.cc b/tensorflow/core/distributed_runtime/collective_param_resolver_distributed.cc
index 7a93b54eae..612ac14e22 100644
--- a/tensorflow/core/distributed_runtime/collective_param_resolver_distributed.cc
+++ b/tensorflow/core/distributed_runtime/collective_param_resolver_distributed.cc
@@ -14,55 +14,13 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/core/distributed_runtime/collective_param_resolver_distributed.h"
 
-#include "tensorflow/core/distributed_runtime/call_options.h"
+#include "tensorflow/core/distributed_runtime/cancellable_call.h"
 #include "tensorflow/core/distributed_runtime/device_resolver_distributed.h"
 #include "tensorflow/core/distributed_runtime/worker_cache.h"
 #include "tensorflow/core/protobuf/config.pb.h"
 
-// TODO(tucker): When we're ready to enable collectives this const will
-// transition to a settable config member.
-static const char FLAGS_collective_group_leader[] =
-    "/job:worker/replica:0/task:0";
-
 namespace tensorflow {
 namespace {
-// Supports client side cancellation of WorkerInterface calls via
-// registration with a CancellationManager.  Note that ParamResolverInterface
-// calls are done on behalf of an Op execution which needs to abort if the
-// step in which it executes is cancelled.
-class CancellableCall {
- public:
-  CancellableCall(CancellationManager* cancel_mgr, const string& remote_worker,
-                  WorkerCacheInterface* wc)
-      : cancel_mgr_(cancel_mgr), remote_worker_(remote_worker), wc_(wc) {
-    wi_ = wc_->CreateWorker(remote_worker_);
-  }
-  virtual ~CancellableCall() { wc_->ReleaseWorker(remote_worker_, wi_); }
-
-  virtual void IssueCall(const StatusCallback& done) = 0;
-
-  void Start(const StatusCallback& done) {
-    CancellationToken token = cancel_mgr_->get_cancellation_token();
-    const bool not_yet_cancelled = cancel_mgr_->RegisterCallback(
-        token, [this, token]() { opts_.StartCancel(); });
-    if (not_yet_cancelled) {
-      IssueCall([this, token, done](const Status& s) {
-        cancel_mgr_->DeregisterCallback(token);
-        done(s);
-      });
-    } else {
-      done(errors::Cancelled("RPC Request was cancelled"));
-    }
-  }
-
- protected:
-  mutable mutex mu_;
-  CancellationManager* cancel_mgr_;  // Not owned
-  const string remote_worker_;
-  WorkerCacheInterface* wc_;  // Not owned
-  WorkerInterface* wi_;       // Owned by wc_, must be released.
-  CallOptions opts_;
-};
 
 class CompleteGroupCall : public CancellableCall {
  public:
@@ -126,9 +84,9 @@ CollectiveParamResolverDistributed::CollectiveParamResolverDistributed(
     const string& task_name)
     : CollectiveParamResolverLocal(dev_mgr, dev_resolver, task_name),
       worker_cache_(worker_cache),
-      group_leader_(task_name == FLAGS_collective_group_leader
+      group_leader_(task_name == config.experimental().collective_group_leader()
                         ? ""
-                        : FLAGS_collective_group_leader) {}
+                        : config.experimental().collective_group_leader()) {}
 
 void CollectiveParamResolverDistributed::CompleteParamsAsync(
     const string& device, CollectiveParams* cp, CancellationManager* cancel_mgr,
diff --git a/tensorflow/core/distributed_runtime/collective_param_resolver_distributed_test.cc b/tensorflow/core/distributed_runtime/collective_param_resolver_distributed_test.cc
index 95a010286d..4eed856759 100644
--- a/tensorflow/core/distributed_runtime/collective_param_resolver_distributed_test.cc
+++ b/tensorflow/core/distributed_runtime/collective_param_resolver_distributed_test.cc
@@ -147,10 +147,9 @@ class DeviceResDistTest : public ::testing::Test {
     ConfigProto config;
     for (int w = 0; w < num_workers; ++w) {
       string name = strings::StrCat("/job:worker/replica:0/task:", w);
-      // TODO(tucker): When config option becomes available, set here.
-      // if (w == 0) {
-      //   config.set_collective_group_leader(name);
-      // }
+      if (w == 0) {
+        config.mutable_experimental()->set_collective_group_leader(name);
+      }
       DefineWorker(config, name, device_type, num_devices);
     }
   }
diff --git a/tensorflow/core/distributed_runtime/collective_rma_distributed.cc b/tensorflow/core/distributed_runtime/collective_rma_distributed.cc
index c15878bfd3..d4c47cab49 100644
--- a/tensorflow/core/distributed_runtime/collective_rma_distributed.cc
+++ b/tensorflow/core/distributed_runtime/collective_rma_distributed.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/device_mgr.h"
 #include "tensorflow/core/common_runtime/dma_helper.h"
 #include "tensorflow/core/common_runtime/process_util.h"
+#include "tensorflow/core/distributed_runtime/cancellable_call.h"
 #include "tensorflow/core/distributed_runtime/worker_cache.h"
 #include "tensorflow/core/platform/protobuf_internal.h"
 #include "tensorflow/core/protobuf/transport_options.pb.h"
@@ -28,45 +29,6 @@ namespace tensorflow {
 
 namespace {
 
-// Supports client side cancellation of WorkerInterface calls via
-// registration with a CancellationManager.
-//
-// TODO(tucker): Maybe unify this with CancellableCall in
-// collective_param_resolver_distributed.cc.
-class CancellableCall {
- public:
-  CancellableCall(CancellationManager* cancel_mgr, const string& remote_worker,
-                  WorkerCacheInterface* wc)
-      : cancel_mgr_(cancel_mgr), remote_worker_(remote_worker), wc_(wc) {
-    wi_ = wc_->CreateWorker(remote_worker_);
-  }
-  virtual ~CancellableCall() { wc_->ReleaseWorker(remote_worker_, wi_); }
-
-  virtual void IssueCall(const StatusCallback& done) = 0;
-
-  void Start(const StatusCallback& done) {
-    CancellationToken token = cancel_mgr_->get_cancellation_token();
-    const bool not_yet_cancelled = cancel_mgr_->RegisterCallback(
-        token, [this, token]() { opts_.StartCancel(); });
-    if (not_yet_cancelled) {
-      IssueCall([this, token, done](const Status& s) {
-        cancel_mgr_->DeregisterCallback(token);
-        done(s);
-      });
-    } else {
-      done(errors::Cancelled("RPC Request was cancelled"));
-    }
-  }
-
- protected:
-  mutable mutex mu_;
-  CancellationManager* cancel_mgr_;  // Not owned
-  const string remote_worker_;
-  WorkerCacheInterface* wc_;  // Not owned
-  WorkerInterface* wi_;       // Owned by wc_, must be released.
-  CallOptions opts_;
-};
-
 class RecvBufCall : public CancellableCall {
  public:
   RecvBufCall(int64 step_id, const string& peer_device, const string& peer_task,
@@ -119,7 +81,7 @@ void CollectiveRemoteAccessDistributed::RecvFromPeer(
   };
   State* state = new State;
 
-  // Logic to be executed on the RecvBufferAsync callback.
+  // Logic to be executed on the RecvBufAsync callback.
   auto recv_buf_callback = [this, state, peer_task, to_device, to_alloc_attr,
                             to_device_ctx, to_tensor, done](const Status& s) {
     if (s.ok()) {
diff --git a/tensorflow/core/distributed_runtime/graph_mgr.cc b/tensorflow/core/distributed_runtime/graph_mgr.cc
index 8447c55bf4..e2f13df19f 100644
--- a/tensorflow/core/distributed_runtime/graph_mgr.cc
+++ b/tensorflow/core/distributed_runtime/graph_mgr.cc
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include <vector>
 
+#include "tensorflow/core/common_runtime/build_graph_options.h"
 #include "tensorflow/core/common_runtime/constant_folding.h"
 #include "tensorflow/core/common_runtime/debugger_state_interface.h"
 #include "tensorflow/core/common_runtime/device.h"
@@ -30,6 +31,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/step_stats_collector.h"
 #include "tensorflow/core/distributed_runtime/rendezvous_mgr_interface.h"
 #include "tensorflow/core/framework/cancellation.h"
+#include "tensorflow/core/framework/collective.h"
 #include "tensorflow/core/framework/log_memory.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/node_def_util.h"
@@ -118,9 +120,11 @@ Status GraphMgr::DecorateAndPublishGraphForDebug(
 Status GraphMgr::InitItem(const string& session, const GraphDef& gdef,
                           const GraphOptions& graph_options,
                           const DebugOptions& debug_options,
+                          int64 collective_graph_key,
                           DistributedFunctionLibraryRuntime* cluster_flr,
                           Item* item) {
   item->session = session;
+  item->collective_graph_key = collective_graph_key;
   item->lib_def.reset(
       new FunctionLibraryDefinition(OpRegistry::Global(), gdef.library()));
 
@@ -280,11 +284,12 @@ Status GraphMgr::InitItem(const string& session, const GraphDef& gdef,
 Status GraphMgr::Register(const string& session, const GraphDef& gdef,
                           const GraphOptions& graph_options,
                           const DebugOptions& debug_options,
+                          int64 collective_graph_key,
                           DistributedFunctionLibraryRuntime* cluster_flr,
                           string* handle) {
   Item* item = new Item;
-  Status s =
-      InitItem(session, gdef, graph_options, debug_options, cluster_flr, item);
+  Status s = InitItem(session, gdef, graph_options, debug_options,
+                      collective_graph_key, cluster_flr, item);
   if (!s.ok()) {
     item->Unref();
     return s;
@@ -415,7 +420,12 @@ void GraphMgr::ExecuteAsync(const string& handle, const int64 step_id,
 
   RemoteRendezvous* rendezvous = worker_env_->rendezvous_mgr->Find(step_id);
   Status s = rendezvous->Initialize(session);
-
+  CollectiveExecutor::Handle* ce_handle =
+      item->collective_graph_key != BuildGraphOptions::kNoCollectiveGraphKey
+          ? new CollectiveExecutor::Handle(
+                worker_env_->collective_executor_mgr->FindOrCreate(step_id),
+                true)
+          : nullptr;
   // Sends values specified by the caller.
   if (s.ok()) {
     std::vector<string> keys;
@@ -431,22 +441,25 @@ void GraphMgr::ExecuteAsync(const string& handle, const int64 step_id,
 
   if (!s.ok()) {
     done(s);
+    delete ce_handle;
     item->Unref();
     rendezvous->Unref();
     return;
   }
 
-  StartParallelExecutors(handle, step_id, item, rendezvous, collector,
-                         cost_graph, cancellation_manager,
-                         [item, rendezvous, done](const Status& s) {
+  StartParallelExecutors(handle, step_id, item, rendezvous, ce_handle,
+                         collector, cost_graph, cancellation_manager,
+                         [item, rendezvous, ce_handle, done](const Status& s) {
                            done(s);
                            rendezvous->Unref();
                            item->Unref();
+                           delete ce_handle;
                          });
 }
 
 void GraphMgr::StartParallelExecutors(const string& handle, int64 step_id,
                                       Item* item, Rendezvous* rendezvous,
+                                      CollectiveExecutor::Handle* ce_handle,
                                       StepStatsCollector* collector,
                                       CostGraphDef* cost_graph,
                                       CancellationManager* cancellation_manager,
@@ -471,6 +484,7 @@ void GraphMgr::StartParallelExecutors(const string& handle, int64 step_id,
     args.step_id = ++next_id_;
   }
   args.rendezvous = rendezvous;
+  args.collective_executor = ce_handle ? ce_handle->get() : nullptr;
   args.cancellation_manager = cancellation_manager;
   args.stats_collector = collector;
   args.step_container = step_container;
diff --git a/tensorflow/core/distributed_runtime/graph_mgr.h b/tensorflow/core/distributed_runtime/graph_mgr.h
index cc35264b8f..5196046c19 100644
--- a/tensorflow/core/distributed_runtime/graph_mgr.h
+++ b/tensorflow/core/distributed_runtime/graph_mgr.h
@@ -25,6 +25,7 @@ limitations under the License.
 #include "tensorflow/core/distributed_runtime/message_wrappers.h"
 #include "tensorflow/core/distributed_runtime/worker_env.h"
 #include "tensorflow/core/framework/cancellation.h"
+#include "tensorflow/core/framework/collective.h"
 #include "tensorflow/core/framework/cost_graph.pb.h"
 #include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/lib/core/refcount.h"
@@ -75,7 +76,7 @@ class GraphMgr {
   // reference to cluster_flr to do cross process function calls.
   Status Register(const string& session, const GraphDef& gdef,
                   const GraphOptions& graph_options,
-                  const DebugOptions& debug_options,
+                  const DebugOptions& debug_options, int64 collective_graph_key,
                   DistributedFunctionLibraryRuntime* cluster_flr,
                   string* handle);
 
@@ -138,6 +139,8 @@ class GraphMgr {
     // Used to deregister a cost model when cost model is required in graph
     // manager.
     GraphMgr* graph_mgr;
+
+    int64 collective_graph_key;
   };
 
   const WorkerEnv* worker_env_;  // Not owned.
@@ -161,6 +164,7 @@ class GraphMgr {
 
   void StartParallelExecutors(const string& handle, int64 step_id, Item* item,
                               Rendezvous* rendezvous,
+                              CollectiveExecutor::Handle* ce_handle,
                               StepStatsCollector* collector,
                               CostGraphDef* cost_graph,
                               CancellationManager* cancellation_manager,
@@ -175,7 +179,7 @@ class GraphMgr {
 
   Status InitItem(const string& session, const GraphDef& gdef,
                   const GraphOptions& graph_options,
-                  const DebugOptions& debug_options,
+                  const DebugOptions& debug_options, int64 collective_graph_key,
                   DistributedFunctionLibraryRuntime* cluster_flr, Item* item);
 
   Status DecorateAndPublishGraphForDebug(const DebugOptions& debug_options,
diff --git a/tensorflow/core/distributed_runtime/master_env.h b/tensorflow/core/distributed_runtime/master_env.h
index 16f4d93c8b..da26c42aca 100644
--- a/tensorflow/core/distributed_runtime/master_env.h
+++ b/tensorflow/core/distributed_runtime/master_env.h
@@ -26,6 +26,7 @@ limitations under the License.
 
 namespace tensorflow {
 
+class CollectiveExecutorMgrInterface;
 class Device;
 class DeviceSet;
 class Env;
@@ -90,6 +91,10 @@ struct MasterEnv {
   std::function<Status(const WorkerCacheFactoryOptions&,
                        WorkerCacheInterface**)>
       worker_cache_factory;
+
+  // Generates per-step CollectiveExecutors and has access to utilities
+  // supporting collective operations.
+  CollectiveExecutorMgrInterface* collective_executor_mgr = nullptr;
 };
 
 }  // end namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/master_session.cc b/tensorflow/core/distributed_runtime/master_session.cc
index e29bb76ddf..d34ca53f73 100644
--- a/tensorflow/core/distributed_runtime/master_session.cc
+++ b/tensorflow/core/distributed_runtime/master_session.cc
@@ -27,6 +27,7 @@ limitations under the License.
 #include "tensorflow/core/distributed_runtime/worker_cache.h"
 #include "tensorflow/core/distributed_runtime/worker_interface.h"
 #include "tensorflow/core/framework/allocation_description.pb.h"
+#include "tensorflow/core/framework/collective.h"
 #include "tensorflow/core/framework/cost_graph.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/node_def_util.h"
@@ -69,6 +70,7 @@ class MasterSession::ReffedClientGraph : public core::RefCounted {
                     bool is_partial, WorkerCacheInterface* worker_cache,
                     bool should_deregister)
       : session_handle_(handle),
+        bg_opts_(bopts),
         client_graph_(std::move(cg)),
         session_opts_(session_opts),
         is_partial_(is_partial),
@@ -100,6 +102,8 @@ class MasterSession::ReffedClientGraph : public core::RefCounted {
 
   const CallableOptions& callable_options() { return callable_opts_; }
 
+  const BuildGraphOptions& build_graph_options() { return bg_opts_; }
+
   std::unique_ptr<ProfileHandler> GetProfileHandler(uint64 step,
                                                     int64 execution_count,
                                                     const RunOptions& ropts) {
@@ -225,6 +229,7 @@ class MasterSession::ReffedClientGraph : public core::RefCounted {
 
  private:
   const string session_handle_;
+  const BuildGraphOptions bg_opts_;
   const std::unique_ptr<ClientGraph> client_graph_;
   const SessionOptions session_opts_;
   const bool is_partial_;
@@ -444,6 +449,7 @@ Status MasterSession::ReffedClientGraph::DoRegisterPartitions(
     *c->req.mutable_graph_options() = session_opts_.config.graph_options();
     *c->req.mutable_debug_options() =
         callable_opts_.run_options().debug_options();
+    c->req.set_collective_graph_key(bg_opts_.collective_graph_key);
     VLOG(2) << "Register " << c->req.graph_def().DebugString();
     auto cb = [c, &done](const Status& s) {
       c->status = s;
@@ -1065,6 +1071,9 @@ void BuildBuildGraphOptions(const RunStepRequestWrapper& req,
     *callable_opts->mutable_run_options()->mutable_debug_options() =
         req.options().debug_options();
   }
+
+  opts->collective_graph_key =
+      req.options().experimental().collective_graph_key();
 }
 
 void BuildBuildGraphOptions(const PartialRunSetupRequest& req,
@@ -1102,6 +1111,10 @@ uint64 HashBuildGraphOptions(const BuildGraphOptions& opts) {
     h = Hash64(watch_summary.c_str(), watch_summary.size(), h);
   }
 
+  if (opts.collective_graph_key != BuildGraphOptions::kNoCollectiveGraphKey) {
+    h = Hash64Combine(opts.collective_graph_key, h);
+  }
+
   return h;
 }
 
@@ -1118,6 +1131,9 @@ string BuildGraphOptionsString(const BuildGraphOptions& opts) {
   for (const string& name : opts.callable_options.fetch()) {
     strings::StrAppend(&buf, " FeE: ", name);
   }
+  if (opts.collective_graph_key != BuildGraphOptions::kNoCollectiveGraphKey) {
+    strings::StrAppend(&buf, "\nGK: ", opts.collective_graph_key);
+  }
   strings::StrAppend(&buf, "\n");
   return buf;
 }
@@ -1430,11 +1446,35 @@ void MasterSession::ClearRunsTable(std::vector<ReffedClientGraph*>* to_unref,
   rcg_map->clear();
 }
 
-namespace {
-uint64 MakeStepId() {
-  return (random::New64() & ((1uLL << 56) - 1)) | (1uLL << 56);
+uint64 MasterSession::NewStepId(int64 graph_key) {
+  if (graph_key == BuildGraphOptions::kNoCollectiveGraphKey) {
+    // StepId must leave the most-significant 7 bits empty for future use.
+    return random::New64() & (((1uLL << 56) - 1) | (1uLL << 56));
+  } else {
+    uint64 step_id = env_->collective_executor_mgr->NextStepId(graph_key);
+    int32 retry_count = 0;
+    while (step_id == CollectiveExecutor::kInvalidId) {
+      Notification note;
+      Status status;
+      env_->collective_executor_mgr->RefreshStepIdSequenceAsync(
+          graph_key, [&status, &note](const Status& s) {
+            status = s;
+            note.Notify();
+          });
+      note.WaitForNotification();
+      if (!status.ok()) {
+        LOG(ERROR) << "Bad status from "
+                      "collective_executor_mgr->RefreshStepIdSequence: "
+                   << status << ".  Retrying.";
+        int64 delay_micros = std::min(60000000LL, 1000000LL * ++retry_count);
+        Env::Default()->SleepForMicroseconds(delay_micros);
+      } else {
+        step_id = env_->collective_executor_mgr->NextStepId(graph_key);
+      }
+    }
+    return step_id;
+  }
 }
-}  // namespace
 
 Status MasterSession::PartialRunSetup(const PartialRunSetupRequest* req,
                                       PartialRunSetupResponse* resp) {
@@ -1456,15 +1496,13 @@ Status MasterSession::PartialRunSetup(const PartialRunSetupRequest* req,
   // Prepare.
   BuildGraphOptions opts;
   BuildBuildGraphOptions(*req, &opts);
-  int64 count;
+  int64 count = 0;
   TF_RETURN_IF_ERROR(StartStep(opts, true, &rcg, &count));
-  // Keeps the highest 8 bits 0x01: we reserve some bits of the
-  // step_id for future use.
-  const uint64 step_id = MakeStepId();
-  TRACEPRINTF("stepid %llu", step_id);
 
   rcg->Ref();
-  RunState* run_state = new RunState(inputs, outputs, rcg, step_id, count);
+  RunState* run_state =
+      new RunState(inputs, outputs, rcg,
+                   NewStepId(BuildGraphOptions::kNoCollectiveGraphKey), count);
   {
     mutex_lock l(mu_);
     partial_runs_.emplace(
@@ -1566,6 +1604,13 @@ Status MasterSession::DoPartialRun(CallOptions* opts,
     }
     run_state = it->second.get();
   }
+  // CollectiveOps are not supported in partial runs.
+  if (req.options().experimental().collective_graph_key() !=
+      BuildGraphOptions::kNoCollectiveGraphKey) {
+    return errors::InvalidArgument(
+        "PartialRun does not support Collective ops.  collective_graph_key "
+        "must be kNoCollectiveGraphKey.");
+  }
 
   // If this is the first partial run, initialize the PerStepState.
   if (!run_state->step_started) {
@@ -1743,7 +1788,11 @@ Status MasterSession::PostRunCleanup(MasterSession::ReffedClientGraph* rcg,
   Status s = run_status;
   if (s.ok()) {
     pss->end_micros = Env::Default()->NowMicros();
-
+    if (rcg->build_graph_options().collective_graph_key !=
+        BuildGraphOptions::kNoCollectiveGraphKey) {
+      env_->collective_executor_mgr->RetireStepId(
+          rcg->build_graph_options().collective_graph_key, step_id);
+    }
     // Schedule post-processing and cleanup to be done asynchronously.
     rcg->ProcessStats(step_id, pss, ph.get(), run_options, out_run_metadata);
   } else if (errors::IsCancelled(s)) {
@@ -1801,7 +1850,7 @@ Status MasterSession::DoRunWithLocalExecution(
 
   // Keeps the highest 8 bits 0x01: we reserve some bits of the
   // step_id for future use.
-  const uint64 step_id = MakeStepId();
+  uint64 step_id = NewStepId(bgopts.collective_graph_key);
   TRACEPRINTF("stepid %llu", step_id);
 
   std::unique_ptr<ProfileHandler> ph;
@@ -1865,9 +1914,8 @@ Status MasterSession::DoRunCallable(CallOptions* opts, ReffedClientGraph* rcg,
   // Prepare.
   int64 count = rcg->get_and_increment_execution_count();
 
-  // Keeps the highest 8 bits 0x01: we reserve some bits of the
-  // step_id for future use.
-  const uint64 step_id = MakeStepId();
+  const uint64 step_id =
+      NewStepId(rcg->build_graph_options().collective_graph_key);
   TRACEPRINTF("stepid %llu", step_id);
 
   const RunOptions& run_options = rcg->callable_options().run_options();
diff --git a/tensorflow/core/distributed_runtime/master_session.h b/tensorflow/core/distributed_runtime/master_session.h
index ec34e20b79..449a6d3e3c 100644
--- a/tensorflow/core/distributed_runtime/master_session.h
+++ b/tensorflow/core/distributed_runtime/master_session.h
@@ -141,6 +141,8 @@ class MasterSession : public core::RefCounted {
 
   std::atomic<int64> partial_run_handle_counter_ = {0};
 
+  uint64 NewStepId(int64 graph_key);
+
   mutex mu_;
   std::unique_ptr<GraphExecutionState> execution_state_ GUARDED_BY(mu_);
   int64 graph_version_;
@@ -175,6 +177,7 @@ class MasterSession : public core::RefCounted {
     std::unordered_map<string, bool> pending_outputs;  // true if fetched
     ReffedClientGraph* rcg = nullptr;
     uint64 step_id;
+    int64 collective_graph_key;
     int64 count = 0;
     PerStepState pss;
     std::unique_ptr<ProfileHandler> ph;
diff --git a/tensorflow/core/distributed_runtime/rpc/BUILD b/tensorflow/core/distributed_runtime/rpc/BUILD
index 4b2747f26d..2eadfcde54 100644
--- a/tensorflow/core/distributed_runtime/rpc/BUILD
+++ b/tensorflow/core/distributed_runtime/rpc/BUILD
@@ -274,11 +274,14 @@ cc_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:framework_internal",
         "//tensorflow/core:lib",
+        "//tensorflow/core/distributed_runtime:collective_param_resolver_distributed",
+        "//tensorflow/core/distributed_runtime:device_resolver_distributed",
         "//tensorflow/core/distributed_runtime:graph_mgr",
         "//tensorflow/core/distributed_runtime:local_master",
         "//tensorflow/core/distributed_runtime:master",
         "//tensorflow/core/distributed_runtime:master_env",
         "//tensorflow/core/distributed_runtime:master_session",
+        "//tensorflow/core/distributed_runtime:rpc_collective_executor_mgr",
         "//tensorflow/core/distributed_runtime:server_lib",
         "//tensorflow/core/distributed_runtime:session_mgr",
         "//tensorflow/core/distributed_runtime:worker_env",
diff --git a/tensorflow/core/distributed_runtime/rpc/eager/eager_grpc_server_lib.h b/tensorflow/core/distributed_runtime/rpc/eager/eager_grpc_server_lib.h
index f5dc4c831d..9b863ccee5 100644
--- a/tensorflow/core/distributed_runtime/rpc/eager/eager_grpc_server_lib.h
+++ b/tensorflow/core/distributed_runtime/rpc/eager/eager_grpc_server_lib.h
@@ -74,7 +74,7 @@ class EagerGrpcServer : public GrpcServer {
           this->eager_service_.reset(
               new eager::GrpcEagerServiceImpl(worker_env, server_builder));
         },
-        nullptr));
+        nullptr, nullptr));
 
     worker_session_ = WorkerSession::CreateWithBorrowedDeviceMgr(
         "", worker_name_,
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
index c0a9b43bf4..43dbe20836 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
@@ -27,6 +27,8 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
 #include "tensorflow/core/common_runtime/process_util.h"
+#include "tensorflow/core/distributed_runtime/collective_param_resolver_distributed.h"
+#include "tensorflow/core/distributed_runtime/device_resolver_distributed.h"
 #include "tensorflow/core/distributed_runtime/graph_mgr.h"
 #include "tensorflow/core/distributed_runtime/local_master.h"
 #include "tensorflow/core/distributed_runtime/master.h"
@@ -38,6 +40,7 @@ limitations under the License.
 #include "tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.h"
 #include "tensorflow/core/distributed_runtime/rpc/grpc_worker_service.h"
 #include "tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.h"
+#include "tensorflow/core/distributed_runtime/rpc_collective_executor_mgr.h"
 #include "tensorflow/core/distributed_runtime/server_lib.h"
 #include "tensorflow/core/distributed_runtime/worker_env.h"
 #include "tensorflow/core/framework/op.h"
@@ -106,6 +109,7 @@ GrpcServer::~GrpcServer() {
 Status GrpcServer::Init(
     ServiceInitFunction service_func,
     const RendezvousMgrCreationFunction& rendezvous_mgr_func,
+    const CollectiveMgrCreationFunction& collective_mgr_func,
     const WorkerCreationFunction& worker_func,
     const StatsPublisherFactory& stats_factory) {
   mutex_lock l(mu_);
@@ -204,6 +208,26 @@ Status GrpcServer::Init(
       WorkerCacheFactory(worker_cache_factory_options, &worker_cache));
   CHECK_NE(nullptr, worker_cache);
 
+  if (collective_mgr_func) {
+    worker_env_.collective_executor_mgr =
+        collective_mgr_func(config, &worker_env_, worker_cache);
+    if (!worker_env_.collective_executor_mgr) {
+      return errors::Internal(
+          "collective_mgr_func did not return CollectiveExecutorMgr");
+    }
+  } else {
+    std::unique_ptr<DeviceResolverDistributed> dev_resolver(
+        new DeviceResolverDistributed(worker_env_.device_mgr, worker_cache,
+                                      default_worker_name));
+    std::unique_ptr<CollectiveParamResolverDistributed> param_resolver(
+        new CollectiveParamResolverDistributed(config, worker_env_.device_mgr,
+                                               dev_resolver.get(), worker_cache,
+                                               default_worker_name));
+    worker_env_.collective_executor_mgr = new RpcCollectiveExecutorMgr(
+        config, worker_env_.device_mgr, std::move(dev_resolver),
+        std::move(param_resolver), worker_cache, default_worker_name);
+  }
+
   // Set up worker environment.
   worker_env_.session_mgr = new SessionMgr(
       &worker_env_, SessionMgr::WorkerNameFromServerDef(server_def_),
@@ -246,18 +270,21 @@ Status GrpcServer::Init(
 Status GrpcServer::Init(
     ServiceInitFunction service_func,
     const RendezvousMgrCreationFunction& rendezvous_mgr_func,
+    const CollectiveMgrCreationFunction& collective_mgr_func,
     const WorkerCreationFunction& worker_func) {
-  return Init(std::move(service_func), rendezvous_mgr_func, worker_func,
-              CreateNoOpStatsPublisher);
+  return Init(std::move(service_func), rendezvous_mgr_func, collective_mgr_func,
+              worker_func, CreateNoOpStatsPublisher);
 }
 
 Status GrpcServer::Init(
     ServiceInitFunction service_func,
-    const RendezvousMgrCreationFunction& rendezvous_mgr_func) {
-  return Init(service_func, rendezvous_mgr_func, nullptr);
+    const RendezvousMgrCreationFunction& rendezvous_mgr_func,
+    const CollectiveMgrCreationFunction& collective_mgr_func) {
+  return Init(std::move(service_func), rendezvous_mgr_func, collective_mgr_func,
+              nullptr);
 }
 
-Status GrpcServer::Init() { return Init(nullptr, nullptr, nullptr); }
+Status GrpcServer::Init() { return Init(nullptr, nullptr, nullptr, nullptr); }
 
 Status GrpcServer::ParseChannelSpec(const WorkerCacheFactoryOptions& options,
                                     GrpcChannelSpec* channel_spec) {
@@ -403,7 +430,7 @@ Status GrpcServer::Create(const ServerDef& server_def, Env* env,
   std::unique_ptr<GrpcServer> ret(
       new GrpcServer(server_def, env == nullptr ? Env::Default() : env));
   ServiceInitFunction service_func = nullptr;
-  TF_RETURN_IF_ERROR(ret->Init(service_func, NewRpcRendezvousMgr));
+  TF_RETURN_IF_ERROR(ret->Init(service_func, NewRpcRendezvousMgr, nullptr));
   *out_server = std::move(ret);
   return Status::OK();
 }
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h
index b1c2eda0cf..ca9946cafc 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h
@@ -29,6 +29,7 @@ limitations under the License.
 #include "tensorflow/core/distributed_runtime/server_lib.h"
 #include "tensorflow/core/distributed_runtime/session_mgr.h"
 #include "tensorflow/core/distributed_runtime/worker_env.h"
+#include "tensorflow/core/framework/collective.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/platform/env.h"
 
@@ -41,6 +42,11 @@ class Master;
 typedef std::function<RendezvousMgrInterface*(const WorkerEnv*)>
     RendezvousMgrCreationFunction;
 
+// function that creates a CollectiveExecutorMgr.
+typedef std::function<CollectiveExecutorMgrInterface*(
+    const ConfigProto&, const WorkerEnv*, WorkerCacheInterface*)>
+    CollectiveMgrCreationFunction;
+
 // function that registers a service to the server. The service needs to
 // be registered before builder.BuildAndStart().
 typedef std::function<void(const WorkerEnv*, ::grpc::ServerBuilder*)>
@@ -71,15 +77,18 @@ class GrpcServer : public ServerInterface {
  protected:
   Status Init(ServiceInitFunction service_func,
               const RendezvousMgrCreationFunction& rendezvous_mgr_func,
+              const CollectiveMgrCreationFunction& collective_mgr_func,
               const WorkerCreationFunction& worker_func,
               const StatsPublisherFactory& stats_factory);
 
   Status Init(ServiceInitFunction service_func,
               const RendezvousMgrCreationFunction& rendezvous_mgr_func,
+              const CollectiveMgrCreationFunction& collective_mgr_func,
               const WorkerCreationFunction& worker_func);
 
   Status Init(ServiceInitFunction service_func,
-              const RendezvousMgrCreationFunction& rendezvous_mgr_func);
+              const RendezvousMgrCreationFunction& rendezvous_mgr_func,
+              const CollectiveMgrCreationFunction& collective_mgr_func);
 
   Status Init();
 
diff --git a/tensorflow/core/distributed_runtime/rpc_collective_executor_mgr.cc b/tensorflow/core/distributed_runtime/rpc_collective_executor_mgr.cc
new file mode 100644
index 0000000000..5eeed6e382
--- /dev/null
+++ b/tensorflow/core/distributed_runtime/rpc_collective_executor_mgr.cc
@@ -0,0 +1,142 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/distributed_runtime/rpc_collective_executor_mgr.h"
+
+#include "tensorflow/core/common_runtime/base_collective_executor.h"
+#include "tensorflow/core/common_runtime/collective_executor_mgr.h"
+#include "tensorflow/core/common_runtime/collective_rma_local.h"
+#include "tensorflow/core/distributed_runtime/collective_param_resolver_distributed.h"
+#include "tensorflow/core/distributed_runtime/collective_rma_distributed.h"
+#include "tensorflow/core/distributed_runtime/device_resolver_distributed.h"
+#include "tensorflow/core/distributed_runtime/worker_cache.h"
+#include "tensorflow/core/lib/random/random.h"
+
+namespace tensorflow {
+
+RpcCollectiveExecutorMgr::RpcCollectiveExecutorMgr(
+    const ConfigProto& config, const DeviceMgr* dev_mgr,
+    std::unique_ptr<DeviceResolverDistributed> dev_resolver,
+    std::unique_ptr<CollectiveParamResolverDistributed> param_resolver,
+    WorkerCacheInterface* worker_cache, const string& task_name)
+    : CollectiveExecutorMgr(config, dev_mgr, std::move(dev_resolver),
+                            std::move(param_resolver)),
+      worker_cache_(worker_cache),
+      task_name_(task_name) {
+  group_leader_ = (task_name == config.experimental().collective_group_leader())
+                      ? ""
+                      : config.experimental().collective_group_leader();
+}
+
+RpcCollectiveExecutorMgr::~RpcCollectiveExecutorMgr() {
+  for (auto it : sequence_table_) {
+    delete it.second;
+  }
+}
+
+CollectiveExecutor* RpcCollectiveExecutorMgr::Create(int64 step_id) {
+  CollectiveRemoteAccessDistributed* rma =
+      new CollectiveRemoteAccessDistributed(dev_mgr_, dev_resolver_.get(),
+                                            worker_cache_, step_id);
+  return new BaseCollectiveExecutor(this, rma, step_id, dev_mgr_);
+}
+
+namespace {
+// StepId must leave the most-significant 7 bits empty for future use.
+static const int64 kStepIdMask = (((1uLL << 56) - 1) | (1uLL << 56));
+
+int64 NewRandomStepId() {
+  int64 step_id = random::New64();
+  // Leave the most-significant 7 bits clear for future use.
+  step_id &= kStepIdMask;
+  return step_id;
+}
+}  // namespace
+
+void RpcCollectiveExecutorMgr::RefreshStepIdSequenceAsync(
+    int64 graph_key, const StatusCallback& done) {
+  if (group_leader_.empty()) {
+    // This task is the group leader: restart the sequence at a random id.
+    mutex_lock l(sequence_mu_);
+    GraphKeySequence*& gks = sequence_table_[graph_key];
+    if (gks == nullptr) gks = new GraphKeySequence(graph_key);
+    gks->next_step_id_ = NewRandomStepId();
+    done(Status::OK());
+  } else {
+    WorkerInterface* wi = worker_cache_->CreateWorker(group_leader_);
+    if (wi == nullptr) {
+      done(Status(error::INTERNAL,
+                  "No worker available for group leader " + group_leader_));
+      return;
+    }
+    GetStepSequenceRequest* req = new GetStepSequenceRequest;
+    GetStepSequenceResponse* resp = new GetStepSequenceResponse;
+    req->add_graph_key(graph_key);
+    wi->GetStepSequenceAsync(
+        req, resp, [this, wi, req, resp, done](const Status& s) {
+          if (!s.ok()) {
+            LOG(ERROR) << "Bad response [" << s
+                       << "] from GetStepSequenceAsync call to "
+                       << group_leader_;
+          }
+          done(s.ok() ? UpdateStepSequences(*resp) : s);
+          delete req;
+          delete resp;
+          // Hand the worker back; CreateWorker transfers it to the caller.
+          worker_cache_->ReleaseWorker(group_leader_, wi);
+        });
+  }
+}
+
+Status RpcCollectiveExecutorMgr::UpdateStepSequences(
+    const GetStepSequenceResponse& resp) {
+  mutex_lock l(sequence_mu_);
+  for (const StepSequence& ss : resp.step_sequence()) {
+    GraphKeySequence* gks = nullptr;
+    auto it = sequence_table_.find(ss.graph_key());
+    if (it == sequence_table_.end()) {
+      gks = new GraphKeySequence(ss.graph_key());
+      sequence_table_[ss.graph_key()] = gks;
+    } else {
+      gks = it->second;
+    }
+    gks->next_step_id_ = ss.next_step_id();
+  }
+  return Status::OK();
+}
+
+int64 RpcCollectiveExecutorMgr::NextStepId(int64 graph_key) {
+  mutex_lock l(sequence_mu_);
+  auto it = sequence_table_.find(graph_key);
+  if (it != sequence_table_.end()) {
+    return it->second->next_step_id_;
+  }
+  return CollectiveExecutor::kInvalidId;
+}
+
+void RpcCollectiveExecutorMgr::RetireStepId(int64 graph_key, int64 step_id) {
+  mutex_lock l(sequence_mu_);
+  auto it = sequence_table_.find(graph_key);
+  if (it != sequence_table_.end()) {
+    if (step_id == it->second->next_step_id_) {
+      it->second->next_step_id_ = (it->second->next_step_id_ + 1) & kStepIdMask;
+    } else {
+      it->second->next_step_id_ = CollectiveExecutor::kInvalidId;
+    }
+  } else {
+    LOG(ERROR) << "Failed to find graph_key " << graph_key << " to retire.";
+  }
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/rpc_collective_executor_mgr.h b/tensorflow/core/distributed_runtime/rpc_collective_executor_mgr.h
new file mode 100644
index 0000000000..e9f3f0ebe8
--- /dev/null
+++ b/tensorflow/core/distributed_runtime/rpc_collective_executor_mgr.h
@@ -0,0 +1,79 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_COLLECTIVE_EXECUTOR_MGR_H_
+#define TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_COLLECTIVE_EXECUTOR_MGR_H_
+
+#include "tensorflow/core/common_runtime/collective_executor_mgr.h"
+#include "tensorflow/core/framework/collective.h"
+
+namespace tensorflow {
+class CollectiveParamResolverDistributed;
+class ConfigProto;
+class DeviceMgr;
+class DeviceResolverDistributed;
+class WorkerCacheInterface;
+class GetStepSequenceRequest;
+class GetStepSequenceResponse;
+
+// An implementation of CollectiveExecutorMgr for a distributed environment
+// that uses WorkerInterface::RecvBufAsync to route data transfers over RPCs.
+//
+// In some execution environments it may be possible to implement a
+// higher-performance solution and use it in place of this class.
+class RpcCollectiveExecutorMgr : public CollectiveExecutorMgr {
+ public:
+  RpcCollectiveExecutorMgr(
+      const ConfigProto& config, const DeviceMgr* dev_mgr,
+      std::unique_ptr<DeviceResolverDistributed> dev_resolver,
+      std::unique_ptr<CollectiveParamResolverDistributed> param_resolver,
+      WorkerCacheInterface* worker_cache, const string& task_name);
+
+  virtual ~RpcCollectiveExecutorMgr();
+
+  void RefreshStepIdSequenceAsync(int64 graph_key,
+                                  const StatusCallback& done) override;
+
+  int64 NextStepId(int64 graph_key) override;
+
+  void RetireStepId(int64 graph_key, int64 step_id) override;
+
+ protected:
+  CollectiveExecutor* Create(int64 step_id) override;
+
+  WorkerCacheInterface* const worker_cache_;  // Not owned.
+  const string task_name_;
+  string group_leader_;
+  friend class RpcCollectiveExecutorMgrTest;
+
+ private:
+  Status UpdateStepSequences(const GetStepSequenceResponse& resp);
+
+  // This class maintains the step_id sequencing for a single
+  // collective_graph_key.
+  struct GraphKeySequence {
+    explicit GraphKeySequence(int64 k)
+        : graph_key_(k), next_step_id_(CollectiveExecutor::kInvalidId) {}
+
+    const int64 graph_key_;
+    int64 next_step_id_;
+  };
+
+  mutex sequence_mu_;
+  gtl::FlatMap<int64, GraphKeySequence*> sequence_table_
+      GUARDED_BY(sequence_mu_);
+};
+
+}  // namespace tensorflow
+#endif  // TENSORFLOW_CORE_DISTRIBUTED_RUNTIME_RPC_COLLECTIVE_EXECUTOR_MGR_H_
diff --git a/tensorflow/core/distributed_runtime/rpc_collective_executor_mgr_test.cc b/tensorflow/core/distributed_runtime/rpc_collective_executor_mgr_test.cc
new file mode 100644
index 0000000000..37b83d82be
--- /dev/null
+++ b/tensorflow/core/distributed_runtime/rpc_collective_executor_mgr_test.cc
@@ -0,0 +1,124 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <stdlib.h>
+#include <string>
+#include <vector>
+
+#include "tensorflow/core/common_runtime/device.h"
+#include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/distributed_runtime/collective_param_resolver_distributed.h"
+#include "tensorflow/core/distributed_runtime/device_resolver_distributed.h"
+#include "tensorflow/core/distributed_runtime/rpc_collective_executor_mgr.h"
+#include "tensorflow/core/framework/collective.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/public/session_options.h"
+
+namespace tensorflow {
+#define NUM_DEVS 3
+
+class RpcCollectiveExecutorMgrTest : public ::testing::Test {
+ protected:
+  RpcCollectiveExecutorMgrTest() {
+    string task_name = "/job:localhost/replica:0/task:0";
+    SessionOptions options;
+    options.config.mutable_experimental()->set_collective_group_leader(
+        task_name);
+    WorkerCacheInterface* worker_cache = nullptr;
+    auto* device_count = options.config.mutable_device_count();
+    device_count->insert({"CPU", NUM_DEVS});
+    TF_CHECK_OK(DeviceFactory::AddDevices(options, task_name, &devices_));
+    device_mgr_.reset(new DeviceMgr(devices_));
+    std::unique_ptr<DeviceResolverDistributed> dr(new DeviceResolverDistributed(
+        device_mgr_.get(), worker_cache, task_name));
+    std::unique_ptr<CollectiveParamResolverDistributed> cpr(
+        new CollectiveParamResolverDistributed(options.config,
+                                               device_mgr_.get(), dr.get(),
+                                               worker_cache, task_name));
+    // This CME is the group leader.
+    cme_.reset(new RpcCollectiveExecutorMgr(options.config, device_mgr_.get(),
+                                            std::move(dr), std::move(cpr),
+                                            worker_cache, task_name));
+  }
+
+  std::unique_ptr<RpcCollectiveExecutorMgr> cme_;
+  std::vector<Device*> devices_;
+  std::unique_ptr<DeviceMgr> device_mgr_;
+};
+
+TEST_F(RpcCollectiveExecutorMgrTest, FindOrCreate) {
+  CollectiveExecutor::Handle* h =
+      new CollectiveExecutor::Handle(cme_->FindOrCreate(1), true);
+  EXPECT_TRUE(h->get());
+  CollectiveExecutor::Handle* h2 =
+      new CollectiveExecutor::Handle(cme_->FindOrCreate(1), true);
+  EXPECT_EQ(h->get(), h2->get());
+  CollectiveExecutor* ce = h->get();
+  delete h;
+  delete h2;
+  CollectiveExecutor* ce2 = cme_->FindOrCreate(1);
+  EXPECT_EQ(ce, ce2);
+  ce2->Unref();
+  cme_->Cleanup(1);
+}
+
+TEST_F(RpcCollectiveExecutorMgrTest, NextStepId) {
+  int64 x = cme_->NextStepId(7);
+  EXPECT_EQ(x, CollectiveExecutor::kInvalidId);
+  // Calling Refresh should generate a valid id.
+  {
+    Notification note;
+    Status status;
+    cme_->RefreshStepIdSequenceAsync(7, [&status, &note](const Status& s) {
+      status = s;
+      note.Notify();
+    });
+    // Wait for the callback so `status` is not read before it is written.
+    note.WaitForNotification();
+    EXPECT_TRUE(status.ok());
+  }
+  x = cme_->NextStepId(7);
+  EXPECT_NE(x, CollectiveExecutor::kInvalidId);
+  // Should keep returning same number.
+  EXPECT_EQ(x, cme_->NextStepId(7));
+  EXPECT_EQ(x, cme_->NextStepId(7));
+  // Retire on a different graph_key should have no effect.
+  cme_->RetireStepId(6, x);
+  EXPECT_EQ(x, cme_->NextStepId(7));
+  // Retire on same graph_key should advance.
+  cme_->RetireStepId(7, x);
+  int64 y = cme_->NextStepId(7);
+  EXPECT_EQ((x + 1) & (((1uLL << 56) - 1) | (1uLL << 56)), y);
+  // Calling refresh should jump to a different point in the random space.
+  {
+    Notification note;
+    Status status;
+    cme_->RefreshStepIdSequenceAsync(7, [&status, &note](const Status& s) {
+      status = s;
+      note.Notify();
+    });
+
+    note.WaitForNotification();
+    EXPECT_TRUE(status.ok());
+  }
+  int64 z = cme_->NextStepId(7);
+  // z should not be equal to or a successor of y.
+  EXPECT_NE(y, z);
+  EXPECT_GT(llabs(y - z), 3);
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/worker.cc b/tensorflow/core/distributed_runtime/worker.cc
index 4e6500fbc6..1ea19c48f0 100644
--- a/tensorflow/core/distributed_runtime/worker.cc
+++ b/tensorflow/core/distributed_runtime/worker.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/collective_executor_mgr.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
 #include "tensorflow/core/common_runtime/process_util.h"
+#include "tensorflow/core/common_runtime/scoped_allocator_mgr.h"
 #include "tensorflow/core/common_runtime/step_stats_collector.h"
 #include "tensorflow/core/distributed_runtime/rendezvous_mgr_interface.h"
 #include "tensorflow/core/distributed_runtime/tensor_coding.h"
@@ -72,7 +73,8 @@ void Worker::RegisterGraphAsync(const RegisterGraphRequest* request,
     s = session->graph_mgr->Register(
         request->session_handle(), request->graph_def(),
         request->graph_options(), request->debug_options(),
-        session->cluster_flr.get(), response->mutable_graph_handle());
+        request->collective_graph_key(), session->cluster_flr.get(),
+        response->mutable_graph_handle());
   }
   done(s);
 }
@@ -315,6 +317,12 @@ void Worker::CleanupGraphAsync(const CleanupGraphRequest* request,
   if (env_->collective_executor_mgr) {
     env_->collective_executor_mgr->Cleanup(step_id);
   }
+  for (Device* d : env_->local_devices) {
+    ScopedAllocatorMgr* sam = d->GetScopedAllocatorMgr();
+    if (sam) {
+      sam->Cleanup(step_id);
+    }
+  }
   done(Status::OK());
 }
 
-- 
GitLab


From 898f9664488f0036ccc02bbb34379cb613f07a55 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Fri, 8 Jun 2018 18:17:55 -0700
Subject: [PATCH 526/610] Make LocallyConnected1D layer respect the data_format
 parameter.

PiperOrigin-RevId: 199879521
---
 tensorflow/python/keras/backend.py           | 19 ++++-
 tensorflow/python/keras/backend_test.py      | 47 +++++++++++
 tensorflow/python/keras/layers/local.py      | 44 +++++++++--
 tensorflow/python/keras/layers/local_test.py | 83 +++++++++++---------
 4 files changed, 144 insertions(+), 49 deletions(-)

diff --git a/tensorflow/python/keras/backend.py b/tensorflow/python/keras/backend.py
index af3d1fa33d..2a4a1c861c 100644
--- a/tensorflow/python/keras/backend.py
+++ b/tensorflow/python/keras/backend.py
@@ -4242,7 +4242,11 @@ def local_conv1d(inputs, kernel, kernel_size, strides, data_format=None):
   """Apply 1D conv with un-shared weights.
 
   Arguments:
-      inputs: 3D tensor with shape: (batch_size, steps, input_dim)
+      inputs: 3D tensor with shape:
+              (batch_size, steps, input_dim)
+              if data_format is "channels_last" or
+              (batch_size, input_dim, steps)
+              if data_format is "channels_first".
       kernel: the unshared weight for convolution,
               with shape (output_length, feature_dim, filters)
       kernel_size: a tuple of a single integer,
@@ -4272,11 +4276,20 @@ def local_conv1d(inputs, kernel, kernel_size, strides, data_format=None):
   xs = []
   for i in range(output_length):
     slice_length = slice(i * stride, i * stride + kernel_size[0])
-    xs.append(reshape(inputs[:, slice_length, :], (1, -1, feature_dim)))
+    if data_format == 'channels_first':
+      xs.append(reshape(inputs[:, :, slice_length], (1, -1, feature_dim)))
+    else:
+      xs.append(reshape(inputs[:, slice_length, :], (1, -1, feature_dim)))
+
   x_aggregate = concatenate(xs, axis=0)
   # Shape: `(output_length, batch_size, filters)`.
   output = batch_dot(x_aggregate, kernel)
-  return permute_dimensions(output, (1, 0, 2))
+
+  if data_format == 'channels_first':
+    output = permute_dimensions(output, (1, 2, 0))
+  else:
+    output = permute_dimensions(output, (1, 0, 2))
+  return output
 
 
 def local_conv2d(inputs,
diff --git a/tensorflow/python/keras/backend_test.py b/tensorflow/python/keras/backend_test.py
index 58df263a4f..53e30e0e4a 100644
--- a/tensorflow/python/keras/backend_test.py
+++ b/tensorflow/python/keras/backend_test.py
@@ -810,6 +810,53 @@ class BackendNNOpsTest(test.TestCase):
                              padding='same', data_format='channels_last')
     self.assertEqual(y.get_shape().as_list(), [10, 5, 5])
 
+  def test_local_conv1d_channels_dim(self):
+    input_length = 5
+    input_dim = 3
+    batch_size = 2
+
+    inputs = np.random.normal(0, 1, (batch_size, input_dim, input_length))
+    inputs_cf = keras.backend.variable(inputs)
+
+    filters = 4
+    for kernel_size in [(1,), (2,), (3,)]:
+      for strides in [(1,), (2,), (3,)]:
+        output_length = (input_length - kernel_size[0]
+                         + strides[0]) // strides[0]
+
+        kernel_shape = (output_length, kernel_size[0] * input_dim, filters)
+        kernel = np.random.normal(0, 1, (output_length,
+                                         input_dim,
+                                         kernel_size[0],
+                                         filters))
+        kernel_cf = np.reshape(kernel, kernel_shape)
+        kernel_cf = keras.backend.variable(kernel_cf)
+
+        conv_cf = keras.backend.local_conv1d(inputs_cf,
+                                             kernel_cf,
+                                             kernel_size,
+                                             strides,
+                                             'channels_first')
+
+        inputs_cl = np.transpose(inputs, (0, 2, 1))
+        inputs_cl = keras.backend.variable(inputs_cl)
+
+        kernel_cl = np.reshape(np.transpose(kernel, (0, 2, 1, 3)),
+                               kernel_shape)
+        kernel_cl = keras.backend.variable(kernel_cl)
+
+        conv_cl = keras.backend.local_conv1d(inputs_cl,
+                                             kernel_cl,
+                                             kernel_size,
+                                             strides,
+                                             'channels_last')
+        with self.test_session():
+          conv_cf = keras.backend.eval(conv_cf)
+          conv_cl = keras.backend.eval(conv_cl)
+
+        self.assertAllCloseAccordingToType(conv_cf,
+                                           np.transpose(conv_cl, (0, 2, 1)))
+
   def test_conv2d(self):
     val = np.random.random((10, 4, 10, 10))
     x = keras.backend.variable(val)
diff --git a/tensorflow/python/keras/layers/local.py b/tensorflow/python/keras/layers/local.py
index 46c18b763e..f222ea3083 100644
--- a/tensorflow/python/keras/layers/local.py
+++ b/tensorflow/python/keras/layers/local.py
@@ -62,6 +62,16 @@ class LocallyConnected1D(Layer):
           any `dilation_rate` value != 1.
       padding: Currently only supports `"valid"` (case-insensitive).
           `"same"` may be supported in the future.
+      data_format: A string,
+          one of `channels_last` (default) or `channels_first`.
+          The ordering of the dimensions in the inputs.
+          `channels_last` corresponds to inputs with shape
+          `(batch, length, channels)` while `channels_first`
+          corresponds to inputs with shape
+          `(batch, channels, length)`.
+          It defaults to the `image_data_format` value found in your
+          Keras config file at `~/.keras/keras.json`.
+          If you never set it, then it will be "channels_last".
       activation: Activation function to use.
           If you don't specify anything, no activation is applied
           (ie. "linear" activation: `a(x) = x`).
@@ -122,12 +132,16 @@ class LocallyConnected1D(Layer):
 
   @tf_utils.shape_type_conversion
   def build(self, input_shape):
-    input_dim = input_shape[2]
+    if self.data_format == 'channels_first':
+      input_dim, input_length = input_shape[1], input_shape[2]
+    else:
+      input_dim, input_length = input_shape[2], input_shape[1]
+
     if input_dim is None:
       raise ValueError('Axis 2 of input should be fully-defined. '
                        'Found shape:', input_shape)
     output_length = conv_utils.conv_output_length(
-        input_shape[1], self.kernel_size[0], self.padding, self.strides[0])
+        input_length, self.kernel_size[0], self.padding, self.strides[0])
     self.kernel_shape = (output_length, self.kernel_size[0] * input_dim,
                          self.filters)
     self.kernel = self.add_weight(
@@ -145,19 +159,33 @@ class LocallyConnected1D(Layer):
           constraint=self.bias_constraint)
     else:
       self.bias = None
-    self.input_spec = InputSpec(ndim=3, axes={2: input_dim})
+
+    if self.data_format == 'channels_first':
+      self.input_spec = InputSpec(ndim=3, axes={1: input_dim})
+    else:
+      self.input_spec = InputSpec(ndim=3, axes={-1: input_dim})
     self.built = True
 
   @tf_utils.shape_type_conversion
   def compute_output_shape(self, input_shape):
-    length = conv_utils.conv_output_length(input_shape[1], self.kernel_size[0],
+    if self.data_format == 'channels_first':
+      input_length = input_shape[2]
+    else:
+      input_length = input_shape[1]
+
+    length = conv_utils.conv_output_length(input_length, self.kernel_size[0],
                                            self.padding, self.strides[0])
-    return (input_shape[0], length, self.filters)
+
+    if self.data_format == 'channels_first':
+      return (input_shape[0], self.filters, length)
+    elif self.data_format == 'channels_last':
+      return (input_shape[0], length, self.filters)
 
   def call(self, inputs):
-    output = K.local_conv1d(inputs, self.kernel, self.kernel_size, self.strides)
+    output = K.local_conv1d(inputs, self.kernel, self.kernel_size,
+                            self.strides, self.data_format)
     if self.use_bias:
-      output = K.bias_add(output, self.bias)
+      output = K.bias_add(output, self.bias, data_format=self.data_format)
     if self.activation is not None:
       output = self.activation(output)
     return output
@@ -172,6 +200,8 @@ class LocallyConnected1D(Layer):
             self.strides,
         'padding':
             self.padding,
+        'data_format':
+            self.data_format,
         'activation':
             activations.serialize(self.activation),
         'use_bias':
diff --git a/tensorflow/python/keras/layers/local_test.py b/tensorflow/python/keras/layers/local_test.py
index 90ae1719e1..9123d449af 100644
--- a/tensorflow/python/keras/layers/local_test.py
+++ b/tensorflow/python/keras/layers/local_test.py
@@ -40,16 +40,17 @@ class LocallyConnectedLayersTest(test.TestCase):
       for strides in [1]:
         if padding == 'same' and strides != 1:
           continue
-
-        testing_utils.layer_test(
-            keras.layers.LocallyConnected1D,
-            kwargs={
-                'filters': filters,
-                'kernel_size': filter_length,
-                'padding': padding,
-                'strides': strides
-            },
-            input_shape=(num_samples, num_steps, input_dim))
+        for data_format in ['channels_first', 'channels_last']:
+          testing_utils.layer_test(
+              keras.layers.LocallyConnected1D,
+              kwargs={
+                  'filters': filters,
+                  'kernel_size': filter_length,
+                  'padding': padding,
+                  'strides': strides,
+                  'data_format': data_format
+              },
+              input_shape=(num_samples, num_steps, input_dim))
 
   def test_locallyconnected_1d_regularization(self):
     num_samples = 2
@@ -57,35 +58,39 @@ class LocallyConnectedLayersTest(test.TestCase):
     input_dim = 5
     filter_length = 3
     filters = 4
-    kwargs = {
-        'filters': filters,
-        'kernel_size': filter_length,
-        'kernel_regularizer': 'l2',
-        'bias_regularizer': 'l2',
-        'activity_regularizer': 'l2',
-    }
-
-    with self.test_session():
-      layer = keras.layers.LocallyConnected1D(**kwargs)
-      layer.build((num_samples, num_steps, input_dim))
-      self.assertEqual(len(layer.losses), 2)
-      layer(
-          keras.backend.variable(np.ones((num_samples, num_steps, input_dim))))
-      self.assertEqual(len(layer.losses), 3)
-
-    k_constraint = keras.constraints.max_norm(0.01)
-    b_constraint = keras.constraints.max_norm(0.01)
-    kwargs = {
-        'filters': filters,
-        'kernel_size': filter_length,
-        'kernel_constraint': k_constraint,
-        'bias_constraint': b_constraint,
-    }
-    with self.test_session():
-      layer = keras.layers.LocallyConnected1D(**kwargs)
-      layer.build((num_samples, num_steps, input_dim))
-      self.assertEqual(layer.kernel.constraint, k_constraint)
-      self.assertEqual(layer.bias.constraint, b_constraint)
+    for data_format in ['channels_first', 'channels_last']:
+      kwargs = {
+          'filters': filters,
+          'kernel_size': filter_length,
+          'kernel_regularizer': 'l2',
+          'bias_regularizer': 'l2',
+          'activity_regularizer': 'l2',
+          'data_format': data_format
+      }
+
+      with self.test_session():
+        layer = keras.layers.LocallyConnected1D(**kwargs)
+        layer.build((num_samples, num_steps, input_dim))
+        self.assertEqual(len(layer.losses), 2)
+        layer(
+            keras.backend.variable(
+                np.ones((num_samples, num_steps, input_dim))))
+        self.assertEqual(len(layer.losses), 3)
+
+      k_constraint = keras.constraints.max_norm(0.01)
+      b_constraint = keras.constraints.max_norm(0.01)
+      kwargs = {
+          'filters': filters,
+          'kernel_size': filter_length,
+          'kernel_constraint': k_constraint,
+          'bias_constraint': b_constraint,
+          'data_format': data_format
+      }
+      with self.test_session():
+        layer = keras.layers.LocallyConnected1D(**kwargs)
+        layer.build((num_samples, num_steps, input_dim))
+        self.assertEqual(layer.kernel.constraint, k_constraint)
+        self.assertEqual(layer.bias.constraint, b_constraint)
 
   @tf_test_util.run_in_graph_and_eager_modes()
   def test_locallyconnected_2d(self):
-- 
GitLab


From 14e7f42ae0ff488b83f00cccaf350aec1032af5c Mon Sep 17 00:00:00 2001
From: Sami Kama <samikama@users.noreply.github.com>
Date: Sat, 9 Jun 2018 09:16:02 -0700
Subject: [PATCH 527/610] * Use VLOG(1) instead of std::cout in remapper.cc
 (#19870)

* Remove the scoped_allocator_ops_op_lib dependency from ScopedAllocator. This
  dependency is already satisfied through core and causes a fatal error for
  libraries that use meta_optimizer due to double registration.
---
 tensorflow/core/grappler/optimizers/BUILD       | 1 -
 tensorflow/core/grappler/optimizers/remapper.cc | 4 ++--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/tensorflow/core/grappler/optimizers/BUILD b/tensorflow/core/grappler/optimizers/BUILD
index 20887bc218..2073c2968b 100644
--- a/tensorflow/core/grappler/optimizers/BUILD
+++ b/tensorflow/core/grappler/optimizers/BUILD
@@ -780,7 +780,6 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
-        "//tensorflow/core:scoped_allocator_ops_op_lib",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:op_types",
         "//tensorflow/core/grappler:utils",
diff --git a/tensorflow/core/grappler/optimizers/remapper.cc b/tensorflow/core/grappler/optimizers/remapper.cc
index efd870b118..622fb134a1 100644
--- a/tensorflow/core/grappler/optimizers/remapper.cc
+++ b/tensorflow/core/grappler/optimizers/remapper.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/optimizers/constant_folding.h"
 #include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/platform/logging.h"
 
 namespace tensorflow {
 namespace grappler {
@@ -200,8 +201,7 @@ Status Remapper::Optimize(Cluster* /*cluster*/, const GrapplerItem& item,
         }
       }
       if (optimizable) {
-        std::cout << "Optimizing fused batch norm node " << node.DebugString()
-                  << std::endl;
+        VLOG(1)<< "Optimizing fused batch norm node " << node.DebugString();
         AddBatchNormNodes(optimized_graph, node);
         continue;
       }
-- 
GitLab


From 119db15241e29587e0b6ab3912bff5ff63d123eb Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Sat, 9 Jun 2018 10:39:16 -0700
Subject: [PATCH 528/610] Add a registration mechanism for experimental
 executor implementations.

Also add an option to the FunctionLibraryRuntime's `InstantiateOptions` that
enables users to select a particular executor implementation when instantiating
a function.

PiperOrigin-RevId: 199920648
---
 tensorflow/core/BUILD                         |  2 +
 tensorflow/core/common_runtime/executor.cc    | 27 ++++++
 .../core/common_runtime/executor_factory.cc   | 85 +++++++++++++++++++
 .../core/common_runtime/executor_factory.h    | 51 +++++++++++
 .../core/common_runtime/executor_test.cc      |  4 +-
 tensorflow/core/common_runtime/function.cc    | 16 ++--
 .../core/common_runtime/function_test.cc      | 72 +++++++++++++++-
 .../kernel_benchmark_testlib.cc               | 18 ++--
 .../common_runtime/kernel_benchmark_testlib.h |  4 +-
 tensorflow/core/framework/function.cc         |  4 +
 tensorflow/core/framework/function.h          |  6 ++
 11 files changed, 267 insertions(+), 22 deletions(-)
 create mode 100644 tensorflow/core/common_runtime/executor_factory.cc
 create mode 100644 tensorflow/core/common_runtime/executor_factory.h

diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 5ff65f4f72..f17f39099a 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -2633,6 +2633,7 @@ CORE_CPU_LIB_HEADERS = CORE_CPU_BASE_HDRS + [
     "common_runtime/dma_helper.h",
     "common_runtime/eigen_thread_pool.h",
     "common_runtime/executor.h",
+    "common_runtime/executor_factory.h",
     "common_runtime/graph_optimizer.h",
     "common_runtime/local_device.h",
     "common_runtime/lower_if_op.h",
@@ -2682,6 +2683,7 @@ tf_cuda_library(
         "common_runtime/device_resolver_local.cc",
         "common_runtime/device_set.cc",
         "common_runtime/executor.cc",
+        "common_runtime/executor_factory.cc",
         "common_runtime/function.cc",
         "common_runtime/graph_optimizer.cc",
         "common_runtime/graph_runner.cc",
diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc
index 585d777e81..f7f2cdc14f 100644
--- a/tensorflow/core/common_runtime/executor.cc
+++ b/tensorflow/core/common_runtime/executor.cc
@@ -23,6 +23,7 @@ limitations under the License.
 #include <vector>
 
 #include "tensorflow/core/common_runtime/costmodel_manager.h"
+#include "tensorflow/core/common_runtime/executor_factory.h"
 #include "tensorflow/core/common_runtime/pending_counts.h"
 #include "tensorflow/core/common_runtime/step_stats_collector.h"
 #include "tensorflow/core/framework/allocation_description.pb.h"
@@ -2764,4 +2765,30 @@ Status CreateNonCachedKernel(Device* device, FunctionLibraryRuntime* flib,
 
 void DeleteNonCachedKernel(OpKernel* kernel) { delete kernel; }
 
+namespace {
+
+class DefaultExecutorRegistrar {
+ public:
+  DefaultExecutorRegistrar() {
+    Factory* factory = new Factory;
+    ExecutorFactory::Register("", factory);
+    ExecutorFactory::Register("DEFAULT", factory);
+  }
+
+ private:
+  class Factory : public ExecutorFactory {
+    Status NewExecutor(const LocalExecutorParams& params,
+                       std::unique_ptr<const Graph> graph,
+                       std::unique_ptr<Executor>* out_executor) override {
+      Executor* ret = nullptr;
+      TF_RETURN_IF_ERROR(NewLocalExecutor(params, std::move(graph), &ret));
+      out_executor->reset(ret);
+      return Status::OK();
+    }
+  };
+};
+static DefaultExecutorRegistrar registrar;
+
+}  // namespace
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/executor_factory.cc b/tensorflow/core/common_runtime/executor_factory.cc
new file mode 100644
index 0000000000..ee7c7c3a73
--- /dev/null
+++ b/tensorflow/core/common_runtime/executor_factory.cc
@@ -0,0 +1,85 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/common_runtime/executor_factory.h"
+
+#include <unordered_map>
+
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+namespace {
+
+static mutex executor_factory_lock(LINKER_INITIALIZED);
+
+typedef std::unordered_map<string, ExecutorFactory*> ExecutorFactories;
+ExecutorFactories* executor_factories() {
+  static ExecutorFactories* factories = new ExecutorFactories;
+  return factories;
+}
+
+}  // namespace
+
+void ExecutorFactory::Register(const string& executor_type,
+                               ExecutorFactory* factory) {
+  mutex_lock l(executor_factory_lock);
+  if (!executor_factories()->insert({executor_type, factory}).second) {
+    LOG(FATAL) << "Two executor factories are being registered "
+               << "under" << executor_type;
+  }
+}
+
+namespace {
+const string RegisteredFactoriesErrorMessageLocked()
+    SHARED_LOCKS_REQUIRED(executor_factory_lock) {
+  std::vector<string> factory_types;
+  for (const auto& executor_factory : *executor_factories()) {
+    factory_types.push_back(executor_factory.first);
+  }
+  return strings::StrCat("Registered factories are {",
+                         str_util::Join(factory_types, ", "), "}.");
+}
+}  // namespace
+
+Status ExecutorFactory::GetFactory(const string& executor_type,
+                                   ExecutorFactory** out_factory) {
+  tf_shared_lock l(executor_factory_lock);
+
+  auto iter = executor_factories()->find(executor_type);
+  if (iter == executor_factories()->end()) {
+    return errors::NotFound(
+        "No executor factory registered for the given executor type: ",
+        executor_type, " ", RegisteredFactoriesErrorMessageLocked());
+  }
+
+  *out_factory = iter->second;
+  return Status::OK();
+}
+
+Status NewExecutor(const string& executor_type,
+                   const LocalExecutorParams& params,
+                   std::unique_ptr<const Graph> graph,
+                   std::unique_ptr<Executor>* out_executor) {
+  ExecutorFactory* factory = nullptr;
+  TF_RETURN_IF_ERROR(ExecutorFactory::GetFactory(executor_type, &factory));
+  return factory->NewExecutor(params, std::move(graph), out_executor);
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/executor_factory.h b/tensorflow/core/common_runtime/executor_factory.h
new file mode 100644
index 0000000000..f81bb080eb
--- /dev/null
+++ b/tensorflow/core/common_runtime/executor_factory.h
@@ -0,0 +1,51 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_EXECUTOR_FACTORY_H_
+#define TENSORFLOW_CORE_COMMON_RUNTIME_EXECUTOR_FACTORY_H_
+
+#include <string>
+
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace tensorflow {
+
+class Executor;
+class Graph;
+struct LocalExecutorParams;
+
+class ExecutorFactory {
+ public:
+  virtual Status NewExecutor(const LocalExecutorParams& params,
+                             std::unique_ptr<const Graph> graph,
+                             std::unique_ptr<Executor>* out_executor) = 0;
+  virtual ~ExecutorFactory() {}
+
+  static void Register(const string& executor_type, ExecutorFactory* factory);
+  static Status GetFactory(const string& executor_type,
+                           ExecutorFactory** out_factory);
+};
+
+Status NewExecutor(const string& executor_type,
+                   const LocalExecutorParams& params,
+                   std::unique_ptr<const Graph> graph,
+                   std::unique_ptr<Executor>* out_executor);
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_EXECUTOR_FACTORY_H_
diff --git a/tensorflow/core/common_runtime/executor_test.cc b/tensorflow/core/common_runtime/executor_test.cc
index b24969613c..7697103faf 100644
--- a/tensorflow/core/common_runtime/executor_test.cc
+++ b/tensorflow/core/common_runtime/executor_test.cc
@@ -464,8 +464,8 @@ BENCHMARK(BM_executor)->ArgPair(1024, 1024);
 static void BM_FeedInputFetchOutput(int iters) {
   Graph* g = new Graph(OpRegistry::Global());
   // z = x + y: x and y are provided as benchmark inputs.  z is the
-  // output of the benchmark.  Conceptually, the caller is "a", the
-  // benchmark is "b".
+  // output of the benchmark.  Conceptually, the caller is ALICE, the
+  // benchmark is BOB.
   Node* x = test::graph::Recv(g, "x", "float", ALICE, 1, BOB);
   Node* y = test::graph::Recv(g, "y", "float", ALICE, 1, BOB);
   Node* sum = test::graph::Add(g, x, y);
diff --git a/tensorflow/core/common_runtime/function.cc b/tensorflow/core/common_runtime/function.cc
index 5d9be70522..68d37ddbcd 100644
--- a/tensorflow/core/common_runtime/function.cc
+++ b/tensorflow/core/common_runtime/function.cc
@@ -20,6 +20,7 @@ limitations under the License.
 
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/executor.h"
+#include "tensorflow/core/common_runtime/executor_factory.h"
 #include "tensorflow/core/common_runtime/graph_optimizer.h"
 #include "tensorflow/core/common_runtime/memory_types.h"
 #include "tensorflow/core/common_runtime/rendezvous_mgr.h"
@@ -215,6 +216,7 @@ class FunctionLibraryRuntimeImpl : public FunctionLibraryRuntime {
     const FunctionLibraryDefinition* overlay_lib = nullptr;  // Not owned.
     FunctionBody* func_graph = nullptr;
     Executor* exec = nullptr;
+    string executor_type;
 
     ~Item() {
       delete this->func_graph;
@@ -549,6 +551,7 @@ Status FunctionLibraryRuntimeImpl::Instantiate(
       item->func_graph = fbody;
       item->overlay_lib = options.overlay_lib;
       item->instantiation_counter = 1;
+      item->executor_type = options.executor_type;
       items_.emplace(next_handle_, std::unique_ptr<Item>(item));
       next_handle_++;
     }
@@ -623,10 +626,12 @@ void PruneFunctionBody(Graph* g) {
 Status FunctionLibraryRuntimeImpl::CreateItem(Handle handle, Item** item) {
   const FunctionBody* fbody;
   const FunctionLibraryDefinition* lib_def;
+  string executor_type;
   {
     mutex_lock l(mu_);
     fbody = (*item)->func_graph;
     lib_def = (*item)->overlay_lib;
+    executor_type = (*item)->executor_type;
   }
   if (!lib_def) {
     lib_def = base_lib_def_;
@@ -656,17 +661,14 @@ Status FunctionLibraryRuntimeImpl::CreateItem(Handle handle, Item** item) {
     DeleteNonCachedKernel(kernel);
   };
   Graph* graph = g.get();
-  Executor* exec;
-  TF_RETURN_IF_ERROR(NewLocalExecutor(params, std::move(g), &exec));
-
+  std::unique_ptr<Executor> exec;
+  TF_RETURN_IF_ERROR(NewExecutor(executor_type, params, std::move(g), &exec));
   {
     // Guard item since it is already inserted in items_.
     mutex_lock l(mu_);
-    if ((*item)->exec) {
-      delete exec;
-    } else {
+    if ((*item)->exec == nullptr) {
       (*item)->graph = graph;
-      (*item)->exec = exec;
+      (*item)->exec = exec.release();
     }
   }
   return Status::OK();
diff --git a/tensorflow/core/common_runtime/function_test.cc b/tensorflow/core/common_runtime/function_test.cc
index f4f5198396..1e837e9a7e 100644
--- a/tensorflow/core/common_runtime/function_test.cc
+++ b/tensorflow/core/common_runtime/function_test.cc
@@ -25,6 +25,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/common_runtime/executor.h"
+#include "tensorflow/core/common_runtime/executor_factory.h"
 #include "tensorflow/core/common_runtime/function_testlib.h"
 #include "tensorflow/core/common_runtime/rendezvous_mgr.h"
 #include "tensorflow/core/common_runtime/step_stats_collector.h"
@@ -531,6 +532,69 @@ TEST_F(FunctionLibraryRuntimeTest, StateHandle) {
   }
 }
 
+namespace {
+class DummyExecutorRegistrar {
+ public:
+  DummyExecutorRegistrar() {
+    ExecutorFactory::Register("DUMMY", new Factory());
+  }
+
+ private:
+  class Factory : public ExecutorFactory {
+    Status NewExecutor(const LocalExecutorParams& params,
+                       std::unique_ptr<const Graph> graph,
+                       std::unique_ptr<Executor>* out_executor) override {
+      return errors::Internal("This is a dummy.");
+    }
+  };
+};
+static DummyExecutorRegistrar registrar;
+}  // namespace
+
+TEST_F(FunctionLibraryRuntimeTest, ExecutorFactory) {
+  Init({test::function::XTimesTwo()});
+
+  auto x = test::AsTensor<float>({1, 2, 3, 4});
+  Tensor y;
+
+  // Test that the default executor works.
+  {
+    FunctionLibraryRuntime::InstantiateOptions options;
+    options.executor_type = "";
+    TF_CHECK_OK(InstantiateAndRun(flr0_, "XTimesTwo", {{"T", DT_FLOAT}},
+                                  options, {x}, {&y}));
+    test::ExpectTensorEqual<float>(y, test::AsTensor<float>({2, 4, 6, 8}));
+  }
+
+  // Test the explicit registration for the default executor.
+  {
+    FunctionLibraryRuntime::InstantiateOptions options;
+    options.executor_type = "DEFAULT";
+    TF_CHECK_OK(InstantiateAndRun(flr0_, "XTimesTwo", {{"T", DT_FLOAT}},
+                                  options, {x}, {&y}));
+    test::ExpectTensorEqual<float>(y, test::AsTensor<float>({2, 4, 6, 8}));
+  }
+
+  // Test that a non-default executor factory can be invoked.
+  {
+    FunctionLibraryRuntime::InstantiateOptions options;
+    options.executor_type = "DUMMY";
+    HasError(InstantiateAndRun(flr0_, "XTimesTwo", {{"T", DT_FLOAT}}, options,
+                               {x}, {&y}),
+             "Internal: This is a dummy.");
+  }
+
+  // Test that non-existent exector types trigger an error.
+  {
+    FunctionLibraryRuntime::InstantiateOptions options;
+    options.executor_type = "UNKNOWN_EXECUTOR";
+    HasError(InstantiateAndRun(flr0_, "XTimesTwo", {{"T", DT_FLOAT}}, options,
+                               {x}, {&y}),
+             "Not found: No executor factory registered for the given executor "
+             "type: UNKNOWN_EXECUTOR");
+  }
+}
+
 TEST_F(FunctionLibraryRuntimeTest, ExpandInlineFunctions) {
   Init({test::function::XTimesTwo(), test::function::XTimesFour(),
         test::function::XTimes16()});
@@ -803,7 +867,7 @@ TEST_F(FunctionLibraryRuntimeTest, OptimizeGraph) {
     Scope s = Scope::NewRootScope();
     auto x = ops::_Arg(s.WithOpName("x"), DT_FLOAT, 0);
     auto x4_x2_scale = ops::Const<float>(
-        s.WithOpName("x4/x2/scale/_12__cf__6")
+        s.WithOpName("x4/x2/scale/_12__cf__10")
             .WithDevice("/job:localhost/replica:0/task:0/device:CPU:0"),
         2.0f);
     auto x4_x2_y = ops::Mul(s.WithOpName("x4/x2/y"), x, x4_x2_scale);
@@ -913,7 +977,7 @@ TEST_F(FunctionLibraryRuntimeTest, Error_NotFound) {
            "Not found: Function Foo is not defined.");
 }
 
-TEST_F(FunctionLibraryRuntimeTest, Error_InstantiaionError) {
+TEST_F(FunctionLibraryRuntimeTest, Error_InstantiationError) {
   auto bad_x_times_two = FDH::Define(
       // Name
       "XTimesTwo",
@@ -1009,13 +1073,13 @@ TEST_F(FunctionLibraryRuntimeTest, Gradient_XTimesTwo) {
     auto x = ops::_Arg(s.WithOpName("x"), DT_FLOAT, 0);
     auto func0 = ops::_Arg(s.WithOpName("Func/_0"), DT_FLOAT, 1);
     auto scale = ops::Const(
-        s.WithOpName("scale/_6__cf__11")
+        s.WithOpName("scale/_6__cf__15")
             .WithDevice("/job:localhost/replica:0/task:0/device:CPU:0"),
         2.0f);
     auto func1_gx = ops::Mul(s.WithOpName("Func/_1/gx"), func0, scale);
     auto func1_sx = ops::Shape(s.WithOpName("Func/_1/sx"), x);
     auto const0 = ops::Const(
-        s.WithOpName("Func/_1/sy/_5__cf__10")
+        s.WithOpName("Func/_1/sy/_5__cf__14")
             .WithDevice("/job:localhost/replica:0/task:0/device:CPU:0"),
         0, {0});
     auto func1_rx = ops::internal::BroadcastGradientArgs(
diff --git a/tensorflow/core/common_runtime/kernel_benchmark_testlib.cc b/tensorflow/core/common_runtime/kernel_benchmark_testlib.cc
index 7de1b80e2d..1f585a8c24 100644
--- a/tensorflow/core/common_runtime/kernel_benchmark_testlib.cc
+++ b/tensorflow/core/common_runtime/kernel_benchmark_testlib.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <vector>
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/common_runtime/executor_factory.h"
 #include "tensorflow/core/common_runtime/local_device.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_kernel.h"
@@ -43,7 +44,7 @@ namespace test {
 // TODO(hongm): Convert `g` and `init` to using std::unique_ptr.
 Benchmark::Benchmark(const string& device, Graph* g,
                      const SessionOptions* options, Graph* init,
-                     Rendezvous* rendez) {
+                     Rendezvous* rendez, const char* executor_type) {
   SessionOptions default_options;
   if (!options) {
     options = &default_options;
@@ -86,23 +87,26 @@ Benchmark::Benchmark(const string& device, Graph* g,
   };
 
   if (init) {
-    Executor* init_exec;
-    TF_CHECK_OK(
-        NewLocalExecutor(params, std::unique_ptr<Graph>(init), &init_exec));
+    std::unique_ptr<Executor> init_exec;
+    TF_CHECK_OK(NewExecutor(executor_type, params, std::unique_ptr<Graph>(init),
+                            &init_exec));
     Executor::Args args;
     args.rendezvous = rendez_;
     args.runner = runner;
     TF_CHECK_OK(init_exec->Run(args));
-    delete init_exec;
   }
 
-  TF_CHECK_OK(NewLocalExecutor(params, std::unique_ptr<Graph>(g), &exec_));
+  TF_CHECK_OK(
+      NewExecutor(executor_type, params, std::unique_ptr<Graph>(g), &exec_));
 }
 
 Benchmark::~Benchmark() {
   if (device_) {
     rendez_->Unref();
-    delete exec_;
+    // We delete `exec_` before `device_` because the `exec_` destructor may
+    // run kernel destructors that may attempt to access state borrowed from
+    // `device_`, such as the resource manager.
+    exec_.reset();
     delete device_;
     delete pool_;
   }
diff --git a/tensorflow/core/common_runtime/kernel_benchmark_testlib.h b/tensorflow/core/common_runtime/kernel_benchmark_testlib.h
index 3a7b3a5ace..995a15a299 100644
--- a/tensorflow/core/common_runtime/kernel_benchmark_testlib.h
+++ b/tensorflow/core/common_runtime/kernel_benchmark_testlib.h
@@ -39,7 +39,7 @@ class Benchmark {
   // "init", and one reference on "rendez" (if not null).
   Benchmark(const string& device, Graph* g,
             const SessionOptions* options = nullptr, Graph* init = nullptr,
-            Rendezvous* rendez = nullptr);
+            Rendezvous* rendez = nullptr, const char* executor_type = "");
   ~Benchmark();
 
   // Executes the graph for "iters" times.
@@ -57,7 +57,7 @@ class Benchmark {
   thread::ThreadPool* pool_ = nullptr;
   Device* device_ = nullptr;
   Rendezvous* rendez_ = nullptr;
-  Executor* exec_ = nullptr;
+  std::unique_ptr<Executor> exec_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(Benchmark);
 };
diff --git a/tensorflow/core/framework/function.cc b/tensorflow/core/framework/function.cc
index 647c66099c..88d9d65f5a 100644
--- a/tensorflow/core/framework/function.cc
+++ b/tensorflow/core/framework/function.cc
@@ -815,6 +815,10 @@ string Canonicalize(const string& funcname, AttrSlice attrs,
     entries.push_back(
         strings::StrCat("_state_handle", "=", options.state_handle));
   }
+  if (!options.executor_type.empty()) {
+    entries.push_back(
+        strings::StrCat("_executor_type", "=", options.executor_type));
+  }
   std::sort(entries.begin(), entries.end());
   return strings::StrCat(funcname, "[", str_util::Join(entries, ","), "]");
 }
diff --git a/tensorflow/core/framework/function.h b/tensorflow/core/framework/function.h
index 872906756a..8e607b927c 100644
--- a/tensorflow/core/framework/function.h
+++ b/tensorflow/core/framework/function.h
@@ -450,6 +450,12 @@ class FunctionLibraryRuntime {
     // state (in stateful kernels); and two functions with different
     // values for `state_handle` will have independent state.
     string state_handle;
+
+    // This interface is EXPERIMENTAL and subject to change.
+    //
+    // Instatiates the function using an executor of the given type. If empty,
+    // the default TensorFlow executor will be used.
+    string executor_type;
   };
   typedef uint64 Handle;
   virtual Status Instantiate(const string& function_name, AttrSlice attrs,
-- 
GitLab


From a4b390bffbcb01d8f57f25c007277d457f752a69 Mon Sep 17 00:00:00 2001
From: Amit Patankar <amitpatankar@google.com>
Date: Sat, 9 Jun 2018 13:24:11 -0700
Subject: [PATCH 529/610] Fixing copy_binary script. (#19865)

* Allow the minor version matched by copy_binary to have double digits.

* Fix the linting error.

* Remove one space for pylint.
---
 tensorflow/tools/ci_build/copy_binary.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tensorflow/tools/ci_build/copy_binary.py b/tensorflow/tools/ci_build/copy_binary.py
index 420d390d2b..148526492d 100755
--- a/tensorflow/tools/ci_build/copy_binary.py
+++ b/tensorflow/tools/ci_build/copy_binary.py
@@ -32,7 +32,8 @@ import shutil
 import tempfile
 import zipfile
 
-TF_NIGHTLY_REGEX = r"(.+)tf_nightly(|_gpu)-(\d\.\d\.\d.dev[\d]{0,8})-(.+)\.whl"
+TF_NIGHTLY_REGEX = (r"(.+)tf_nightly(|_gpu)-(\d\.[\d]{1,2}"
+                    "\.\d.dev[\d]{0,8})-(.+)\.whl")
 BINARY_STRING_TEMPLATE = "%s-%s-%s.whl"
 
 
-- 
GitLab


From 3a1d8bd815b5216bc9515801e4d59cf3ebd1126d Mon Sep 17 00:00:00 2001
From: James Qin <jamesqin@google.com>
Date: Sun, 10 Jun 2018 22:15:46 -0700
Subject: [PATCH 530/610] Improve the loss_scale_optimizer docstring.

PiperOrigin-RevId: 200001771
---
 .../python/loss_scale_optimizer.py            | 42 +++++++++++--------
 1 file changed, 24 insertions(+), 18 deletions(-)

diff --git a/tensorflow/contrib/mixed_precision/python/loss_scale_optimizer.py b/tensorflow/contrib/mixed_precision/python/loss_scale_optimizer.py
index e4e5ccc334..ef34f7bf7b 100644
--- a/tensorflow/contrib/mixed_precision/python/loss_scale_optimizer.py
+++ b/tensorflow/contrib/mixed_precision/python/loss_scale_optimizer.py
@@ -26,26 +26,32 @@ from tensorflow.python.training import optimizer
 
 
 class LossScaleOptimizer(optimizer.Optimizer):
+  # TODO(jamesqin): move mixed precision training explanation to __init__
+  # docstring.
   """An optimizer that applies loss scaling in backprop.
 
-  This class is useful for mixed precision training on GPUs (or other potential
-  accelerators), which is an approach to improve compute throughput without loss
-  of model quality.
-
-  The commmon configuration of mixed precision models is the following:
-  * variables are kept in high precision (e.g. float32).
-  * computations are done in lower precision (e.g. float16). variables are
-    casted to lower precision before they're used.
-  * (in training), final gradients are casted back to variable precision and get
-    applied.
-
-  Because computations happen in lower precision, gradients in the backprop pass
-  might underflow in the smaller dynamic range, causing a model to converge at a
-  suboptimal level. This optimizer multiplies the loss by a factor before
-  backprop starts to prevent underflow. Before gradients are applied, they are
-  casted to higher precision and down-scaled by the same factor, so
-  mathematically the variable updates are no different from regular
-  same-precision training.
+  This class is useful for "mixed precision training" on GPUs (or other
+  potential accelerators), an approach to improve compute throughput without
+  compromising model quality.
+
+  The canonical way to perform mixed precision training is the following:
+  * Model variables are kept in high precision (e.g. float32).
+  * Computations are done in lower precision (e.g. float16), which enjoys
+    performance speedup by virtue of hardware support. Variables are casted to
+    lower precision before they're used.
+  * Final gradients are casted back to high precision dtype, then used to update
+    variables.
+
+  The side-effect of performing computation in lower precision, is that it comes
+  with smaller numerical range. During backproping, small gradients might
+  underflow in the reduced numerical range, causing a model to converge at
+  suboptimal level.
+
+  To prevent underflow, this optimizer multiplies the loss by a factor before
+  backprop starts. Consequently, the gradients are linearly scaled up by the
+  same factor, thus not falling into the underflow zone. After that, to perserve
+  the correctness of backprop, the gradients are down-scaled by the same factor,
+  casted to the (higher) variable precision, then applied on the variables.
 
   See [Nvidia's manual on mixed precision training](
   https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html)
-- 
GitLab


From 73c479056aca52e83f84d7df4132c420f1f3feed Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 11 Jun 2018 03:36:33 -0700
Subject: [PATCH 531/610] [TuplePointsToAnalysis] Be less conservative on loop
 fusion nodes when reusing buffer.

Previously, we said we could not reuse the operand buffer for a loop
fusion node if any of the fusion's inputs was a broadcast or reshape.
That is too conservative, since in theory we can still reuse the
operand's buffer if all the users of that particular operand are
elementwise. This CL implements that: it allows sharing the operand and
output buffer for partially elementwise fusions.

The same change has recently been applied to DataFlowAnalysis, but this
pass is still used in many places as well.

PiperOrigin-RevId: 200028414
---
 .../xla/service/tuple_points_to_analysis.cc   | 27 +++++++++++--------
 .../service/tuple_points_to_analysis_test.cc  | 25 +++++++++++++++++
 2 files changed, 41 insertions(+), 11 deletions(-)

diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc b/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc
index bb634e6573..eb6d1ada6b 100644
--- a/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc
+++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc
@@ -723,15 +723,16 @@ bool TuplePointsToAnalysis::CanShareOperandBufferWithUser(
     return false;
   }
   if (user->opcode() == HloOpcode::kFusion) {
-    if (user->fusion_kind() == HloInstruction::FusionKind::kLoop &&
-        user->fused_expression_root()->opcode() ==
-            HloOpcode::kDynamicUpdateSlice) {
-      // Loop fusion with kDynamicUpdateSlice fused root.
-      //
-      // Returns true iff there is exactly one use of 'operand' at shape index
-      // 'operand_index', and this singleton use is the fused root at operand
-      // index 0.
-      return HasUniqueFusedUseOfOperandAt(operand, operand_index, user, 0);
+    if (user->fusion_kind() == HloInstruction::FusionKind::kLoop) {
+      if (user->fused_expression_root()->opcode() ==
+          HloOpcode::kDynamicUpdateSlice) {
+        // Loop fusion with kDynamicUpdateSlice fused root.
+        //
+        // Returns true iff there is exactly one use of 'operand' at shape index
+        // 'operand_index', and this singleton use is the fused root at operand
+        // index 0.
+        return HasUniqueFusedUseOfOperandAt(operand, operand_index, user, 0);
+      }
     } else if (user->fusion_kind() == HloInstruction::FusionKind::kOutput &&
                user->fused_expression_root()->opcode() == HloOpcode::kAdd) {
       // Output fusion with kAdd fused root.
@@ -789,8 +790,12 @@ bool TuplePointsToAnalysis::CanShareOperandBufferWithUser(
     return param_uses.size() == 1 && param_uses[0].first == callee_root &&
            callee_root->IsElementwiseOnOperand(param_uses[0].second);
   }
-  // Check if 'user' is element-wise.
-  return user->IsElementwise();
+  // Loop fusions that contain transposing copies won't reach here as they have
+  // different layouts, which fails the check in the beginning of this function.
+  //
+  // Multi-output fusion will fail the check here as tuples are not considered
+  // an elementwise operation.
+  return user->IsElementwiseOnOperand(user->operand_index(operand));
 }
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc b/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc
index f558316b05..5734f28407 100644
--- a/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc
@@ -1148,5 +1148,30 @@ TEST_F(CanShareOperandBufferWithUserTest, CallToComputationWithFusionRoot) {
                                                                  call, {}));
 }
 
+TEST_F(CanShareOperandBufferWithUserTest, LoopFusionWithElementwiseOperand) {
+  Shape full_shape = ShapeUtil::MakeShape(F32, {16, 32});
+  Shape broadcast_shape = ShapeUtil::MakeShape(F32, {16});
+
+  auto builder = HloComputation::Builder(TestName() + "_fusion");
+  auto param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, full_shape, "full"));
+  auto param1 = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, broadcast_shape, "small"));
+  auto broadcast = builder.AddInstruction(
+      HloInstruction::CreateBroadcast(full_shape, param1, {0}));
+  auto add = builder.AddInstruction(HloInstruction::CreateBinary(
+      full_shape, HloOpcode::kAdd, param0, broadcast));
+
+  BuildModule(builder.Build());
+  auto fusion = computation_->CreateFusionInstruction(
+      {add, broadcast}, HloInstruction::FusionKind::kLoop);
+  RunAnalysis();
+
+  EXPECT_TRUE(points_to_analysis_->CanShareOperandBufferWithUser(param0, {},
+                                                                 fusion, {}));
+  EXPECT_FALSE(points_to_analysis_->CanShareOperandBufferWithUser(param1, {},
+                                                                  fusion, {}));
+}
+
 }  // namespace
 }  // namespace xla
-- 
GitLab


From 1b84c3446a0030ea1a8d386c559d90b8f78cf5df Mon Sep 17 00:00:00 2001
From: Dan Moldovan <mdan@google.com>
Date: Mon, 11 Jun 2018 07:07:58 -0700
Subject: [PATCH 532/610] Enable overloading of the slice read and write
 operations.

PiperOrigin-RevId: 200046308
---
 tensorflow/contrib/autograph/converters/BUILD | 12 +++
 .../contrib/autograph/converters/slices.py    | 83 +++++++++++++++++++
 .../autograph/converters/slices_test.py       | 59 +++++++++++++
 .../contrib/autograph/impl/conversion.py      |  5 ++
 4 files changed, 159 insertions(+)
 create mode 100644 tensorflow/contrib/autograph/converters/slices.py
 create mode 100644 tensorflow/contrib/autograph/converters/slices_test.py

diff --git a/tensorflow/contrib/autograph/converters/BUILD b/tensorflow/contrib/autograph/converters/BUILD
index 8f9bffa55e..284ad84be5 100644
--- a/tensorflow/contrib/autograph/converters/BUILD
+++ b/tensorflow/contrib/autograph/converters/BUILD
@@ -31,6 +31,7 @@ py_library(
         "name_scopes.py",
         "side_effect_guards.py",
         "single_return.py",
+        "slices.py",
     ],
     srcs_version = "PY2AND3",
     visibility = ["//tensorflow:__subpackages__"],
@@ -208,3 +209,14 @@ py_test(
         "//tensorflow/python:client_testlib",
     ],
 )
+
+py_test(
+    name = "slices_test",
+    srcs = ["slices_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":test_lib",
+        "//tensorflow/contrib/autograph/pyct",
+        "//tensorflow/python:client_testlib",
+    ],
+)
diff --git a/tensorflow/contrib/autograph/converters/slices.py b/tensorflow/contrib/autograph/converters/slices.py
new file mode 100644
index 0000000000..85aeda9c41
--- /dev/null
+++ b/tensorflow/contrib/autograph/converters/slices.py
@@ -0,0 +1,83 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Converter for slice operations."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import gast
+
+from tensorflow.contrib.autograph.pyct import anno
+from tensorflow.contrib.autograph.pyct import templates
+from tensorflow.contrib.autograph.pyct import transformer
+
+
+class SliceTransformer(transformer.Base):
+  """Converts slicing operations to their TF counterpart.
+
+  Currently, relying on the default slice operator that Tensor uses is
+  insufficient, because TensorArray and tensor lists use dedicated index read
+  and write functions.
+  """
+
+  def _process_single_assignment(self, target, value):
+    if not isinstance(target, gast.Subscript):
+      return None
+
+    template = """
+      target = ag__.set_item(target, key, item)
+    """
+    return templates.replace(
+        template, target=target.value, key=target.slice, item=value)
+
+  def visit_Assign(self, node):
+    node = self.generic_visit(node)
+    # TODO(mdan): Support unpackings and multiple assignments.
+    if len(node.targets) != 1:
+      raise NotImplementedError('multiple assignment')
+    replacement = self._process_single_assignment(node.targets[0], node.value)
+    if replacement is not None:
+      return replacement
+    return node
+
+  def visit_Subscript(self, node):
+    node = self.generic_visit(node)
+    if not isinstance(node.slice, gast.Index):
+      # TODO(mdan): It might make more sense to wave them through.
+      raise NotImplementedError('non-index slice')
+
+    if not isinstance(node.ctx, gast.Load):
+      # Index writes are handled at a higher level, one at which the rvalue is
+      # also available.
+      return node
+
+    dtype = anno.getanno(
+        node.value,
+        'element_type',
+        default=templates.replace_as_expression('None'))
+
+    template = """
+      ag__.get_item(
+          target,
+          key,
+          opts=ag__.GetItemOpts(element_dtype=dtype))
+    """
+    return templates.replace_as_expression(
+        template, target=node.value, key=node.slice, dtype=dtype)
+
+
+def transform(node, context):
+  return SliceTransformer(context).visit(node)
diff --git a/tensorflow/contrib/autograph/converters/slices_test.py b/tensorflow/contrib/autograph/converters/slices_test.py
new file mode 100644
index 0000000000..6c2d7e1ea1
--- /dev/null
+++ b/tensorflow/contrib/autograph/converters/slices_test.py
@@ -0,0 +1,59 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for slices module."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.autograph import utils
+from tensorflow.contrib.autograph.converters import converter_test_base
+from tensorflow.contrib.autograph.converters import slices
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import list_ops
+from tensorflow.python.platform import test
+
+
+class SliceTest(converter_test_base.TestCase):
+
+  def test_index_access(self):
+
+    def test_fn(l):
+      utils.set_element_type(l, dtypes.int32)
+      return l[1]
+
+    node = self.parse_and_analyze(
+        test_fn,
+        {
+            'utils': utils,
+            'dtypes': dtypes
+        },
+        include_type_analysis=True,
+    )
+    node = slices.transform(node, self.ctx)
+
+    with self.compiled(node, dtypes.int32) as result:
+      result.utils = utils
+      result.dtypes = dtypes
+      with self.test_session() as sess:
+        tl = list_ops.tensor_list_from_tensor(
+            [1, 2], element_shape=constant_op.constant([], dtype=dtypes.int32))
+        y = result.test_fn(tl)
+        self.assertEqual(2, sess.run(y))
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/autograph/impl/conversion.py b/tensorflow/contrib/autograph/impl/conversion.py
index 55a30dc127..7802bbbe27 100644
--- a/tensorflow/contrib/autograph/impl/conversion.py
+++ b/tensorflow/contrib/autograph/impl/conversion.py
@@ -38,6 +38,7 @@ from tensorflow.contrib.autograph.converters import logical_expressions
 from tensorflow.contrib.autograph.converters import name_scopes
 from tensorflow.contrib.autograph.converters import side_effect_guards
 from tensorflow.contrib.autograph.converters import single_return
+from tensorflow.contrib.autograph.converters import slices
 from tensorflow.contrib.autograph.impl import config
 from tensorflow.contrib.autograph.impl import naming
 from tensorflow.contrib.autograph.pyct import ast_util
@@ -371,6 +372,8 @@ def node_to_graph(node, ctx, nocompile_decorators):
   # TODO(mdan): Clean this up.
   # Some intermediate analyses are not required, and some comments got orphaned.
 
+  # TODO(mdan): We may assume all converters require analysis to be re-done.
+
   # Past this point, line numbers are no longer accurate so we ignore the
   # source.
   # TODO(mdan): Is it feasible to reconstruct intermediate source code?
@@ -393,6 +396,8 @@ def node_to_graph(node, ctx, nocompile_decorators):
 
   node = _static_analysis_pass(node, ctx)
   node = lists.transform(node, ctx)
+  node = _static_analysis_pass(node, ctx)
+  node = slices.transform(node, ctx)
   node = builtin_functions.transform(node, ctx)
 
   node = _static_analysis_pass(node, ctx)
-- 
GitLab


From 56104e275348c377f765c49dc677c0a34440d5c5 Mon Sep 17 00:00:00 2001
From: David Majnemer <majnemer@google.com>
Date: Mon, 11 Jun 2018 07:08:28 -0700
Subject: [PATCH 533/610] [XLA] Simplify lowering of kIsFinite

We used something notionally equivalent to "(x == x) && abs(x) != inf" to
implement kIsFinite. However, using an ordered comparison against infinity will
return false for NaN inputs as well, which obviates the need to explicitly test
for NaN.

PiperOrigin-RevId: 200046365
---
 tensorflow/compiler/xla/service/elemental_ir_emitter.cc | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
index 9a8bab353e..93fea7ead7 100644
--- a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
@@ -456,17 +456,15 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitFloatUnaryOp(
                                     llvm::ConstantFP::get(type, 1.0)));
     }
     case HloOpcode::kIsFinite: {
-      // (x == x) && abs(x) != inf
+      // abs(x) o!= inf, this works because the comparison returns false if
+      // either operand is NaN.
       auto type = operand_value->getType();
-      auto equal_self =
-          ir_builder_->CreateFCmpOEQ(operand_value, operand_value);
       auto abs_value = llvm_ir::EmitCallToIntrinsic(
           llvm::Intrinsic::fabs, {operand_value}, {type}, ir_builder_);
       auto infinity = llvm::ConstantFP::getInfinity(type);
       auto not_infinite = ir_builder_->CreateFCmpONE(abs_value, infinity);
-      auto result_i1 = ir_builder_->CreateAnd(equal_self, not_infinite);
       return ir_builder_->CreateZExt(
-          result_i1, llvm_ir::PrimitiveTypeToIrType(PRED, module_));
+          not_infinite, llvm_ir::PrimitiveTypeToIrType(PRED, module_));
     }
     case HloOpcode::kNegate:
       return ir_builder_->CreateFNeg(operand_value);
-- 
GitLab


From 01c27242128a55aa4aaf47c674642dd950beda1d Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 11 Jun 2018 09:16:31 -0700
Subject: [PATCH 534/610] Add interim runtime utility function for use during
 refactoring out of Dims.

PiperOrigin-RevId: 200061346
---
 tensorflow/contrib/lite/kernels/internal/types.h | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/tensorflow/contrib/lite/kernels/internal/types.h b/tensorflow/contrib/lite/kernels/internal/types.h
index 0c7fb7a76a..1086c5b092 100644
--- a/tensorflow/contrib/lite/kernels/internal/types.h
+++ b/tensorflow/contrib/lite/kernels/internal/types.h
@@ -142,6 +142,22 @@ class RuntimeShape {
   };
 };
 
+// Converts inference-style shape to legacy tflite::Dims<4>.
+inline tflite::Dims<4> ToRuntimeDims(const tflite::RuntimeShape& array_shape) {
+  tflite::Dims<4> result;
+  const int dimensions_count = array_shape.DimensionsCount();
+  TFLITE_CHECK_LE(dimensions_count, 4);
+  int cum_prod = 1;
+  for (int i = 0; i < 4; i++) {
+    const int new_dim =
+        (i < dimensions_count) ? array_shape.Dims(dimensions_count - 1 - i) : 1;
+    result.sizes[i] = new_dim;
+    result.strides[i] = cum_prod;
+    cum_prod *= new_dim;
+  }
+  return result;
+}
+
 // Gets next index to iterate through a multidimensional array.
 inline bool NextIndex(const int num_dims, const int* dims, int* current) {
   TFLITE_DCHECK_GT(num_dims, 0);
-- 
GitLab


From a30d1f063f15b6c013eb4ef847da116538851a8c Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 11 Jun 2018 09:57:41 -0700
Subject: [PATCH 535/610] Remove Bayesflow/Distribution/Bijector docs.

These docs are out of date.

PiperOrigin-RevId: 200066984
---
 .../bayesflow/python/ops/monte_carlo.py       |  5 +-
 tensorflow/contrib/distributions/__init__.py  |  2 -
 .../python/contrib.bayesflow.monte_carlo.md   | 50 -----------
 .../python/contrib.distributions.bijectors.md | 32 -------
 .../python/contrib.distributions.md           | 83 -------------------
 5 files changed, 1 insertion(+), 171 deletions(-)
 delete mode 100644 tensorflow/docs_src/api_guides/python/contrib.bayesflow.monte_carlo.md
 delete mode 100644 tensorflow/docs_src/api_guides/python/contrib.distributions.bijectors.md
 delete mode 100644 tensorflow/docs_src/api_guides/python/contrib.distributions.md

diff --git a/tensorflow/contrib/bayesflow/python/ops/monte_carlo.py b/tensorflow/contrib/bayesflow/python/ops/monte_carlo.py
index 5770bcdd70..68fa415eea 100644
--- a/tensorflow/contrib/bayesflow/python/ops/monte_carlo.py
+++ b/tensorflow/contrib/bayesflow/python/ops/monte_carlo.py
@@ -12,10 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Monte Carlo integration and helpers.
-
-See the @{$python/contrib.bayesflow.monte_carlo} guide.
-"""
+"""Monte Carlo integration and helpers."""
 
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/contrib/distributions/__init__.py b/tensorflow/contrib/distributions/__init__.py
index 802538ba97..5cec93c4df 100644
--- a/tensorflow/contrib/distributions/__init__.py
+++ b/tensorflow/contrib/distributions/__init__.py
@@ -13,8 +13,6 @@
 # limitations under the License.
 # ==============================================================================
 """Classes representing statistical distributions and ops for working with them.
-
-See the @{$python/contrib.distributions} guide.
 """
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/docs_src/api_guides/python/contrib.bayesflow.monte_carlo.md b/tensorflow/docs_src/api_guides/python/contrib.bayesflow.monte_carlo.md
deleted file mode 100644
index 74fe4a323a..0000000000
--- a/tensorflow/docs_src/api_guides/python/contrib.bayesflow.monte_carlo.md
+++ /dev/null
@@ -1,50 +0,0 @@
-# BayesFlow Monte Carlo (contrib)
-[TOC]
-
-Monte Carlo integration and helpers.
-
-## Background
-
-Monte Carlo integration refers to the practice of estimating an expectation with
-a sample mean.  For example, given random variable Z in \\(R^k\\) with density `p`,
-the expectation of function `f` can be approximated like:
-
-$$E_p[f(Z)] = \int f(z) p(z) dz$$
-$$          ~ S_n
-          := n^{-1} \sum_{i=1}^n f(z_i),  z_i\ iid\ samples\ from\ p.$$
-
-If \\(E_p[|f(Z)|] < infinity\\), then \\(S_n\\) --> \\(E_p[f(Z)]\\) by the strong law of large
-numbers.  If \\(E_p[f(Z)^2] < infinity\\), then \\(S_n\\) is asymptotically normal with
-variance \\(Var[f(Z)] / n\\).
-
-Practitioners of Bayesian statistics often find themselves wanting to estimate
-\\(E_p[f(Z)]\\) when the distribution `p` is known only up to a constant.  For
-example, the joint distribution `p(z, x)` may be known, but the evidence
-\\(p(x) = \int p(z, x) dz\\) may be intractable.  In that case, a parameterized
-distribution family \\(q_\lambda(z)\\) may be chosen, and the optimal \\(\lambda\\) is the
-one minimizing the KL divergence between \\(q_\lambda(z)\\) and
-\\(p(z | x)\\).  We only know `p(z, x)`, but that is sufficient to find \\(\lambda\\).
-
-
-## Log-space evaluation and subtracting the maximum
-
-Care must be taken when the random variable lives in a high dimensional space.
-For example, the naive importance sample estimate \\(E_q[f(Z) p(Z) / q(Z)]\\)
-involves the ratio of two terms \\(p(Z) / q(Z)\\), each of which must have tails
-dropping off faster than \\(O(|z|^{-(k + 1)})\\) in order to have finite integral.
-This ratio would often be zero or infinity up to numerical precision.
-
-For that reason, we write
-
-$$Log E_q[ f(Z) p(Z) / q(Z) ]$$
-$$   = Log E_q[ \exp\{Log[f(Z)] + Log[p(Z)] - Log[q(Z)] - C\} ] + C,$$  where
-$$C := Max[ Log[f(Z)] + Log[p(Z)] - Log[q(Z)] ].$$
-
-The maximum value of the exponentiated term will be 0.0, and the expectation
-can be evaluated in a stable manner.
-
-## Ops
-
-*   @{tf.contrib.bayesflow.monte_carlo.expectation}
-*   @{tf.contrib.bayesflow.monte_carlo.expectation_importance_sampler}
-*   @{tf.contrib.bayesflow.monte_carlo.expectation_importance_sampler_logspace}
diff --git a/tensorflow/docs_src/api_guides/python/contrib.distributions.bijectors.md b/tensorflow/docs_src/api_guides/python/contrib.distributions.bijectors.md
deleted file mode 100644
index e169897f31..0000000000
--- a/tensorflow/docs_src/api_guides/python/contrib.distributions.bijectors.md
+++ /dev/null
@@ -1,32 +0,0 @@
-# Random variable transformations (contrib)
-[TOC]
-
-Bijector Ops.
-
-An API for invertible, differentiable transformations of random variables.
-
-## Background
-
-Differentiable, bijective transformations of continuous random variables alter
-the calculations made in the cumulative/probability distribution functions and
-sample function.  This module provides a standard interface for making these
-manipulations.
-
-For more details and examples, see the `Bijector` docstring.
-
-To apply a `Bijector`, use `distributions.TransformedDistribution`.
-
-## Bijectors
-
-*   @{tf.contrib.distributions.bijectors.Affine}
-*   @{tf.contrib.distributions.bijectors.AffineLinearOperator}
-*   @{tf.contrib.distributions.bijectors.Bijector}
-*   @{tf.contrib.distributions.bijectors.Chain}
-*   @{tf.contrib.distributions.bijectors.CholeskyOuterProduct}
-*   @{tf.contrib.distributions.bijectors.Exp}
-*   @{tf.contrib.distributions.bijectors.Identity}
-*   @{tf.contrib.distributions.bijectors.Inline}
-*   @{tf.contrib.distributions.bijectors.Invert}
-*   @{tf.contrib.distributions.bijectors.PowerTransform}
-*   @{tf.contrib.distributions.bijectors.SoftmaxCentered}
-*   @{tf.contrib.distributions.bijectors.Softplus}
diff --git a/tensorflow/docs_src/api_guides/python/contrib.distributions.md b/tensorflow/docs_src/api_guides/python/contrib.distributions.md
deleted file mode 100644
index 533d7dac13..0000000000
--- a/tensorflow/docs_src/api_guides/python/contrib.distributions.md
+++ /dev/null
@@ -1,83 +0,0 @@
-# Statistical Distributions (contrib)
-[TOC]
-
-Classes representing statistical distributions and ops for working with them.
-
-## Classes for statistical distributions
-
-Classes that represent batches of statistical distributions.  Each class is
-initialized with parameters that define the distributions.
-
-## Base classes
-
-*   @{tf.contrib.distributions.ReparameterizationType}
-*   @{tf.contrib.distributions.Distribution}
-
-## Univariate (scalar) distributions
-
-*   @{tf.contrib.distributions.Binomial}
-*   @{tf.contrib.distributions.Bernoulli}
-*   @{tf.contrib.distributions.Beta}
-*   @{tf.contrib.distributions.Categorical}
-*   @{tf.contrib.distributions.Chi2}
-*   @{tf.contrib.distributions.Chi2WithAbsDf}
-*   @{tf.contrib.distributions.Exponential}
-*   @{tf.contrib.distributions.Gamma}
-*   @{tf.contrib.distributions.InverseGamma}
-*   @{tf.contrib.distributions.Laplace}
-*   @{tf.contrib.distributions.LaplaceWithSoftplusScale}
-*   @{tf.contrib.distributions.Normal}
-*   @{tf.contrib.distributions.NormalWithSoftplusScale}
-*   @{tf.contrib.distributions.Poisson}
-*   @{tf.contrib.distributions.StudentT}
-*   @{tf.contrib.distributions.StudentTWithAbsDfSoftplusScale}
-*   @{tf.contrib.distributions.Uniform}
-
-## Multivariate distributions
-
-### Multivariate normal
-
-*   @{tf.contrib.distributions.MultivariateNormalDiag}
-*   @{tf.contrib.distributions.MultivariateNormalTriL}
-*   @{tf.contrib.distributions.MultivariateNormalDiagPlusLowRank}
-*   @{tf.contrib.distributions.MultivariateNormalDiagWithSoftplusScale}
-
-### Other multivariate distributions
-
-*   @{tf.contrib.distributions.Dirichlet}
-*   @{tf.contrib.distributions.DirichletMultinomial}
-*   @{tf.contrib.distributions.Multinomial}
-*   @{tf.contrib.distributions.WishartCholesky}
-*   @{tf.contrib.distributions.WishartFull}
-
-### Multivariate Utilities
-
-*   @{tf.contrib.distributions.matrix_diag_transform}
-
-## Transformed distributions
-
-*   @{tf.contrib.distributions.TransformedDistribution}
-*   @{tf.contrib.distributions.QuantizedDistribution}
-
-## Mixture Models
-
-*   @{tf.contrib.distributions.Mixture}
-
-## Posterior inference with conjugate priors
-
-Functions that transform conjugate prior/likelihood pairs to distributions
-representing the posterior or posterior predictive.
-
-## Normal likelihood with conjugate prior
-
-*   @{tf.contrib.distributions.normal_conjugates_known_scale_posterior}
-*   @{tf.contrib.distributions.normal_conjugates_known_scale_predictive}
-
-## Kullback-Leibler Divergence
-
-*   @{tf.contrib.distributions.kl_divergence}
-*   @{tf.contrib.distributions.RegisterKL}
-
-## Utilities
-
-*   @{tf.contrib.distributions.softplus_inverse}
-- 
GitLab


From 59259fd74a7cdf766b54e1de00abae88438d1978 Mon Sep 17 00:00:00 2001
From: Dan Moldovan <mdan@google.com>
Date: Mon, 11 Jun 2018 10:12:35 -0700
Subject: [PATCH 536/610] Introducing a directives module, to contain marker
 functions such as set_element_type, set_loop_options and others. To replace
 their counterparts in utils.

PiperOrigin-RevId: 200069544
---
 tensorflow/contrib/autograph/__init__.py      |  6 +-
 tensorflow/contrib/autograph/impl/BUILD       |  1 +
 .../contrib/autograph/impl/directives.py      | 68 +++++++++++++++++++
 3 files changed, 74 insertions(+), 1 deletion(-)
 create mode 100644 tensorflow/contrib/autograph/impl/directives.py

diff --git a/tensorflow/contrib/autograph/__init__.py b/tensorflow/contrib/autograph/__init__.py
index 310eb34a70..637e49c082 100644
--- a/tensorflow/contrib/autograph/__init__.py
+++ b/tensorflow/contrib/autograph/__init__.py
@@ -29,6 +29,8 @@ from tensorflow.contrib.autograph.impl.api import do_not_convert
 from tensorflow.contrib.autograph.impl.api import RunMode
 from tensorflow.contrib.autograph.impl.api import to_code
 from tensorflow.contrib.autograph.impl.api import to_graph
+from tensorflow.contrib.autograph.impl.directives import set_element_type
+from tensorflow.contrib.autograph.impl.directives import set_loop_options
 from tensorflow.contrib.autograph.impl.special_functions import stack
 from tensorflow.contrib.autograph.pyct.transformer import AutographParseError
 from tensorflow.python.util.all_util import remove_undocumented
@@ -41,7 +43,9 @@ _allowed_symbols = [
     'do_not_convert',
     'to_code',
     'to_graph',
-    # Special functions
+    # Special functions and directives
+    'set_element_type',
+    'set_loop_options',
     'stack',
     # Exceptions
     'AutographParseError',
diff --git a/tensorflow/contrib/autograph/impl/BUILD b/tensorflow/contrib/autograph/impl/BUILD
index 91ae0b9b82..02f16ae187 100644
--- a/tensorflow/contrib/autograph/impl/BUILD
+++ b/tensorflow/contrib/autograph/impl/BUILD
@@ -20,6 +20,7 @@ py_library(
         "api.py",
         "config.py",
         "conversion.py",
+        "directives.py",
         "naming.py",
         "special_functions.py",
     ],
diff --git a/tensorflow/contrib/autograph/impl/directives.py b/tensorflow/contrib/autograph/impl/directives.py
new file mode 100644
index 0000000000..aabe5d9939
--- /dev/null
+++ b/tensorflow/contrib/autograph/impl/directives.py
@@ -0,0 +1,68 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Directives are special no-op functions that serve as compilation markers.
+
+They provide static information like type hints, compilation and TensorFlow
+overrides.
+
+These serve as annotations in the compiled code, allowing the user some control
+over the compilation process. They have no functional role at runtime.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+
+UNSPECIFIED = object()
+
+
+def set_element_type(entity, dtype, shape=UNSPECIFIED):
+  """Indicates that the entity is expected to hold items of given type/shape.
+
+  The staged TensorFlow ops will reflect and assert this data type. Ignored
+  otherwise.
+
+  Args:
+    entity: The entity to annotate.
+    dtype: TensorFlow dtype value to assert for entity.
+    shape: Optional shape to assert for entity.
+  """
+  del entity
+  del dtype
+  del shape
+
+
+def set_loop_options(
+    parallel_iterations=UNSPECIFIED,
+    back_prop=UNSPECIFIED,
+    swap_memory=UNSPECIFIED,
+    maximum_iterations=UNSPECIFIED):
+  """Specifies additional arguments to be passed to the enclosing while_loop.
+
+  The parameters apply to and only to the immediately enclosing loop. It only
+  has effect if the loop is staged as a TF while_loop; otherwise the parameters
+  have no effect.
+
+  Args:
+    parallel_iterations: See tf.while_loop.
+    back_prop: See tf.while_loop.
+    swap_memory: See tf.while_loop.
+    maximum_iterations: See tf.while_loop.
+  """
+  del parallel_iterations
+  del back_prop
+  del swap_memory
+  del maximum_iterations
-- 
GitLab


From 7b8c64ef05c7fdddb3f3a32fd3189e1e4b7e8985 Mon Sep 17 00:00:00 2001
From: Yunxing Dai <yunxing@google.com>
Date: Mon, 11 Jun 2018 10:26:40 -0700
Subject: [PATCH 537/610] Remove dead code to use a map in BatchnormExpander

PiperOrigin-RevId: 200072055
---
 .../xla/service/batchnorm_expander.cc         | 97 ++-----------------
 .../compiler/xla/service/batchnorm_expander.h |  7 +-
 .../compiler/xla/service/cpu/cpu_compiler.cc  |  3 +-
 .../compiler/xla/service/gpu/gpu_compiler.cc  |  3 +-
 4 files changed, 12 insertions(+), 98 deletions(-)

diff --git a/tensorflow/compiler/xla/service/batchnorm_expander.cc b/tensorflow/compiler/xla/service/batchnorm_expander.cc
index a9f4aead59..ec13fadbc7 100644
--- a/tensorflow/compiler/xla/service/batchnorm_expander.cc
+++ b/tensorflow/compiler/xla/service/batchnorm_expander.cc
@@ -58,8 +58,7 @@ class BatchNormExpanderVisitor : public DfsHloVisitorWithDefault {
 
   // Runs the visitor on a computation.
   static bool Run(HloComputation* computation, bool rewrite_training_op,
-                  bool rewrite_inference_op, bool rewrite_grad_op,
-                  bool use_map_instructions);
+                  bool rewrite_inference_op, bool rewrite_grad_op);
 
   // Returns whether any batch norm ops were rewritten.
   const bool changed() const { return changed_; }
@@ -70,22 +69,14 @@ class BatchNormExpanderVisitor : public DfsHloVisitorWithDefault {
   explicit BatchNormExpanderVisitor(HloComputation* computation,
                                     bool rewrite_training_op,
                                     bool rewrite_inference_op,
-                                    bool rewrite_grad_op,
-                                    bool use_map_instructions)
+                                    bool rewrite_grad_op)
       : computation_(computation),
         rewrite_training_op_(rewrite_training_op),
         rewrite_inference_op_(rewrite_inference_op),
-        rewrite_grad_op_(rewrite_grad_op),
-        use_map_instructions_(use_map_instructions) {}
+        rewrite_grad_op_(rewrite_grad_op) {}
 
   HloComputation* GetOrCreateScalarAddComputation(
       PrimitiveType primitive_type) {
-    HloComputation** scalar_add_computation =
-        &scalar_add_computations_[primitive_type];
-    if (*scalar_add_computation) {
-      return *scalar_add_computation;
-    }
-
     HloComputation::Builder b("scalar_add_computation");
     Shape shape = ShapeUtil::MakeShape(primitive_type, {});
     auto scalar_lhs = b.AddInstruction(
@@ -94,44 +85,13 @@ class BatchNormExpanderVisitor : public DfsHloVisitorWithDefault {
         HloInstruction::CreateParameter(1, shape, "scalar_rhs"));
     auto scalar_op = b.AddInstruction(HloInstruction::CreateBinary(
         shape, HloOpcode::kAdd, scalar_lhs, scalar_rhs));
-    *scalar_add_computation =
-        computation_->parent()->AddEmbeddedComputation(b.Build(scalar_op));
-    return *scalar_add_computation;
-  }
-
-  // TODO(b/80534766): Remove maps after performance issues with scalar
-  // broadcasts are resolved on all backends.
-  HloComputation* GetOrCreateScalarRsqrtComputation(
-      PrimitiveType primitive_type) {
-    HloComputation** scalar_rsqrt_computation =
-        &scalar_rsqrt_computations_[primitive_type];
-    if (*scalar_rsqrt_computation) {
-      return *scalar_rsqrt_computation;
-    }
-
-    HloComputation::Builder b("scalar_add_computation");
-    Shape shape = ShapeUtil::MakeShape(primitive_type, {});
-    auto scalar_lhs = b.AddInstruction(
-        HloInstruction::CreateParameter(0, shape, "scalar_lhs"));
-    auto scalar_rhs = b.AddInstruction(HloInstruction::CreateConvert(
-        shape, b.AddInstruction(HloInstruction::CreateConstant(
-                   Literal::CreateR0<float>(-0.5f)))));
-    auto scalar_op = b.AddInstruction(HloInstruction::CreateBinary(
-        shape, HloOpcode::kPower, scalar_lhs, scalar_rhs));
-    *scalar_rsqrt_computation =
-        computation_->parent()->AddEmbeddedComputation(b.Build(scalar_op));
-    return *scalar_rsqrt_computation;
+    return computation_->parent()->AddEmbeddedComputation(b.Build(scalar_op));
   }
 
   std::unique_ptr<HloInstruction> Rsqrt(
       HloInstruction* operand,
       const std::function<HloInstruction*(std::unique_ptr<HloInstruction>)>&
           add_instruction) {
-    if (use_map_instructions_) {
-      return HloInstruction::CreateMap(
-          operand->shape(), {operand},
-          GetOrCreateScalarRsqrtComputation(operand->shape().element_type()));
-    }
     HloInstruction* exponent = add_instruction(HloInstruction::CreateBroadcast(
         operand->shape(),
         add_instruction(HloInstruction::CreateConvert(
@@ -143,40 +103,10 @@ class BatchNormExpanderVisitor : public DfsHloVisitorWithDefault {
                                         operand, exponent);
   }
 
-  HloComputation* GetOrCreateScalarMeanComputation(PrimitiveType primitive_type,
-                                                   int64 element_count) {
-    HloComputation** scalar_mean_computation =
-        &scalar_mean_computations_[std::pair<PrimitiveType, int64>(
-            primitive_type, element_count)];
-    if (*scalar_mean_computation) {
-      return *scalar_mean_computation;
-    }
-
-    HloComputation::Builder b("scalar_add_computation");
-    Shape shape = ShapeUtil::MakeShape(primitive_type, {});
-    auto scalar_lhs = b.AddInstruction(
-        HloInstruction::CreateParameter(0, shape, "scalar_lhs"));
-    auto scalar_rhs = b.AddInstruction(HloInstruction::CreateConvert(
-        shape, b.AddInstruction(
-                   HloInstruction::CreateConstant(Literal::CreateR0<float>(
-                       1.0f / static_cast<float>(element_count))))));
-    auto scalar_op = b.AddInstruction(HloInstruction::CreateBinary(
-        shape, HloOpcode::kMultiply, scalar_lhs, scalar_rhs));
-    *scalar_mean_computation =
-        computation_->parent()->AddEmbeddedComputation(b.Build(scalar_op));
-    return *scalar_mean_computation;
-  }
-
   std::unique_ptr<HloInstruction> Mean(
       int64 element_count, HloInstruction* operand,
       const std::function<HloInstruction*(std::unique_ptr<HloInstruction>)>&
           add_instruction) {
-    if (use_map_instructions_) {
-      return HloInstruction::CreateMap(
-          operand->shape(), {operand},
-          GetOrCreateScalarMeanComputation(operand->shape().element_type(),
-                                           element_count));
-    }
     HloInstruction* elem_count_recip =
         add_instruction(HloInstruction::CreateBroadcast(
             operand->shape(),
@@ -218,18 +148,9 @@ class BatchNormExpanderVisitor : public DfsHloVisitorWithDefault {
   bool rewrite_training_op_;
   bool rewrite_inference_op_;
   bool rewrite_grad_op_;
-  bool use_map_instructions_;
 
   // Whether rewrite has occurred.
   bool changed_ = false;
-
-  // Cached computations for adding two scalars.
-  tensorflow::gtl::FlatMap<PrimitiveType, HloComputation*>
-      scalar_add_computations_;
-  tensorflow::gtl::FlatMap<PrimitiveType, HloComputation*>
-      scalar_rsqrt_computations_;
-  tensorflow::gtl::FlatMap<std::pair<PrimitiveType, int64>, HloComputation*>
-      scalar_mean_computations_;
 };
 
 }  // namespace
@@ -237,14 +158,12 @@ class BatchNormExpanderVisitor : public DfsHloVisitorWithDefault {
 bool BatchNormExpanderVisitor::Run(HloComputation* computation,
                                    bool rewrite_training_op,
                                    bool rewrite_inference_op,
-                                   bool rewrite_grad_op,
-                                   bool use_map_instructions) {
+                                   bool rewrite_grad_op) {
   BatchNormExpanderVisitor visitor(
       computation,
       /*rewrite_training_op=*/rewrite_training_op,
       /*rewrite_inference_op=*/rewrite_inference_op,
-      /*rewrite_grad_op=*/rewrite_grad_op,
-      /*use_map_instructions=*/use_map_instructions);
+      /*rewrite_grad_op=*/rewrite_grad_op);
   TF_CHECK_OK(computation->Accept(&visitor));
   return visitor.changed_;
 }
@@ -668,8 +587,8 @@ StatusOr<bool> BatchNormExpander::Run(HloModule* module) {
   bool changed = false;
   for (auto* comp : module->MakeNonfusionComputations()) {
     if (BatchNormExpanderVisitor::Run(comp, rewrite_training_op_,
-                                      rewrite_inference_op_, rewrite_grad_op_,
-                                      use_map_instructions_)) {
+                                      rewrite_inference_op_,
+                                      rewrite_grad_op_)) {
       changed = true;
     }
   }
diff --git a/tensorflow/compiler/xla/service/batchnorm_expander.h b/tensorflow/compiler/xla/service/batchnorm_expander.h
index 8826636416..7ae202c583 100644
--- a/tensorflow/compiler/xla/service/batchnorm_expander.h
+++ b/tensorflow/compiler/xla/service/batchnorm_expander.h
@@ -31,12 +31,10 @@ class BatchNormExpander : public HloPassInterface {
   // When use_fusion is set, a multi-output fusion node is created.
   BatchNormExpander(bool rewrite_training_op = false,
                     bool rewrite_inference_op = false,
-                    bool rewrite_grad_op = false,
-                    bool use_map_instructions = false)
+                    bool rewrite_grad_op = false)
       : rewrite_training_op_(rewrite_training_op),
         rewrite_inference_op_(rewrite_inference_op),
-        rewrite_grad_op_(rewrite_grad_op),
-        use_map_instructions_(use_map_instructions) {}
+        rewrite_grad_op_(rewrite_grad_op) {}
   ~BatchNormExpander() = default;
   tensorflow::StringPiece name() const override { return "batchnorm_expander"; }
 
@@ -48,7 +46,6 @@ class BatchNormExpander : public HloPassInterface {
   bool rewrite_training_op_;
   bool rewrite_inference_op_;
   bool rewrite_grad_op_;
-  bool use_map_instructions_;
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
index d6b7b7d2d8..4c0e189e78 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
@@ -264,8 +264,7 @@ Status CpuCompiler::RunHloPasses(HloModule* module, bool is_aot_compile,
     pass.AddPass<BatchNormExpander>(
         /*rewrite_training_op=*/true,
         /*rewrite_inference_op=*/true,
-        /*rewrite_grad_op=*/true,
-        /*use_map_instructions=*/false);
+        /*rewrite_grad_op=*/true);
     pass.AddPass<AlgebraicSimplifier>(
         /*is_layout_sensitive=*/false,
         [](const Shape&, const Shape&) { return false; },
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
index cc33847c5c..afefc740d7 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_compiler.cc
@@ -163,8 +163,7 @@ Status OptimizeHloModule(HloModule* hlo_module, se::StreamExecutor* stream_exec,
       pass.AddPass<BatchNormExpander>(
           /*rewrite_training_op=*/true,
           /*rewrite_inference_op=*/true,
-          /*rewrite_grad_op=*/true,
-          /*use_map_instructions=*/false);
+          /*rewrite_grad_op=*/true);
 
       // Rewrite gather ops into smaller ones.
       pass.AddPass<GatherExpander>();
-- 
GitLab


From 6aeab4f5402f56e4b30540db0847256362c15e32 Mon Sep 17 00:00:00 2001
From: Akshay Modi <nareshmodi@google.com>
Date: Mon, 11 Jun 2018 10:42:15 -0700
Subject: [PATCH 538/610] Don't call back into python during insert (which will
 leave the set in a broken condition if the runtime decides to let another
 thread run).

Thank you for finding the bug. The watched_variables_ set should not really require a lock since all our functions hold the GIL (verified by looking at the generated SWIG). The reason that there was a concurrent access to the set is that the insert was calling back into python (which might release the GIL and let another thread run, which will also attempt to insert a variable and break the set).

I included the lock to be safe though, since it's non-trivial to verify without looking at the generated SWIG wrappers that the GIL is held.

PiperOrigin-RevId: 200074843
---
 tensorflow/contrib/distribute/python/BUILD |  1 -
 tensorflow/python/eager/pywrap_tfe_src.cc  | 82 ++++++++++++----------
 2 files changed, 43 insertions(+), 40 deletions(-)

diff --git a/tensorflow/contrib/distribute/python/BUILD b/tensorflow/contrib/distribute/python/BUILD
index 9624abd199..b572512bbb 100644
--- a/tensorflow/contrib/distribute/python/BUILD
+++ b/tensorflow/contrib/distribute/python/BUILD
@@ -312,7 +312,6 @@ cuda_py_test(
     tags = [
         "multi_and_single_gpu",
         "no_pip",
-        "noguitar",  # TODO(b/109653107): test is flaky.
     ],
 )
 
diff --git a/tensorflow/python/eager/pywrap_tfe_src.cc b/tensorflow/python/eager/pywrap_tfe_src.cc
index e3ce0ef9d0..52b3268903 100644
--- a/tensorflow/python/eager/pywrap_tfe_src.cc
+++ b/tensorflow/python/eager/pywrap_tfe_src.cc
@@ -873,22 +873,6 @@ static tensorflow::DataType FastTensorDtype(PyObject* tensor) {
   return static_cast<tensorflow::DataType>(id);
 }
 
-static tensorflow::int64 FastHandleId(PyObject* variable) {
-  PyObject* handle = PyObject_GetAttrString(variable, "handle");
-  if (handle == nullptr) {
-    return -1;
-  }
-  tensorflow::int64 id = FastTensorId(handle);
-  Py_DECREF(handle);
-  return id;
-}
-
-struct CompareByHandleId {
-  bool operator()(PyObject* lhs, PyObject* rhs) {
-    return FastHandleId(lhs) < FastHandleId(rhs);
-  }
-};
-
 class GradientTape
     : public tensorflow::eager::GradientTape<PyObject, PyBackwardFunction> {
  public:
@@ -897,35 +881,63 @@ class GradientTape
             persistent) {}
 
   virtual ~GradientTape() {
-    for (PyObject* v : watched_variables_) {
-      Py_DECREF(v);
+    for (const IdAndVariable& v : watched_variables_) {
+      Py_DECREF(v.variable);
     }
   }
 
   void WatchVariable(PyObject* v) {
-    auto insert_result = watched_variables_.insert(v);
-    if (insert_result.second) {
-      // Only increment the reference count if we aren't already watching this
-      // variable.
-      Py_INCREF(v);
-    }
-    PyObject* handle = PyObject_GetAttrString(v, "handle");
+    tensorflow::Safe_PyObjectPtr handle(PyObject_GetAttrString(v, "handle"));
     if (handle == nullptr) {
       return;
     }
-    tensorflow::int64 id = FastTensorId(handle);
-    Py_DECREF(handle);
+    tensorflow::int64 id = FastTensorId(handle.get());
+
     if (!PyErr_Occurred()) {
       this->Watch(id);
     }
+
+    tensorflow::mutex_lock l(watched_variables_mu_);
+    auto insert_result = watched_variables_.emplace(id, v);
+
+    if (insert_result.second) {
+      // Only increment the reference count if we aren't already watching this
+      // variable.
+      Py_INCREF(v);
+    }
   }
 
-  const std::set<PyObject*, CompareByHandleId> WatchedVariables() {
-    return watched_variables_;
+  PyObject* GetVariablesAsPyTuple() {
+    tensorflow::mutex_lock l(watched_variables_mu_);
+    PyObject* result = PyTuple_New(watched_variables_.size());
+    Py_ssize_t pos = 0;
+    for (const IdAndVariable& id_and_variable : watched_variables_) {
+      PyTuple_SET_ITEM(result, pos++, id_and_variable.variable);
+      Py_INCREF(id_and_variable.variable);
+    }
+    return result;
   }
 
  private:
-  std::set<PyObject*, CompareByHandleId> watched_variables_;
+  // We store an IdAndVariable in the map since the map needs to be locked
+  // during insert, but should not call back into python during insert to avoid
+  // deadlocking with the GIL.
+  struct IdAndVariable {
+    tensorflow::int64 id;
+    PyObject* variable;
+
+    IdAndVariable(tensorflow::int64 id, PyObject* variable)
+        : id(id), variable(variable) {}
+  };
+  struct CompareById {
+    bool operator()(const IdAndVariable& lhs, const IdAndVariable& rhs) {
+      return lhs.id < rhs.id;
+    }
+  };
+
+  tensorflow::mutex watched_variables_mu_;
+  std::set<IdAndVariable, CompareById> watched_variables_
+      GUARDED_BY(watched_variables_mu_);
 };
 
 typedef struct {
@@ -1217,15 +1229,7 @@ void TFE_Py_TapeSetWatchVariable(PyObject* variable) {
 }
 
 PyObject* TFE_Py_TapeWatchedVariables(PyObject* tape) {
-  const auto& watched_variables =
-      reinterpret_cast<TFE_Py_Tape*>(tape)->tape->WatchedVariables();
-  PyObject* result = PyTuple_New(watched_variables.size());
-  Py_ssize_t pos = 0;
-  for (PyObject* variable : watched_variables) {
-    PyTuple_SET_ITEM(result, pos++, variable);
-    Py_INCREF(variable);
-  }
-  return result;
+  return reinterpret_cast<TFE_Py_Tape*>(tape)->tape->GetVariablesAsPyTuple();
 }
 
 namespace {
-- 
GitLab


From 20a8e604e33bacb85e39c8ad0b1f8b101b230ef7 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 11 Jun 2018 10:59:30 -0700
Subject: [PATCH 539/610] CostGraphDef has been modified to keep track of the
 accuracy of the cost estimation.

PiperOrigin-RevId: 200078367
---
 tensorflow/core/framework/cost_graph.proto                  | 3 +++
 tensorflow/core/grappler/costs/analytical_cost_estimator.cc | 1 +
 2 files changed, 4 insertions(+)

diff --git a/tensorflow/core/framework/cost_graph.proto b/tensorflow/core/framework/cost_graph.proto
index 19d765cd32..cc6bc84d69 100644
--- a/tensorflow/core/framework/cost_graph.proto
+++ b/tensorflow/core/framework/cost_graph.proto
@@ -69,6 +69,9 @@ message CostGraphDef {
 
     // Ids of the control inputs for this node.
     repeated int32 control_input = 8;
+
+    // Are the costs inaccurate?
+    bool inaccurate = 17;
   }
   repeated Node node = 1;
 }
diff --git a/tensorflow/core/grappler/costs/analytical_cost_estimator.cc b/tensorflow/core/grappler/costs/analytical_cost_estimator.cc
index c8ba4dfbda..a60e3c7a9f 100644
--- a/tensorflow/core/grappler/costs/analytical_cost_estimator.cc
+++ b/tensorflow/core/grappler/costs/analytical_cost_estimator.cc
@@ -98,6 +98,7 @@ Status AnalyticalCostEstimator::PredictCosts(const GraphDef& optimized_graph,
           node_costs.compute_time.asMicroSeconds().count());
       cost_node->set_memory_time(
           node_costs.memory_time.asMicroSeconds().count());
+      cost_node->set_inaccurate(node_costs.inaccurate);
       for (const auto& output : op_context.op_info.outputs()) {
         auto output_info = cost_node->add_output_info();
         output_info->set_dtype(output.dtype());
-- 
GitLab


From 530dc71d0487cacccbe270490d460bc401040dc9 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 11 Jun 2018 11:01:33 -0700
Subject: [PATCH 540/610] Fix tsan detected error in
 core/util/exec_on_stall_test.cc

Enforce mutex around access to test variable.

PiperOrigin-RevId: 200078751
---
 tensorflow/core/util/exec_on_stall_test.cc | 23 ++++++++++++++++------
 1 file changed, 17 insertions(+), 6 deletions(-)

diff --git a/tensorflow/core/util/exec_on_stall_test.cc b/tensorflow/core/util/exec_on_stall_test.cc
index df8118d611..42e66a7e84 100644
--- a/tensorflow/core/util/exec_on_stall_test.cc
+++ b/tensorflow/core/util/exec_on_stall_test.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/core/util/exec_on_stall.h"
 
 #include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace tensorflow {
@@ -32,14 +33,24 @@ Chunk* NewChunk(int stall_seconds, std::function<void()> f) {
 }
 
 TEST(ExecuteOnStallTest, BothWays) {
-  bool a_triggered = false;
-  bool b_triggered = false;
-  Chunk* a = NewChunk(1, [&a_triggered]() { a_triggered = true; });
-  Chunk* b = NewChunk(1, [&b_triggered]() { b_triggered = true; });
+  mutex mu;
+  bool a_triggered(false);
+  bool b_triggered(false);
+  Chunk* a = NewChunk(1, [&mu, &a_triggered]() {
+    mutex_lock l(mu);
+    a_triggered = true;
+  });
+  Chunk* b = NewChunk(1, [&mu, &b_triggered]() {
+    mutex_lock l(mu);
+    b_triggered = true;
+  });
   delete a;
   Env::Default()->SleepForMicroseconds(2000000);
-  EXPECT_FALSE(a_triggered);
-  EXPECT_TRUE(b_triggered);
+  {
+    mutex_lock l(mu);
+    EXPECT_FALSE(a_triggered);
+    EXPECT_TRUE(b_triggered);
+  }
   delete b;
 }
 
-- 
GitLab


From b5e7264395f1791d682b85463285d7933efda9c2 Mon Sep 17 00:00:00 2001
From: Shashi Shekhar <shashishekhar@google.com>
Date: Mon, 11 Jun 2018 11:03:57 -0700
Subject: [PATCH 541/610] Remove a few redundant benchmark parameters.

PiperOrigin-RevId: 200079299
---
 .../contrib/lite/tools/benchmark/README.md    |  4 --
 .../tools/benchmark/benchmark_tflite_model.cc | 50 +------------------
 .../tools/benchmark/benchmark_tflite_model.h  |  4 --
 3 files changed, 1 insertion(+), 57 deletions(-)

diff --git a/tensorflow/contrib/lite/tools/benchmark/README.md b/tensorflow/contrib/lite/tools/benchmark/README.md
index 2788f76faf..c10826afff 100644
--- a/tensorflow/contrib/lite/tools/benchmark/README.md
+++ b/tensorflow/contrib/lite/tools/benchmark/README.md
@@ -46,8 +46,6 @@ adb shell /data/local/tmp/benchmark_model \
   --graph=/data/local/tmp/mobilenet_quant_v1_224.tflite \
   --input_layer="Placeholder" \
   --input_layer_shape="1,224,224,3" \
-  --input_layer_type="uint8" \
-  --output_layer="MobilenetV1/Predictions/Reshape_1" \
   --num_threads=4
 ```
 
@@ -66,8 +64,6 @@ bazel-bin/tensorflow/contrib/lite/tools/benchmark/benchmark_model \
   --graph=mobilenet_quant_v1_224.tflite \
   --input_layer="Placeholder" \
   --input_layer_shape="1,224,224,3" \
-  --input_layer_type="uint8" \
-  --output_layer="MobilenetV1/Predictions/Reshape_1" \
   --num_threads=4
 ```
 
diff --git a/tensorflow/contrib/lite/tools/benchmark/benchmark_tflite_model.cc b/tensorflow/contrib/lite/tools/benchmark/benchmark_tflite_model.cc
index 2e5b866273..5f803cec19 100644
--- a/tensorflow/contrib/lite/tools/benchmark/benchmark_tflite_model.cc
+++ b/tensorflow/contrib/lite/tools/benchmark/benchmark_tflite_model.cc
@@ -123,29 +123,11 @@ void FillRandomString(tflite::DynamicBuffer* buffer,
   }
 }
 
-TfLiteType TfLiteTypeFromString(const string& input_layer_type) {
-  if (input_layer_type == "string")
-    return kTfLiteString;
-  else if (input_layer_type == "float")
-    return kTfLiteFloat32;
-  else if (input_layer_type == "uint8")
-    return kTfLiteUInt8;
-  else if (input_layer_type == "int32")
-    return kTfLiteInt32;
-  else if (input_layer_type == "int64")
-    return kTfLiteInt64;
-  else
-    return kTfLiteNoType;
-}
-
 bool PopulateInputLayerInfo(
     const string& names_string, const string& shapes_string,
-    const string& types_string, const string& values_string,
     std::vector<BenchmarkTfLiteModel::InputLayerInfo>* info) {
   std::vector<std::string> names = Split(names_string, ',');
   std::vector<std::string> shapes = Split(shapes_string, ':');
-  std::vector<std::string> types = Split(types_string, ',');
-  std::vector<std::string> values = Split(values_string, ':');
 
   if (names.size() != shapes.size()) {
     TFLITE_LOG(ERROR) << "The number of items in"
@@ -158,17 +140,6 @@ bool PopulateInputLayerInfo(
                       << " --input_layer_shape=1,224,224,4:1,20";
     return false;
   }
-  if (names.size() != types.size()) {
-    TFLITE_LOG(ERROR) << "The number of items in"
-                      << " --input_layer_type (" << types_string << ", with "
-                      << types.size() << " items)"
-                      << " must match the number of items in"
-                      << " --input_layer (" << names_string << ", with "
-                      << names.size() << " items)."
-                      << " For example --input_layer=input1,input2"
-                      << " --input_layer_type=float,int";
-    return false;
-  }
 
   for (int i = 0; i < names.size(); ++i) {
     info->push_back(BenchmarkTfLiteModel::InputLayerInfo());
@@ -176,10 +147,6 @@ bool PopulateInputLayerInfo(
 
     input.name = names[i];
 
-    input.data_type = TfLiteTypeFromString(types[i]);
-    TFLITE_BENCHMARK_CHECK(input.data_type != kTfLiteNoType)
-        << types[i] << " was an invalid type";
-
     TFLITE_BENCHMARK_CHECK(SplitAndParse(shapes[i], ',', &input.shape))
         << "Incorrect size string specified: " << shapes[i];
     for (int dim : input.shape) {
@@ -190,12 +157,6 @@ bool PopulateInputLayerInfo(
         return false;
       }
     }
-
-    if (i < values.size()) {
-      TFLITE_BENCHMARK_CHECK(
-          SplitAndParse(values[i], ',', &input.initialization_values))
-          << "Incorrect initialization values string specified: " << values[i];
-    }
   }
 
   return true;
@@ -209,10 +170,6 @@ std::vector<Flag> BenchmarkTfLiteModel::GetFlags() {
       Flag("graph", &graph, "graph file name"),
       Flag("input_layer", &input_layer_string, "input layer names"),
       Flag("input_layer_shape", &input_layer_shape_string, "input layer shape"),
-      Flag("input_layer_type", &input_layer_type_string, "input layer type"),
-      Flag("input_layer_values", &input_layer_values_string,
-           "values to initialize the inputs with"),
-      Flag("output_layer", &output_layer_string, "output layer name"),
       Flag("use_nnapi", &use_nnapi, "use nnapi api")};
 
   flags.insert(flags.end(), specific_flags.begin(), specific_flags.end());
@@ -224,8 +181,6 @@ void BenchmarkTfLiteModel::LogFlags() {
   TFLITE_LOG(INFO) << "Graph: [" << graph << "]";
   TFLITE_LOG(INFO) << "Input layers: [" << input_layer_string << "]";
   TFLITE_LOG(INFO) << "Input shapes: [" << input_layer_shape_string << "]";
-  TFLITE_LOG(INFO) << "Input types: [" << input_layer_type_string << "]";
-  TFLITE_LOG(INFO) << "Output layers: [" << output_layer_string << "]";
   TFLITE_LOG(INFO) << "Use nnapi : [" << use_nnapi << "]";
 }
 
@@ -236,8 +191,7 @@ bool BenchmarkTfLiteModel::ValidateFlags() {
     return false;
   }
   return PopulateInputLayerInfo(input_layer_string, input_layer_shape_string,
-                                input_layer_type_string,
-                                input_layer_values_string, &inputs);
+                                &inputs);
 }
 
 uint64_t BenchmarkTfLiteModel::ComputeInputBytes() {
@@ -293,8 +247,6 @@ void BenchmarkTfLiteModel::Init() {
     TFLITE_BENCHMARK_CHECK_EQ(t->name, input.name)
         << "Tensor # " << i << " is named " << t->name << " but flags call it "
         << input.name;
-    TFLITE_BENCHMARK_CHECK_EQ(t->type, input.data_type)
-        << "Could not match the type of input tensor " << t->name;
   }
 
   // Resize all non-string tensors.
diff --git a/tensorflow/contrib/lite/tools/benchmark/benchmark_tflite_model.h b/tensorflow/contrib/lite/tools/benchmark/benchmark_tflite_model.h
index e70f6de1bf..ffb93da964 100644
--- a/tensorflow/contrib/lite/tools/benchmark/benchmark_tflite_model.h
+++ b/tensorflow/contrib/lite/tools/benchmark/benchmark_tflite_model.h
@@ -64,10 +64,7 @@ class BenchmarkTfLiteModel : public BenchmarkModel {
 
   struct InputLayerInfo {
     std::string name;
-    TfLiteType data_type;
     std::vector<int> shape;
-    // Note that initialization_values is currently unused.
-    std::vector<float> initialization_values;
   };
 
  private:
@@ -78,7 +75,6 @@ class BenchmarkTfLiteModel : public BenchmarkModel {
   std::string input_layer_type_string;
   std::string input_layer_shape_string;
   std::string input_layer_values_string;
-  std::string output_layer_string;
   std::vector<InputLayerInfo> inputs;
   bool use_nnapi;
   ProfilingListener profiling_listener_;
-- 
GitLab


From a4c77fd06d215af6f8fbd2c9bca561092c73d79e Mon Sep 17 00:00:00 2001
From: David Majnemer <majnemer@google.com>
Date: Mon, 11 Jun 2018 11:05:13 -0700
Subject: [PATCH 542/610] [XLA] Make Log1p & Expm1 available through python

PiperOrigin-RevId: 200079654
---
 .../compiler/xla/python/local_computation_builder.cc |  2 ++
 .../compiler/xla/python/local_computation_builder.h  |  2 ++
 .../compiler/xla/python/local_computation_builder.i  |  2 ++
 tensorflow/compiler/xla/python/xla_client.py         |  2 ++
 tensorflow/compiler/xla/python/xla_client_test.py    | 12 ++++++++++++
 5 files changed, 20 insertions(+)

diff --git a/tensorflow/compiler/xla/python/local_computation_builder.cc b/tensorflow/compiler/xla/python/local_computation_builder.cc
index f808990cad..ac058feccd 100644
--- a/tensorflow/compiler/xla/python/local_computation_builder.cc
+++ b/tensorflow/compiler/xla/python/local_computation_builder.cc
@@ -598,10 +598,12 @@ _FORWARD_BINOP(Or)
 _FORWARD_UNOP(Not)
 _FORWARD_UNOP(Abs)
 _FORWARD_UNOP(Exp)
+_FORWARD_UNOP(Expm1)
 _FORWARD_UNOP(Floor)
 _FORWARD_UNOP(Ceil)
 _FORWARD_UNOP(Round)
 _FORWARD_UNOP(Log)
+_FORWARD_UNOP(Log1p)
 _FORWARD_UNOP(Sign)
 _FORWARD_UNOP(Cos)
 _FORWARD_UNOP(Sin)
diff --git a/tensorflow/compiler/xla/python/local_computation_builder.h b/tensorflow/compiler/xla/python/local_computation_builder.h
index 9ac13b6523..e30c7790b9 100644
--- a/tensorflow/compiler/xla/python/local_computation_builder.h
+++ b/tensorflow/compiler/xla/python/local_computation_builder.h
@@ -305,10 +305,12 @@ class LocalComputationBuilder {
   _FORWARD_UNOP(Not)
   _FORWARD_UNOP(Abs)
   _FORWARD_UNOP(Exp)
+  _FORWARD_UNOP(Expm1)
   _FORWARD_UNOP(Floor)
   _FORWARD_UNOP(Ceil)
   _FORWARD_UNOP(Round)
   _FORWARD_UNOP(Log)
+  _FORWARD_UNOP(Log1p)
   _FORWARD_UNOP(Sign)
   _FORWARD_UNOP(Cos)
   _FORWARD_UNOP(Sin)
diff --git a/tensorflow/compiler/xla/python/local_computation_builder.i b/tensorflow/compiler/xla/python/local_computation_builder.i
index 536b93c6f9..fcd30b6c2f 100644
--- a/tensorflow/compiler/xla/python/local_computation_builder.i
+++ b/tensorflow/compiler/xla/python/local_computation_builder.i
@@ -974,10 +974,12 @@ tensorflow::ImportNumpy();
 %unignore xla::swig::LocalComputationBuilder::Not;
 %unignore xla::swig::LocalComputationBuilder::Abs;
 %unignore xla::swig::LocalComputationBuilder::Exp;
+%unignore xla::swig::LocalComputationBuilder::Expm1;
 %unignore xla::swig::LocalComputationBuilder::Floor;
 %unignore xla::swig::LocalComputationBuilder::Ceil;
 %unignore xla::swig::LocalComputationBuilder::Round;
 %unignore xla::swig::LocalComputationBuilder::Log;
+%unignore xla::swig::LocalComputationBuilder::Log1p;
 %unignore xla::swig::LocalComputationBuilder::Sign;
 %unignore xla::swig::LocalComputationBuilder::Cos;
 %unignore xla::swig::LocalComputationBuilder::Sin;
diff --git a/tensorflow/compiler/xla/python/xla_client.py b/tensorflow/compiler/xla/python/xla_client.py
index 11611ac612..8b03682892 100644
--- a/tensorflow/compiler/xla/python/xla_client.py
+++ b/tensorflow/compiler/xla/python/xla_client.py
@@ -89,10 +89,12 @@ _UNARY_OPS = [
     'Not',
     'Abs',
     'Exp',
+    'Expm1',
     'Floor',
     'Round',
     'Ceil',
     'Log',
+    'Log1p',
     'Sign',
     'Cos',
     'Sin',
diff --git a/tensorflow/compiler/xla/python/xla_client_test.py b/tensorflow/compiler/xla/python/xla_client_test.py
index 375e720f9b..6c0680f443 100644
--- a/tensorflow/compiler/xla/python/xla_client_test.py
+++ b/tensorflow/compiler/xla/python/xla_client_test.py
@@ -571,6 +571,12 @@ class SingleOpTest(LocalComputationTest):
     c.Exp(c.Constant(arr))
     self._ExecuteAndCompareClose(c, expected=np.exp(arr))
 
+  def testExpm1(self):
+    c = self._NewComputation()
+    arr = NumpyArrayF32([3.3, 12.1])
+    c.Expm1(c.Constant(arr))
+    self._ExecuteAndCompareClose(c, expected=np.expm1(arr))
+
   def testRound(self):
     c = self._NewComputation()
     arr = NumpyArrayF32([3.3, 12.1])
@@ -583,6 +589,12 @@ class SingleOpTest(LocalComputationTest):
     c.Log(c.Constant(arr))
     self._ExecuteAndCompareClose(c, expected=np.log(arr))
 
+  def testLog1p(self):
+    c = self._NewComputation()
+    arr = NumpyArrayF32([3.3, 12.1])
+    c.Log1p(c.Constant(arr))
+    self._ExecuteAndCompareClose(c, expected=np.log1p(arr))
+
   def testNeg(self):
     c = self._NewComputation()
     arr = NumpyArrayF32([3.3, 12.1])
-- 
GitLab


From 81682566acf8ea5b5691a9e36d7740953e3c7ef7 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 11 Jun 2018 11:07:28 -0700
Subject: [PATCH 543/610] Add link to TFlite's supported models table and some
 copyedits

PiperOrigin-RevId: 200080095
---
 tensorflow/docs_src/mobile/tflite/index.md | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/tensorflow/docs_src/mobile/tflite/index.md b/tensorflow/docs_src/mobile/tflite/index.md
index 5622034827..3d1733024e 100644
--- a/tensorflow/docs_src/mobile/tflite/index.md
+++ b/tensorflow/docs_src/mobile/tflite/index.md
@@ -37,8 +37,9 @@ a custom (less-dynamic) memory allocator to ensure minimal load, initialization,
 and execution latency.
 
 TensorFlow Lite provides an interface to leverage hardware acceleration, if
-available on the device. It does so via the Android Neural Networks library,
-released as part of Android O-MR1.
+available on the device. It does so via the
+[Android Neural Networks API](https://developer.android.com/ndk/guides/neuralnetworks/index.html),
+available on Android 8.1 (API level 27) and higher.
 
 ## Why do we need a new mobile-specific library?
 
@@ -116,6 +117,10 @@ following:
       Wear](https://research.googleblog.com/2017/02/on-device-machine-intelligence.html)
       to all first-party and third-party apps.
 
+    Also see the complete list of
+    [TensorFlow Lite's supported models](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/lite/g3doc/models.md),
+    including the model sizes, performance numbers, and downloadable model files.
+
 - Quantized versions of the MobileNet model, which runs faster than the
   non-quantized (float) version on CPU.
 
@@ -131,10 +136,10 @@ compatibility with this release.
 ## Getting Started
 
 We recommend you try out TensorFlow Lite with the pre-tested models indicated
-above. If you have an existing mode, you will need to test whether your model is
-compatible with both the converter and the supported operator set.  To test your
-model, see the [documentation on
-GitHub](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite).
+above. If you have an existing model, you will need to test whether your model
+is compatible with both the converter and the supported operator set.  To test
+your model, see the
+[documentation on GitHub](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/lite).
 
 ### Retrain Inception-V3 or MobileNet for a custom data set
 
-- 
GitLab


From c73cd1afce146aa2559cafa4ac72fe638db43860 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 11 Jun 2018 11:43:45 -0700
Subject: [PATCH 544/610] [TF:XLA] Small performance tweaks for
 tf.random_shuffle, but still too slow.

PiperOrigin-RevId: 200086551
---
 .../compiler/tf2xla/kernels/random_ops.cc     | 30 ++++---------------
 1 file changed, 6 insertions(+), 24 deletions(-)

diff --git a/tensorflow/compiler/tf2xla/kernels/random_ops.cc b/tensorflow/compiler/tf2xla/kernels/random_ops.cc
index ebac5c4396..105be38fe2 100644
--- a/tensorflow/compiler/tf2xla/kernels/random_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/random_ops.cc
@@ -76,32 +76,14 @@ class RandomShuffleOp : public XlaOpKernel {
       ctx->SetOutput(0, input);
     } else {
       // Generate the random swaps for the indices.
-      auto zero = builder->Broadcast(
-          builder->ConstantLiteral(xla::Literal::Zero(xla::S32)),
-          gtl::ArraySlice<int64>({n}));
-      auto n_maxval = builder->Broadcast(builder->ConstantR0<int32>(n),
-                                         gtl::ArraySlice<int64>({n}));
       auto swaps_shape = xla::ShapeUtil::MakeShape(xla::S32, {n});
-      auto swaps = builder->RngUniform(zero, n_maxval, swaps_shape);
+      auto swaps =
+          builder->RngUniform(builder->ConstantR0<int32>(0),
+                              builder->ConstantR0<int32>(n), swaps_shape);
 
       // Generate range(n) as the initial value for the indices to be swapped.
-      auto index_init_body_fn = [&](xla::XlaOp i,
-                                    gtl::ArraySlice<xla::XlaOp> loop_vars,
-                                    xla::XlaBuilder* builder)
-          -> xla::StatusOr<std::vector<xla::XlaOp>> {
-        auto indices = loop_vars[0];
-        i = builder->Reshape(i, {}, {1});
-        // indices[i] = i
-        indices = builder->DynamicUpdateSlice(indices, i, i);
-        return std::vector<xla::XlaOp>{indices};
-      };
-      // for i in range(n):
-      xla::XlaOp index_zeros = Zeros(builder, swaps_shape);
-      auto index_init_loop_result =
-          XlaForEachIndex(n, xla::S32, index_init_body_fn, {index_zeros},
-                          "index_init_loop", builder)
-              .ValueOrDie();
-      auto indices = index_init_loop_result[0];
+      xla::XlaOp indices;
+      TF_CHECK_OK(XlaHelpers::Iota(builder, DataType::DT_INT32, n, &indices));
 
       // Swap the indices at i and swaps[i].
       auto swap_body_fn = [&](xla::XlaOp i,
@@ -110,7 +92,7 @@ class RandomShuffleOp : public XlaOpKernel {
           -> xla::StatusOr<std::vector<xla::XlaOp>> {
         auto swaps = loop_vars[0];
         auto indices = loop_vars[1];
-        i = builder->Reshape(i, {}, {1});
+        i = builder->Reshape(i, {1});
         // temp = indices[i]
         auto temp = builder->DynamicSlice(indices, i, {1});
         // swap_index = swaps[i]
-- 
GitLab


From 68d7bcaa52a2b3307e805e2c8512a8dc47fd3272 Mon Sep 17 00:00:00 2001
From: Blake Hechtman <blakehechtman@google.com>
Date: Mon, 11 Jun 2018 11:44:49 -0700
Subject: [PATCH 545/610] [XLA] Fold consecutive reduces.

PiperOrigin-RevId: 200086761
---
 .../xla/service/algebraic_simplifier.cc       | 31 +++++++++++++++
 .../xla/service/algebraic_simplifier_test.cc  | 38 +++++++++++++++++++
 2 files changed, 69 insertions(+)

diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc
index dc5f1b31bf..3b36939b8a 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc
@@ -1783,6 +1783,37 @@ Status AlgebraicSimplifierVisitor::HandleReduce(HloInstruction* reduce) {
     return ReplaceWithNewInstruction(
         reduce, HloInstruction::CreateReshape(reduce->shape(), arg));
   }
+
+  // If a reduce feeds a reduce with the same computation and initial value,
+  // they can be combined into a single reduce.
+  if (arg->opcode() == HloOpcode::kReduce &&
+      init_value->Identical(*arg->operand(1)) &&
+      *function == *arg->to_apply()) {
+    // Create a new reduce with the combined reduction dimensions of both
+    // reduces.
+    std::vector<int64> arg_dims = arg->dimensions();
+    std::sort(arg_dims.begin(), arg_dims.end());
+    std::vector<int64> reduce_dims = reduce->dimensions();
+    std::sort(reduce_dims.begin(), reduce_dims.end());
+    // Transform reduce_dims to the same rank as the operand of the operand.
+    for (int64 arg_dim : arg_dims) {
+      for (int64& dim : reduce_dims) {
+        if (dim >= arg_dim) {
+          ++dim;
+        }
+      }
+    }
+    std::vector<int64> new_dimensions;
+    new_dimensions.reserve(arg->dimensions().size() +
+                           reduce->dimensions().size());
+    std::merge(arg_dims.begin(), arg_dims.end(), reduce_dims.begin(),
+               reduce_dims.end(), std::back_inserter(new_dimensions));
+    return ReplaceWithNewInstruction(
+        reduce,
+        HloInstruction::CreateReduce(reduce->shape(), arg->mutable_operand(0),
+                                     init_value, new_dimensions, function));
+  }
+
   // A reshape that collapses multiple dimensions into a dimension being
   // reduced can just reduce all of those dimensions instead of doing a
   // collapsing reshape before a reduction.
diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
index 27eb48181e..2605b0488c 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
@@ -74,6 +74,44 @@ TEST_F(AlgebraicSimplifierTest, AddZero) {
   EXPECT_EQ(root, param0);
 }
 
+// Test that Reduce(Reduce(A)) -> Reduce(A)
+TEST_F(AlgebraicSimplifierTest, TwoReducesToOne) {
+  HloComputation::Builder builder(TestName());
+  // Create add computation.
+  HloInstruction* zero = builder.AddInstruction(
+      HloInstruction::CreateConstant(Literal::CreateR0<float>(0.0f)));
+  HloComputation* add_computation = nullptr;
+  {
+    HloComputation::Builder builder(TestName() + ".add");
+    const Shape scalar_shape = ShapeUtil::MakeShape(F32, {});
+    HloInstruction* p0 = builder.AddInstruction(
+        HloInstruction::CreateParameter(0, scalar_shape, "p0"));
+    HloInstruction* p1 = builder.AddInstruction(
+        HloInstruction::CreateParameter(1, scalar_shape, "p1"));
+    builder.AddInstruction(
+        HloInstruction::CreateBinary(scalar_shape, HloOpcode::kAdd, p0, p1));
+    add_computation = module().AddEmbeddedComputation(builder.Build());
+  }
+  Shape r4f32 = ShapeUtil::MakeShape(F32, {4, 5, 6, 7});
+  HloInstruction* param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, r4f32, "param"));
+  std::vector<int64> dims0({0});
+  Shape r3f32 = ShapeUtil::MakeShape(F32, {5, 6, 7});
+  HloInstruction* reduce0 = builder.AddInstruction(
+      HloInstruction::CreateReduce(r3f32, param, zero, dims0, add_computation));
+  std::vector<int64> dims1({1, 2});
+  Shape r1f32 = ShapeUtil::MakeShape(F32, {5});
+  builder.AddInstruction(HloInstruction::CreateReduce(r1f32, reduce0, zero,
+                                                      dims1, add_computation));
+  module().AddEntryComputation(builder.Build());
+  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
+                                 non_bitcasting_callback());
+  ASSERT_TRUE(simplifier.Run(&module()).ValueOrDie());
+  HloInstruction* root = module().entry_computation()->root_instruction();
+  EXPECT_THAT(root, op::Reduce(param, zero));
+  EXPECT_EQ(root->dimensions(), std::vector<int64>({0, 2, 3}));
+}
+
 // Test that Const + A is canonicalized to A + Const.
 TEST_F(AlgebraicSimplifierTest, AddConstOnLHS) {
   Shape r0f32 = ShapeUtil::MakeShape(F32, {});
-- 
GitLab


From 719da533b716fd14291229909b8f19092cebe21d Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 11 Jun 2018 11:45:52 -0700
Subject: [PATCH 546/610] Add missing ` in docstring that led to misformatted
 documentation.

PiperOrigin-RevId: 200086945
---
 tensorflow/python/ops/custom_gradient.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/python/ops/custom_gradient.py b/tensorflow/python/ops/custom_gradient.py
index d934f27cb9..ca24f11054 100644
--- a/tensorflow/python/ops/custom_gradient.py
+++ b/tensorflow/python/ops/custom_gradient.py
@@ -89,7 +89,7 @@ def custom_gradient(f):
          operations in `f` to `x`.
        - `grad_fn` is a function with the signature `g(*grad_ys)` which returns
          a list of `Tensor`s - the derivatives of `Tensor`s in `y` with respect
-         to the `Tensor`s in `x.  `grad_ys` is a `Tensor` or sequence of
+         to the `Tensor`s in `x`.  `grad_ys` is a `Tensor` or sequence of
          `Tensor`s the same size as `y` holding the initial value gradients for
          each `Tensor` in `y`. If `f` uses `Variable`s (that are not part of the
          inputs), i.e. through `get_variable`, then `grad_fn` should have
-- 
GitLab


From ff72c6d36e6d02da88ee1cdef4c573cb2577a09e Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 11 Jun 2018 11:49:26 -0700
Subject: [PATCH 547/610] [TF:XLA] Small clean up, removing unused variable in
 the Cholesky implementation.

PiperOrigin-RevId: 200087647
---
 tensorflow/compiler/tf2xla/lib/cholesky.cc | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tensorflow/compiler/tf2xla/lib/cholesky.cc b/tensorflow/compiler/tf2xla/lib/cholesky.cc
index 3f1384bc86..20925118bf 100644
--- a/tensorflow/compiler/tf2xla/lib/cholesky.cc
+++ b/tensorflow/compiler/tf2xla/lib/cholesky.cc
@@ -110,7 +110,6 @@ xla::StatusOr<xla::XlaOp> CholeskyUnblocked(xla::XlaBuilder* builder,
         FloatLiteral(body_builder, a_shape.element_type(), 0.5));
 
     // a[..., i+1:, i]
-    auto ip1 = body_builder->Add(i, body_builder->ConstantR0<int32>(1));
     // select the whole i-th column, then mask out all rows above i+1
     TF_ASSIGN_OR_RETURN(
         auto a_0i, DynamicSliceInMinorDims(body_builder, body_a, {i}, {1}));
-- 
GitLab


From 9eef81aeeff86192dfcb1e9b7758bcece00a9b1d Mon Sep 17 00:00:00 2001
From: Igor Ganichev <iga@google.com>
Date: Mon, 11 Jun 2018 11:50:11 -0700
Subject: [PATCH 548/610] Implement Shape and friends as direct XLA kernels

PiperOrigin-RevId: 200087766
---
 tensorflow/compiler/jit/BUILD                 |  1 +
 tensorflow/compiler/jit/xla_device_ops.h      | 41 +++++++++++
 tensorflow/compiler/tests/eager_test.py       | 71 +++++++++++++++++++
 .../compiler/tf2xla/kernels/shape_op.cc       | 15 ++--
 4 files changed, 120 insertions(+), 8 deletions(-)

diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD
index e2b614d91b..51a79e2cd9 100644
--- a/tensorflow/compiler/jit/BUILD
+++ b/tensorflow/compiler/jit/BUILD
@@ -181,6 +181,7 @@ cc_library(
         "//tensorflow/core/kernels:no_op",
         "//tensorflow/core/kernels:resource_variable_ops",
         "//tensorflow/core/kernels:sendrecv_ops",
+        "//tensorflow/core/kernels:shape_ops",
         "//tensorflow/core/kernels:variable_ops",
     ],
 )
diff --git a/tensorflow/compiler/jit/xla_device_ops.h b/tensorflow/compiler/jit/xla_device_ops.h
index 0c49286acd..11e45d2823 100644
--- a/tensorflow/compiler/jit/xla_device_ops.h
+++ b/tensorflow/compiler/jit/xla_device_ops.h
@@ -28,6 +28,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/no_op.h"
 #include "tensorflow/core/kernels/resource_variable_ops.h"
 #include "tensorflow/core/kernels/sendrecv_ops.h"
+#include "tensorflow/core/kernels/shape_ops.h"
 #include "tensorflow/core/kernels/variable_ops.h"
 
 namespace tensorflow {
@@ -87,6 +88,46 @@ class XlaAssignVariableOp : public AsyncOpKernel {
   REGISTER_KERNEL_BUILDER(                                                     \
       Name("ReadVariableOp").Device(DEVICE).HostMemory("resource"),            \
       ReadVariableOp);                                                         \
+  REGISTER_KERNEL_BUILDER(Name("Shape")                                        \
+                              .Device(DEVICE)                                  \
+                              .HostMemory("output")                            \
+                              .TypeConstraint<int32>("out_type")               \
+                              .TypeConstraint("T", TYPES),                     \
+                          ShapeOp<int32>);                                     \
+  REGISTER_KERNEL_BUILDER(Name("Shape")                                        \
+                              .Device(DEVICE)                                  \
+                              .HostMemory("output")                            \
+                              .TypeConstraint<int64>("out_type")               \
+                              .TypeConstraint("T", TYPES),                     \
+                          ShapeOp<int64>);                                     \
+  REGISTER_KERNEL_BUILDER(Name("ShapeN")                                       \
+                              .Device(DEVICE)                                  \
+                              .HostMemory("output")                            \
+                              .TypeConstraint<int32>("out_type")               \
+                              .TypeConstraint("T", TYPES),                     \
+                          ShapeNOp<int32>);                                    \
+  REGISTER_KERNEL_BUILDER(Name("ShapeN")                                       \
+                              .Device(DEVICE)                                  \
+                              .HostMemory("output")                            \
+                              .TypeConstraint<int64>("out_type")               \
+                              .TypeConstraint("T", TYPES),                     \
+                          ShapeNOp<int64>);                                    \
+  REGISTER_KERNEL_BUILDER(Name("Size")                                         \
+                              .Device(DEVICE)                                  \
+                              .HostMemory("output")                            \
+                              .TypeConstraint<int32>("out_type")               \
+                              .TypeConstraint("T", TYPES),                     \
+                          SizeOp<int32>);                                      \
+  REGISTER_KERNEL_BUILDER(Name("Size")                                         \
+                              .Device(DEVICE)                                  \
+                              .HostMemory("output")                            \
+                              .TypeConstraint<int64>("out_type")               \
+                              .TypeConstraint("T", TYPES),                     \
+                          SizeOp<int64>);                                      \
+  REGISTER_KERNEL_BUILDER(                                                     \
+      Name("Rank").Device(DEVICE).HostMemory("output").TypeConstraint("T",     \
+                                                                      TYPES),  \
+      RankOp);                                                                 \
   REGISTER_KERNEL_BUILDER(                                                     \
       Name("AssignVariableOp").Device(DEVICE).HostMemory("resource"),          \
       XlaAssignVariableOp);                                                    \
diff --git a/tensorflow/compiler/tests/eager_test.py b/tensorflow/compiler/tests/eager_test.py
index 4dff5f0f40..fceb61ef87 100644
--- a/tensorflow/compiler/tests/eager_test.py
+++ b/tensorflow/compiler/tests/eager_test.py
@@ -160,6 +160,77 @@ class EagerTest(XLATestCase):
       for _ in range(100):
         values.append(var.value())
 
+  # The shape, shape_n, size, and rank are tested here because their
+  # execution kernels (as opposed to compilation only tf2xla kernels)
+  # are distinct from tf2xla kernels.
+
+  def testShape(self):
+    def const(value):
+      return array_ops.shape(
+          constant_op.constant(value)).numpy()
+
+    def ones(value):
+      return array_ops.shape(
+          array_ops.ones(value)).numpy()
+
+    with self.test_scope():
+      # Shapes of directly constructed tensors
+      self.assertAllEqual([], const(3))
+      self.assertAllEqual([3], const([1.0, 2.0, 3.0]))
+      self.assertAllEqual([2, 2], const([[1.0, 2.0], [3.0, 4.0]]))
+      self.assertAllEqual([2, 1, 2], const([[[1.0, 2.0]], [[3.0, 4.0]]]))
+
+      # Shapes of tensors created by op running on device
+      # We make this distinction because directly constructed tensors
+      # are treated differently in a few places that can influence shape:
+      #  - they always have on_host_tensor
+      #  - they and their shapes can be cached
+      #  - they end up on device via a copy, instead of as program output
+      self.assertAllEqual([], ones([]))
+      self.assertAllEqual([3], ones([3]))
+      self.assertAllEqual([2, 2], ones([2, 2]))
+      self.assertAllEqual([2, 1, 2], ones([2, 1, 2]))
+
+  def testShapeN(self):
+    with self.test_scope():
+      # Shapes of directly constructed tensors
+      shapes = array_ops.shape_n([
+          constant_op.constant(1.0),
+          constant_op.constant([1.0, 2.0, 3.0]),
+          constant_op.constant([[1.0, 2.0], [3.0, 4.0]])])
+      self.assertAllEqual(
+          [[], [3], [2, 2]],
+          [x.numpy().tolist() for x in shapes])
+
+      # Shapes of tensors created by op running on device
+      shapes = array_ops.shape_n([
+          array_ops.ones([]),
+          array_ops.ones([3]),
+          array_ops.ones([2, 2])])
+      self.assertAllEqual(
+          [[], [3], [2, 2]],
+          [x.numpy().tolist() for x in shapes])
+
+  def testSize(self):
+    with self.test_scope():
+      self.assertEqual(
+          1, array_ops.size(constant_op.constant(1.0)).numpy())
+      self.assertEqual(
+          3, array_ops.size(constant_op.constant([1.0, 2.0, 3.0])).numpy())
+      self.assertEqual(
+          4, array_ops.size(
+              constant_op.constant([[1.0, 2.0], [3.0, 4.0]])).numpy())
+
+  def testRank(self):
+    with self.test_scope():
+      self.assertEqual(
+          0, array_ops.rank(constant_op.constant(1.0)).numpy())
+      self.assertEqual(
+          1, array_ops.rank(constant_op.constant([1.0, 2.0, 3.0])).numpy())
+      self.assertEqual(
+          2, array_ops.rank(
+              constant_op.constant([[1.0, 2.0], [3.0, 4.0]])).numpy())
+
 
 class EagerFunctionTest(XLATestCase):
 
diff --git a/tensorflow/compiler/tf2xla/kernels/shape_op.cc b/tensorflow/compiler/tf2xla/kernels/shape_op.cc
index 05354bca5b..d59720bef7 100644
--- a/tensorflow/compiler/tf2xla/kernels/shape_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/shape_op.cc
@@ -43,7 +43,7 @@ class ShapeOp : public XlaOpKernel {
   DataType out_dtype_;
 };
 
-REGISTER_XLA_OP(Name("Shape"), ShapeOp);
+REGISTER_XLA_OP(Name("Shape").CompilationOnly(), ShapeOp);
 
 class ShapeNOp : public XlaOpKernel {
  public:
@@ -65,7 +65,7 @@ class ShapeNOp : public XlaOpKernel {
  private:
   DataType out_dtype_;
 };
-REGISTER_XLA_OP(Name("ShapeN"), ShapeNOp);
+REGISTER_XLA_OP(Name("ShapeN").CompilationOnly(), ShapeNOp);
 
 class RankOp : public XlaOpKernel {
  public:
@@ -81,7 +81,7 @@ class RankOp : public XlaOpKernel {
   }
 };
 
-REGISTER_XLA_OP(Name("Rank"), RankOp);
+REGISTER_XLA_OP(Name("Rank").CompilationOnly(), RankOp);
 
 class SizeOp : public XlaOpKernel {
  public:
@@ -100,7 +100,7 @@ class SizeOp : public XlaOpKernel {
   }
 };
 
-REGISTER_XLA_OP(Name("Size"), SizeOp);
+REGISTER_XLA_OP(Name("Size").CompilationOnly(), SizeOp);
 
 class ExpandDimsOp : public XlaOpKernel {
  public:
@@ -189,10 +189,9 @@ class SqueezeOp : public XlaOpKernel {
       if (!wrapped_squeeze_dims.empty()) {
         if (wrapped_squeeze_dims.count(i) > 0) {
           OP_REQUIRES(ctx, existing_dim == 1,
-                      errors::InvalidArgument("Tried to explicitly squeeze "
-                                              "dimension ",
-                                              i, " but dimension was not 1: ",
-                                              existing_dim));
+                      errors::InvalidArgument(
+                          "Tried to explicitly squeeze dimension ", i,
+                          " but dimension was not 1: ", existing_dim));
         } else {
           // This dimension is not being squeezed.
           new_shape.push_back(existing_dim);
-- 
GitLab


From e20ccaab7a85d729f37ad4b7b90188e97e2124fa Mon Sep 17 00:00:00 2001
From: Allen Lavoie <allenl@google.com>
Date: Mon, 11 Jun 2018 11:55:34 -0700
Subject: [PATCH 549/610] Use the Keras session for saving/loading in
 TensorFlow format

Fixes issues when there's no default session

PiperOrigin-RevId: 200088574
---
 tensorflow/python/keras/engine/network.py     | 10 +++-
 tensorflow/python/keras/engine/saving_test.py | 52 +++++++++++++------
 2 files changed, 44 insertions(+), 18 deletions(-)

diff --git a/tensorflow/python/keras/engine/network.py b/tensorflow/python/keras/engine/network.py
index c096669a5f..e7ec237163 100644
--- a/tensorflow/python/keras/engine/network.py
+++ b/tensorflow/python/keras/engine/network.py
@@ -20,6 +20,7 @@ from __future__ import division
 from __future__ import print_function
 
 import copy
+import functools
 import json
 import os
 import weakref
@@ -1300,7 +1301,11 @@ class Network(base_layer.Layer):
       with h5py.File(filepath, 'w') as f:
         saving.save_weights_to_hdf5_group(f, self.layers)
     else:
-      self._checkpointable_saver.save(filepath)
+      if context.executing_eagerly():
+        session = None
+      else:
+        session = backend.get_session()
+      self._checkpointable_saver.save(filepath, session=session)
 
   def load_weights(self, filepath, by_name=False):
     """Loads all layer weights, either from a TensorFlow or an HDF5 weight file.
@@ -1360,7 +1365,8 @@ class Network(base_layer.Layer):
             'loading TensorFlow-formatted weights (got by_name=True to '
             'load_weights).')
       if not context.executing_eagerly():
-        finalizer = status.run_restore_ops
+        session = backend.get_session()
+        finalizer = functools.partial(status.run_restore_ops, session=session)
         if self.built:
           finalizer()
         else:
diff --git a/tensorflow/python/keras/engine/saving_test.py b/tensorflow/python/keras/engine/saving_test.py
index 1470718a5e..6a94986b9c 100644
--- a/tensorflow/python/keras/engine/saving_test.py
+++ b/tensorflow/python/keras/engine/saving_test.py
@@ -428,26 +428,27 @@ class TestWholeModelSaving(test.TestCase):
       os.remove(fname)
 
   def test_saving_lambda_numpy_array_arguments(self):
-    if h5py is None:
-      self.skipTest('h5py required to run this test')
+    with self.test_session():
+      if h5py is None:
+        self.skipTest('h5py required to run this test')
 
-    mean = np.random.random((4, 2, 3))
-    std = np.abs(np.random.random((4, 2, 3))) + 1e-5
-    inputs = keras.layers.Input(shape=(4, 2, 3))
-    output = keras.layers.Lambda(lambda image, mu, std: (image - mu) / std,
-                                 arguments={'mu': mean, 'std': std})(inputs)
-    model = keras.models.Model(inputs, output)
-    model.compile(loss='mse', optimizer='sgd', metrics=['acc'])
+      mean = np.random.random((4, 2, 3))
+      std = np.abs(np.random.random((4, 2, 3))) + 1e-5
+      inputs = keras.layers.Input(shape=(4, 2, 3))
+      output = keras.layers.Lambda(lambda image, mu, std: (image - mu) / std,
+                                   arguments={'mu': mean, 'std': std})(inputs)
+      model = keras.models.Model(inputs, output)
+      model.compile(loss='mse', optimizer='sgd', metrics=['acc'])
 
-    fd, fname = tempfile.mkstemp('.h5')
-    keras.models.save_model(model, fname)
+      fd, fname = tempfile.mkstemp('.h5')
+      keras.models.save_model(model, fname)
 
-    model = keras.models.load_model(fname)
-    os.close(fd)
-    os.remove(fname)
+      model = keras.models.load_model(fname)
+      os.close(fd)
+      os.remove(fname)
 
-    self.assertAllClose(mean, model.layers[1].arguments['mu'])
-    self.assertAllClose(std, model.layers[1].arguments['std'])
+      self.assertAllClose(mean, model.layers[1].arguments['mu'])
+      self.assertAllClose(std, model.layers[1].arguments['std'])
 
   def test_saving_model_with_long_layer_names(self):
     if h5py is None:
@@ -604,6 +605,25 @@ class TestWeightSavingAndLoadingTFFormat(test.TestCase):
         # Indirectly tests that the user is prompted
         model.save_weights(prefix, save_format='tensorflow', overwrite=False)
 
+  def test_no_default_session(self):
+    with ops.Graph().as_default():
+      self.assertFalse(ops.get_default_session())
+      data = np.random.random((1000, 32)).astype(np.float32)
+      labels = np.random.random((1000, 10)).astype(np.float32)
+
+      model = keras.models.Sequential([
+          keras.layers.Dense(10, activation='softmax'),
+          keras.layers.Dense(10, activation='softmax')])
+
+      model.compile(optimizer=training_module.RMSPropOptimizer(0.001),
+                    loss='categorical_crossentropy',
+                    metrics=['accuracy'])
+
+      model.fit(data, labels)
+      fname = os.path.join(self.get_temp_dir(), 'weights', 'ckpt')
+      model.save_weights(fname)
+      model.load_weights(fname)
+
   def test_no_graph_pollution(self):
     with context.graph_mode():
       graph = ops.Graph()
-- 
GitLab


From 1fefd1af5b30bfe6213271da558c5131fd33ce0a Mon Sep 17 00:00:00 2001
From: Justin Lebar <jlebar@google.com>
Date: Mon, 11 Jun 2018 11:57:16 -0700
Subject: [PATCH 550/610] [XLA] Allow replay_computation to take an HLO textual
 string as input.

PiperOrigin-RevId: 200088845
---
 tensorflow/compiler/xla/tools/BUILD           |  1 +
 .../compiler/xla/tools/replay_computation.cc  | 52 ++++++++++++++-----
 2 files changed, 39 insertions(+), 14 deletions(-)

diff --git a/tensorflow/compiler/xla/tools/BUILD b/tensorflow/compiler/xla/tools/BUILD
index ff5340ee3f..e4a052c8f1 100644
--- a/tensorflow/compiler/xla/tools/BUILD
+++ b/tensorflow/compiler/xla/tools/BUILD
@@ -85,6 +85,7 @@ cc_library(
         "//tensorflow/compiler/xla/client:global_data",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client/lib:testing",
+        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/service:hlo_proto",
         "//tensorflow/compiler/xla/service/gpu:infeed_manager",
         "//tensorflow/compiler/xla/tests:test_utils",
diff --git a/tensorflow/compiler/xla/tools/replay_computation.cc b/tensorflow/compiler/xla/tools/replay_computation.cc
index be094b7890..f7574e0b1c 100644
--- a/tensorflow/compiler/xla/tools/replay_computation.cc
+++ b/tensorflow/compiler/xla/tools/replay_computation.cc
@@ -24,6 +24,9 @@ limitations under the License.
 // passing --use_fake_data on the command line.  If the real data is available
 // in the proto and --use_fake_data is false, the real data is used.
 //
+// Input can be a binary HloSnapshot proto, a binary HloProto proto, or a
+// textual HLO string.
+//
 // The output format is:
 //
 // file_path: computation_name :: type:literal_str
@@ -43,6 +46,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/service/gpu/infeed_manager.h"
 #include "tensorflow/compiler/xla/service/hlo.pb.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/statusor.h"
@@ -195,25 +199,45 @@ StatusOr<Literal> ReplayComputation(const HloSnapshot& module,
   return std::move(*result_literal);
 }
 
+StatusOr<HloSnapshot> ParseInputFile(const string& filename,
+                                     const Options& opts) {
+  tensorflow::Env* env = tensorflow::Env::Default();
+  HloSnapshot snapshot;
+  if (tensorflow::ReadBinaryProto(env, filename, &snapshot).ok()) {
+    return snapshot;
+  }
+  CHECK(opts.use_fake_data)
+      << "Without --use_fake_data, you must pass an HloSnapshot -- HloProto "
+         "and textual HLO don't carry real data.";
+  fprintf(stderr, "%s: is not HloSnapshot. Trying HloProto.\n",
+          filename.c_str());
+
+  if (tensorflow::ReadBinaryProto(env, filename, snapshot.mutable_hlo()).ok()) {
+    return snapshot;
+  }
+  fprintf(stderr, "%s: is not HloProto. Trying HLO text.\n", filename.c_str());
+  string contents;
+  TF_RETURN_IF_ERROR(tensorflow::ReadFileToString(env, filename, &contents));
+  StatusOr<std::unique_ptr<HloModule>> module = ParseHloString(contents);
+  if (module.ok()) {
+    *snapshot.mutable_hlo()->mutable_hlo_module() =
+        module.ValueOrDie()->ToProto();
+    return snapshot;
+  }
+  fprintf(stderr, "%s: is not HLO text.  Nothing left to try.\n",
+          filename.c_str());
+  return InvalidArgument("Could not parse %s.", filename.c_str());
+}
+
 int RealMain(tensorflow::gtl::ArraySlice<char*> args, const Options& opts) {
   LocalClient* client = ClientLibrary::LocalClientOrDie();
-  tensorflow::Env* env = tensorflow::Env::Default();
   int exit_status = EXIT_SUCCESS;
   for (char* arg : args) {
-    HloSnapshot snapshot;
-    auto status = tensorflow::ReadBinaryProto(env, arg, &snapshot);
-    if (!status.ok()) {
-      fprintf(stderr, "%s: is not HloSnapshot. Trying HloProto.\n", arg);
-      status = tensorflow::ReadBinaryProto(env, arg, snapshot.mutable_hlo());
-      if (!status.ok()) {
-        fprintf(stderr, "%s: is not HloSnapshot or HloProto: %s.\n", arg,
-                status.ToString().c_str());
-        continue;
-      }
-      CHECK(opts.use_fake_data)
-          << "HloProto input must be handled with --use_fake_data";
+    StatusOr<HloSnapshot> maybe_snapshot = ParseInputFile(arg, opts);
+    if (!maybe_snapshot.ok()) {
+      continue;
     }
-
+    HloSnapshot snapshot = std::move(maybe_snapshot).ValueOrDie();
     StatusOr<Literal> result_status = ReplayComputation(snapshot, client, opts);
     if (!result_status.ok()) {
       fprintf(stderr, "%s: error: %s\n", arg,
-- 
GitLab


From 308fe20c728538112cb6ee3c051187977b88773b Mon Sep 17 00:00:00 2001
From: Justin Lebar <jlebar@google.com>
Date: Mon, 11 Jun 2018 12:30:55 -0700
Subject: [PATCH 551/610] [XLA] Inline constants into fusion nodes in graphviz
 dump.

Reduces visual noise, makes it easier to see the *actual* parameters.

PiperOrigin-RevId: 200094095
---
 .../compiler/xla/service/hlo_graph_dumper.cc  | 57 ++++++++++++-------
 1 file changed, 38 insertions(+), 19 deletions(-)

diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
index cf954001c6..05aab9a2cd 100644
--- a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
+++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
@@ -723,11 +723,28 @@ string HloDotDumper::DumpRootTag() {
                 to_id, node_body, node_shape, NodeColorAttributes(color));
 }
 
+static const HloInstruction* TryGetFusionParameterConstant(
+    const HloInstruction* instr) {
+  if (instr->opcode() != HloOpcode::kParameter || !instr->IsFused()) {
+    return nullptr;
+  }
+  const HloInstruction* fusion = instr->parent()->FusionInstruction();
+  const HloInstruction* operand = fusion->operand(instr->parameter_number());
+  if (operand->opcode() == HloOpcode::kConstant) {
+    return operand;
+  }
+  return nullptr;
+}
+
 bool HloDotDumper::ShouldMergeIntoUsers(const HloInstruction* instr) const {
   // If a node:
   //
-  //  - is a tuple-shaped parameter,
-  //  - is not a parameter to a fusion node,
+  //  - is a parameter of a fusion node which is bound to a constant,
+  //
+  // or
+  //
+  //  - is a tuple-shaped parameter, and
+  //  - is not a parameter to a fusion node, and
   //  - has at least kMinUsersToOmit users shown, and
   //  - all of the shown users are get-tuple-elements,
   //
@@ -735,6 +752,9 @@ bool HloDotDumper::ShouldMergeIntoUsers(const HloInstruction* instr) const {
   //
   // This helps us handle the common case where a while loop body has one big
   // tuple-shaped parameter.
+  if (TryGetFusionParameterConstant(instr) != nullptr) {
+    return true;
+  }
   const int kMinUsersToOmit = 3;
   return instr->opcode() == HloOpcode::kParameter &&
          ShapeUtil::IsTuple(instr->shape()) && !instr->IsFused() &&
@@ -841,17 +861,6 @@ string HloDotDumper::GetInstructionNodeInlinedOperands(
                   ShapeUtil::HumanString(constant->shape()));
   };
 
-  // Special case: If instr is a parameter to a fusion node, check whether the
-  // corresponding operand to the fusion node is a constant.
-  if (instr->opcode() == HloOpcode::kParameter && instr->IsFused()) {
-    const HloInstruction* fusion = instr->parent()->FusionInstruction();
-    const HloInstruction* operand = fusion->operand(instr->parameter_number());
-    if (operand->opcode() != HloOpcode::kConstant) {
-      return "";
-    }
-    return StrCat("<b>constant</b> ", stringify_constant(operand));
-  }
-
   std::vector<string> lines;
   for (int64 i = 0; i < instr->operand_count(); ++i) {
     const HloInstruction* operand = instr->operand(i);
@@ -859,11 +868,18 @@ string HloDotDumper::GetInstructionNodeInlinedOperands(
     if (operand->opcode() == HloOpcode::kConstant) {
       operand_str = stringify_constant(operand);
     } else if (ShouldMergeIntoUsers(operand)) {
-      // Special case: If the operand is a parameter, use its parameter number
-      // rather than its name, because that's generally how people think of the
-      // node.
+      // Special case: If the operand is a parameter to a fusion node and it
+      // always has a constant value, display it like a regular constant.
+      //
+      // For other parameters, use the parameter number rather than the proper
+      // name, because that's generally how people think of the node.
       if (operand->opcode() == HloOpcode::kParameter) {
-        operand_str = Printf("Parameter %lld", operand->parameter_number());
+        if (const HloInstruction* constant =
+                TryGetFusionParameterConstant(operand)) {
+          operand_str = stringify_constant(constant);
+        } else {
+          operand_str = Printf("Parameter %lld", operand->parameter_number());
+        }
       } else {
         operand_str = operand->name();
       }
@@ -897,11 +913,14 @@ ColorScheme HloDotDumper::GetInstructionColor(const HloInstruction* instr) {
   const auto kParameterColor = kOrange;
 
   // Special case: If this instruction has a parameter merged into it, paint it
-  // the same color as a parameter.
+  // the same color as a parameter.  Unless the merged-in parameter is a
+  // parameter to a fusion node that is bound to a constant -- these aren't
+  // "real" parameters from the user's perspective.
   if (std::any_of(instr->operands().begin(), instr->operands().end(),
                   [&](const HloInstruction* operand) {
                     return operand->opcode() == HloOpcode::kParameter &&
-                           ShouldMergeIntoUsers(operand);
+                           ShouldMergeIntoUsers(operand) &&
+                           TryGetFusionParameterConstant(operand) == nullptr;
                   })) {
     return kParameterColor;
   }
-- 
GitLab


From 32c8013f0ab3feb139648ae759e2d0168fb5dc95 Mon Sep 17 00:00:00 2001
From: Brennan Saeta <saeta@google.com>
Date: Mon, 11 Jun 2018 12:40:54 -0700
Subject: [PATCH 552/610] Check to ensure the Cloud TPU is ready before
 resolving.

PiperOrigin-RevId: 200095692
---
 .../python/training/tpu_cluster_resolver.py   |  4 ++
 .../training/tpu_cluster_resolver_test.py     | 44 +++++++++++++++++++
 2 files changed, 48 insertions(+)

diff --git a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py
index a5a9630a4a..3a1d90e77d 100644
--- a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py
+++ b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py
@@ -256,6 +256,10 @@ class TPUClusterResolver(ClusterResolver):
       request = self._service.projects().locations().nodes().get(name=full_name)
       response = request.execute()
 
+      if 'state' in response and response['state'] != 'READY':
+        raise RuntimeError('TPU "%s" is not yet ready; state: "%s"' %
+                           (self._tpu, response['state']))
+
       if 'health' in response and response['health'] != 'HEALTHY':
         raise RuntimeError('TPU "%s" is unhealthy: "%s"' % (self._tpu,
                                                             response['health']))
diff --git a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py
index 5fac55fd02..86e9d9ddad 100644
--- a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py
+++ b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py
@@ -158,6 +158,50 @@ class TPUClusterResolverTest(test.TestCase):
     """
     self._verifyClusterSpecEquality(actual_cluster_spec, expected_proto)
 
+  @mock.patch.object(TPUClusterResolver, '_requestComputeMetadata',
+                     mock_request_compute_metadata)
+  def testUnhealthyCloudTpu(self):
+    tpu_map = {
+        'projects/test-project/locations/us-central1-c/nodes/test-tpu-1': {
+            'ipAddress': '10.1.2.3',
+            'port': '8470',
+            'health': 'UNHEALTHY'
+        }
+    }
+
+    tpu_cluster_resolver = TPUClusterResolver(
+        project=None,
+        zone=None,
+        tpu='test-tpu-1',
+        coordinator_name=None,
+        credentials=None,
+        service=self.mock_service_client(tpu_map=tpu_map))
+
+    with self.assertRaises(RuntimeError):
+      tpu_cluster_resolver.cluster_spec()
+
+  @mock.patch.object(TPUClusterResolver, '_requestComputeMetadata',
+                     mock_request_compute_metadata)
+  def testNotReadyCloudTpu(self):
+    tpu_map = {
+        'projects/test-project/locations/us-central1-c/nodes/test-tpu-1': {
+            'ipAddress': '10.1.2.3',
+            'port': '8470',
+            'state': 'CREATING'
+        }
+    }
+
+    tpu_cluster_resolver = TPUClusterResolver(
+        project=None,
+        zone=None,
+        tpu='test-tpu-1',
+        coordinator_name=None,
+        credentials=None,
+        service=self.mock_service_client(tpu_map=tpu_map))
+
+    with self.assertRaises(RuntimeError):
+      tpu_cluster_resolver.cluster_spec()
+
   def testSimpleSuccessfulRetrieval(self):
     tpu_map = {
         'projects/test-project/locations/us-central1-c/nodes/test-tpu-1': {
-- 
GitLab


From aa7e1b8f9bab47ddbdcae442878d06f4c8562bf9 Mon Sep 17 00:00:00 2001
From: Sanjoy Das <sanjoy@google.com>
Date: Mon, 11 Jun 2018 12:43:42 -0700
Subject: [PATCH 553/610] [TF:XLA] Bump open source llvm revision to r334405

PiperOrigin-RevId: 200096167
---
 tensorflow/workspace.bzl | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 4e2f26e097..7df3d6594b 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -451,11 +451,11 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
   tf_http_archive(
       name = "llvm",
       urls = [
-          "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/42f7ad099aa73695ea633c585da0a9848d6a730d.tar.gz",
-          "https://github.com/llvm-mirror/llvm/archive/42f7ad099aa73695ea633c585da0a9848d6a730d.tar.gz",
+          "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/582e5dd5553e3089fef97f9ab5a3f063e0160fa9.tar.gz",
+          "https://github.com/llvm-mirror/llvm/archive/582e5dd5553e3089fef97f9ab5a3f063e0160fa9.tar.gz",
       ],
-      sha256 = "3a7f1f9c54b51640ba30e40e7e7698bca152e18510001b5a1ad70e8df45e1b05",
-      strip_prefix = "llvm-42f7ad099aa73695ea633c585da0a9848d6a730d",
+      sha256 = "9a0e63469ae5a546e0c84b778955f0febabfc8497d312324546ec7d0db68430e",
+      strip_prefix = "llvm-582e5dd5553e3089fef97f9ab5a3f063e0160fa9",
       build_file = clean_dep("//third_party/llvm:llvm.BUILD"),
   )
 
-- 
GitLab


From 76fc9882aa5d326cb34d0af5b33410e6805c911f Mon Sep 17 00:00:00 2001
From: Benjamin Kramer <kramerb@google.com>
Date: Mon, 11 Jun 2018 12:45:49 -0700
Subject: [PATCH 554/610] [XLA:GPU] Make (r)sqrt emission look through explicit
 broadcasts.

Found by inspection, performance seems neutral.

PiperOrigin-RevId: 200096482
---
 .../compiler/xla/service/gpu/elemental_ir_emitter.cc   | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc
index e5e2a0478a..b812dd7d3f 100644
--- a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc
@@ -53,11 +53,17 @@ using llvm_ir::IrName;
 using llvm_ir::SetToFirstInsertPoint;
 using tensorflow::strings::StrAppend;
 
+namespace {
 // Returns whether operand is a floating-point literal with the given value.
 bool IsFPLiteralWithValue(const HloInstruction* operand, float value) {
-  return operand->opcode() == HloOpcode::kConstant &&
-         operand->literal().IsAllFloat(value);
+  if (operand->opcode() == HloOpcode::kConstant &&
+      operand->literal().IsAllFloat(value)) {
+    return true;
+  }
+  return operand->opcode() == HloOpcode::kBroadcast &&
+         IsFPLiteralWithValue(operand->operand(0), value);
 }
+}  // namespace
 
 GpuElementalIrEmitter::GpuElementalIrEmitter(
     const HloModuleConfig& hlo_module_config, llvm::Module* module,
-- 
GitLab


From 1a45b12b86707c55519c18126b1064a0dd006f3e Mon Sep 17 00:00:00 2001
From: Ilya Biryukov <ibiryukov@google.com>
Date: Mon, 11 Jun 2018 12:54:47 -0700
Subject: [PATCH 555/610] Copy dimensions array into GroupIterable instead of
 storing pointers to it.

This avoids breakages when passing temporary objects, e.g.
  auto it = sparse_tensor.group({0});
  for (auto _ : it) { /* ... */ }

The API was easy to misuse before and this actually causes test failures when
compiling with a new clang version.

PiperOrigin-RevId: 200097909
---
 tensorflow/core/util/sparse/group_iterator.h | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/tensorflow/core/util/sparse/group_iterator.h b/tensorflow/core/util/sparse/group_iterator.h
index c0fce207e7..fb70318078 100644
--- a/tensorflow/core/util/sparse/group_iterator.h
+++ b/tensorflow/core/util/sparse/group_iterator.h
@@ -78,7 +78,10 @@ class GroupIterable {
   typedef gtl::ArraySlice<int64> VarDimArray;
 
   GroupIterable(Tensor ix, Tensor vals, int dims, const VarDimArray& group_dims)
-      : ix_(ix), vals_(vals), dims_(dims), group_dims_(group_dims) {}
+      : ix_(ix),
+        vals_(vals),
+        dims_(dims),
+        group_dims_(group_dims.begin(), group_dims.end()) {}
 
   class IteratorStep;
 
@@ -127,7 +130,7 @@ class GroupIterable {
   Tensor ix_;
   Tensor vals_;
   const int dims_;
-  const VarDimArray group_dims_;
+  const gtl::InlinedVector<int64, 8> group_dims_;
 };
 
 // Implementation of Group::values<T>()
-- 
GitLab


From 3be426254eb8f0066deb0324c5237786045245c1 Mon Sep 17 00:00:00 2001
From: Skye Wanderman-Milne <skyewm@google.com>
Date: Mon, 11 Jun 2018 13:32:25 -0700
Subject: [PATCH 556/610] Make cond_v2 work with no input tensors.

PiperOrigin-RevId: 200103320
---
 .../contrib/control_flow/python/cond_v2_test.py   | 15 +++++++++++++++
 tensorflow/core/ops/functional_ops.cc             |  2 +-
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/control_flow/python/cond_v2_test.py b/tensorflow/contrib/control_flow/python/cond_v2_test.py
index dcecefb520..338601aa2c 100644
--- a/tensorflow/contrib/control_flow/python/cond_v2_test.py
+++ b/tensorflow/contrib/control_flow/python/cond_v2_test.py
@@ -81,6 +81,21 @@ class NewCondTest(test.TestCase):
     self._testCond(true_fn, false_fn, [x, y])
     self._testCond(true_fn, false_fn, [y])
 
+  def testNoInputs(self):
+    pred = array_ops.placeholder(dtypes.bool, name="pred")
+
+    def true_fn():
+      return constant_op.constant(1.0)
+
+    def false_fn():
+      return constant_op.constant(2.0)
+
+    out = cond_v2.cond_v2(pred, true_fn, false_fn)
+
+    with self.test_session() as sess:
+      self.assertEqual(sess.run(out, {pred: True}), [1.0])
+      self.assertEqual(sess.run(out, {pred: False}), [2.0])
+
   def testSecondDerivative(self):
     pred = array_ops.placeholder(dtypes.bool, name="pred")
     x = constant_op.constant(3.0, name="x")
diff --git a/tensorflow/core/ops/functional_ops.cc b/tensorflow/core/ops/functional_ops.cc
index a6cc4b60e5..88553dff93 100644
--- a/tensorflow/core/ops/functional_ops.cc
+++ b/tensorflow/core/ops/functional_ops.cc
@@ -82,7 +82,7 @@ REGISTER_OP("If")
     .Input("input: Tin")
     .Output("output: Tout")
     .Attr("Tcond: type")
-    .Attr("Tin: list(type)")
+    .Attr("Tin: list(type) >= 0")
     .Attr("Tout: list(type)")
     .Attr("then_branch: func")
     .Attr("else_branch: func")
-- 
GitLab


From 0d9b4f06b7242288a3aeb0d29fe10278522c7f45 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 11 Jun 2018 14:10:40 -0700
Subject: [PATCH 557/610] Internal Change.

PiperOrigin-RevId: 200109989
---
 tensorflow/contrib/lite/kernels/fully_connected.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/contrib/lite/kernels/fully_connected.cc b/tensorflow/contrib/lite/kernels/fully_connected.cc
index 989920622d..5a0524bec6 100644
--- a/tensorflow/contrib/lite/kernels/fully_connected.cc
+++ b/tensorflow/contrib/lite/kernels/fully_connected.cc
@@ -105,7 +105,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   const int batch_size = input_size / filter->dims->data[1];
   const int num_units = filter->dims->data[0];
 
-  TF_LITE_ASSERT_EQ(input_size, batch_size * filter->dims->data[1]);
+  TF_LITE_ENSURE_EQ(context, input_size, batch_size * filter->dims->data[1]);
   if (bias) {
     TF_LITE_ENSURE_EQ(context, NumElements(bias), SizeOfDimension(filter, 0));
   }
-- 
GitLab


From 21aa82e1a12eb53fe4c94006f957c1adab9aa662 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 11 Jun 2018 14:10:47 -0700
Subject: [PATCH 558/610] [XLA] Sanitize HloComputation and HloInstruction
 names.

PiperOrigin-RevId: 200110003
---
 .../xla/service/buffer_assignment_test.cc     | 38 +++++++++----------
 .../compiler/xla/service/hlo_computation.cc   |  2 +-
 .../xla/service/hlo_graph_dumper_test.cc      |  2 +-
 .../compiler/xla/service/hlo_instruction.cc   |  4 +-
 .../compiler/xla/service/hlo_instruction.h    |  9 ++++-
 .../xla/service/hlo_instruction_test.cc       |  4 +-
 tensorflow/compiler/xla/service/hlo_module.cc |  2 +-
 tensorflow/compiler/xla/service/hlo_parser.cc |  7 +++-
 .../xla/service/transpose_folding_test.cc     |  2 +-
 9 files changed, 40 insertions(+), 30 deletions(-)

diff --git a/tensorflow/compiler/xla/service/buffer_assignment_test.cc b/tensorflow/compiler/xla/service/buffer_assignment_test.cc
index 7e86c33687..96d25675de 100644
--- a/tensorflow/compiler/xla/service/buffer_assignment_test.cc
+++ b/tensorflow/compiler/xla/service/buffer_assignment_test.cc
@@ -371,11 +371,11 @@ TEST_F(BufferAssignmentTest, Basic) {
   // param1[100] --------------/--------/
   auto builder = HloComputation::Builder(TestName());
   auto paramscalar =
-      builder.AddInstruction(HloInstruction::CreateParameter(0, r0f32_, ""));
+      builder.AddInstruction(HloInstruction::CreateParameter(0, r0f32_, "p"));
   auto param0 = builder.AddInstruction(
-      HloInstruction::CreateParameter(1, f32vec100_, ""));
+      HloInstruction::CreateParameter(1, f32vec100_, "p1"));
   auto param1 = builder.AddInstruction(
-      HloInstruction::CreateParameter(2, f32vec100_, ""));
+      HloInstruction::CreateParameter(2, f32vec100_, "p2"));
   auto mul = builder.AddInstruction(HloInstruction::CreateBinary(
       f32vec100_, HloOpcode::kMultiply, paramscalar, param0));
   auto add = builder.AddInstruction(
@@ -418,11 +418,11 @@ TEST_F(BufferAssignmentTest, BasicUniquelyColored) {
   // share anything.
   auto builder = HloComputation::Builder(TestName());
   auto paramscalar =
-      builder.AddInstruction(HloInstruction::CreateParameter(0, r0f32_, ""));
+      builder.AddInstruction(HloInstruction::CreateParameter(0, r0f32_, "p"));
   auto param0 = builder.AddInstruction(
-      HloInstruction::CreateParameter(1, f32vec100_, ""));
+      HloInstruction::CreateParameter(1, f32vec100_, "p1"));
   auto param1 = builder.AddInstruction(
-      HloInstruction::CreateParameter(2, f32vec100_, ""));
+      HloInstruction::CreateParameter(2, f32vec100_, "p2"));
   auto mul = builder.AddInstruction(HloInstruction::CreateBinary(
       f32vec100_, HloOpcode::kMultiply, paramscalar, param0));
   auto add = builder.AddInstruction(
@@ -477,11 +477,11 @@ TEST_F(BufferAssignmentTest, BasicPartiallyColored) {
   // have the color 0, which allows the mul and add to share buffers.
   auto builder = HloComputation::Builder(TestName());
   auto paramscalar =
-      builder.AddInstruction(HloInstruction::CreateParameter(0, r0f32_, ""));
+      builder.AddInstruction(HloInstruction::CreateParameter(0, r0f32_, "p"));
   auto param0 = builder.AddInstruction(
-      HloInstruction::CreateParameter(1, f32vec100_, ""));
+      HloInstruction::CreateParameter(1, f32vec100_, "p1"));
   auto param1 = builder.AddInstruction(
-      HloInstruction::CreateParameter(2, f32vec100_, ""));
+      HloInstruction::CreateParameter(2, f32vec100_, "p2"));
   auto mul = builder.AddInstruction(HloInstruction::CreateBinary(
       f32vec100_, HloOpcode::kMultiply, paramscalar, param0));
   auto add = builder.AddInstruction(
@@ -547,11 +547,11 @@ TEST_F(BufferAssignmentTest, MultipleUsersForNode) {
   //
   auto builder = HloComputation::Builder(TestName());
   auto paramscalar =
-      builder.AddInstruction(HloInstruction::CreateParameter(0, r0f32_, ""));
+      builder.AddInstruction(HloInstruction::CreateParameter(0, r0f32_, "p"));
   auto param0 = builder.AddInstruction(
-      HloInstruction::CreateParameter(1, f32vec100_, ""));
+      HloInstruction::CreateParameter(1, f32vec100_, "p1"));
   auto param1 = builder.AddInstruction(
-      HloInstruction::CreateParameter(2, f32vec100_, ""));
+      HloInstruction::CreateParameter(2, f32vec100_, "p2"));
   auto mul = builder.AddInstruction(HloInstruction::CreateBinary(
       f32vec100_, HloOpcode::kMultiply, paramscalar, param0));
   auto add = builder.AddInstruction(
@@ -601,7 +601,7 @@ TEST_F(BufferAssignmentTest, TrivialMap) {
   // Creates the main kernel and verifies instruction counts.
   auto builder = HloComputation::Builder(TestName());
   auto param0 = builder.AddInstruction(
-      HloInstruction::CreateParameter(0, f32a100x10_, ""));
+      HloInstruction::CreateParameter(0, f32a100x10_, "p"));
   auto map = builder.AddInstruction(
       HloInstruction::CreateMap(f32a100x10_, {param0}, map_computation));
   module->AddEntryComputation(builder.Build());
@@ -654,7 +654,7 @@ TEST_F(BufferAssignmentTest, CannotReuseInputBufferOfReduce) {
 
   auto builder = HloComputation::Builder(TestName());
   auto param0 = builder.AddInstruction(
-      HloInstruction::CreateParameter(0, f32a100x10_, ""));
+      HloInstruction::CreateParameter(0, f32a100x10_, "p"));
   auto exp1 = builder.AddInstruction(
       HloInstruction::CreateUnary(f32a100x10_, HloOpcode::kExp, param0));
   auto exp2 = builder.AddInstruction(
@@ -818,7 +818,7 @@ TEST_F(BufferAssignmentTest, UnaryOpReuseChain) {
   // param0[100] ---> (exp) ---> (tanh) ---> (exp) ---> (neg)
   auto builder = HloComputation::Builder(TestName());
   auto param0 = builder.AddInstruction(
-      HloInstruction::CreateParameter(0, f32vec100_, ""));
+      HloInstruction::CreateParameter(0, f32vec100_, "p"));
   auto exp1 = builder.AddInstruction(
       HloInstruction::CreateUnary(f32vec100_, HloOpcode::kExp, param0));
   auto tanh = builder.AddInstruction(
@@ -1496,11 +1496,11 @@ TEST_F(BufferAssignmentTest, TrivialPeakBuffers) {
   // param1[100] --------------/--------/
   auto builder = HloComputation::Builder(TestName());
   auto paramscalar =
-      builder.AddInstruction(HloInstruction::CreateParameter(0, r0f32_, ""));
+      builder.AddInstruction(HloInstruction::CreateParameter(0, r0f32_, "p"));
   auto param0 = builder.AddInstruction(
-      HloInstruction::CreateParameter(1, f32vec100_, ""));
+      HloInstruction::CreateParameter(1, f32vec100_, "p1"));
   auto param1 = builder.AddInstruction(
-      HloInstruction::CreateParameter(2, f32vec100_, ""));
+      HloInstruction::CreateParameter(2, f32vec100_, "p2"));
   auto mul = builder.AddInstruction(HloInstruction::CreateBinary(
       f32vec100_, HloOpcode::kMultiply, paramscalar, param0));
   auto add = builder.AddInstruction(
@@ -1536,7 +1536,7 @@ TEST_F(BufferAssignmentTest, PeakBuffers) {
   // be {%rev, %neg, %concat}. This occurs right at the concat itself.
   auto builder = HloComputation::Builder(TestName());
   auto param = builder.AddInstruction(
-      HloInstruction::CreateParameter(0, f32vec100_, ""));
+      HloInstruction::CreateParameter(0, f32vec100_, "p"));
   auto log = builder.AddInstruction(
       HloInstruction::CreateUnary(f32vec100_, HloOpcode::kLog, param));
   auto rev = builder.AddInstruction(
diff --git a/tensorflow/compiler/xla/service/hlo_computation.cc b/tensorflow/compiler/xla/service/hlo_computation.cc
index ed0ea39ff5..763d9d2269 100644
--- a/tensorflow/compiler/xla/service/hlo_computation.cc
+++ b/tensorflow/compiler/xla/service/hlo_computation.cc
@@ -64,7 +64,7 @@ HloComputation::HloComputation(
     const string& name, int parameter_count,
     std::vector<std::unique_ptr<HloInstruction>>* instructions,
     HloInstruction* root_instruction, HloInstruction* fusion_instruction)
-    : name_(name),
+    : name_(NameUniquer::GetSanitizedName(name)),
       unique_id_(-1),
       root_instruction_(root_instruction),
       fusion_instruction_(fusion_instruction) {
diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper_test.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper_test.cc
index 8e52d926d8..68f41a1cbb 100644
--- a/tensorflow/compiler/xla/service/hlo_graph_dumper_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_graph_dumper_test.cc
@@ -121,7 +121,7 @@ TEST(HloGraphDumperTest, Constant) {
   HloComputation::Builder b("b");
   auto instruction = b.AddInstruction(
       HloInstruction::CreateConstant(Literal::CreateR0<float>(-42)));
-  instruction->set_name("i_am_a_constant_root_instruction");
+  instruction->SetAndSanitizeName("i_am_a_constant_root_instruction");
   HloModuleConfig config;
   HloModule m(TestName(), config);
   HloComputation* root_computation = m.AddEntryComputation(b.Build());
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc
index f0fec77c31..c89d836888 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction.cc
@@ -231,7 +231,7 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
   }
 
   TF_RET_CHECK(!proto.name().empty());
-  instruction->name_ = proto.name();
+  instruction->SetAndSanitizeName(proto.name());
 
   instruction->metadata_ = proto.metadata();
   instruction->backend_config_ = proto.backend_config();
@@ -295,7 +295,7 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
   auto instruction =
       WrapUnique(new HloInstruction(HloOpcode::kParameter, shape));
   instruction->parameter_number_ = parameter_number;
-  instruction->name_ = name;
+  instruction->SetAndSanitizeName(name);
   return instruction;
 }
 
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h
index 5c5def58d3..ae1c563b56 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.h
+++ b/tensorflow/compiler/xla/service/hlo_instruction.h
@@ -1364,9 +1364,14 @@ class HloInstruction {
   std::tuple<bool, std::vector<int64>, std::vector<int64>>
   ReshapeMerelyInsertsOrDeletes1SizedDimensions() const;
 
-  // Gets/sets the string identifier for this instruction.
+  // Gets the string identifier for this instruction.
   const string& name() const { return name_; }
-  void set_name(tensorflow::StringPiece name) { name_ = std::string(name); }
+
+  // Sets the string identifier for this instruction. Name will be sanitized to
+  // match the regexp "[a-zA-Z_][a-zA-Z0-9_.-]*".
+  void SetAndSanitizeName(const string& name) {
+    name_ = NameUniquer::GetSanitizedName(name);
+  }
 
   // Use the given NameUniquer to select a unique name for the instruction based
   // on the instruction's existing name.
diff --git a/tensorflow/compiler/xla/service/hlo_instruction_test.cc b/tensorflow/compiler/xla/service/hlo_instruction_test.cc
index 76349c4099..5d6f8b931f 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction_test.cc
@@ -342,7 +342,7 @@ TEST_F(HloInstructionTest, TrivialMap) {
   // Builds a parameter and feeds it to the map.
   HloComputation::Builder builder(TestName());
   auto param0 = builder.AddInstruction(
-      HloInstruction::CreateParameter(0, f32a100x10, ""));
+      HloInstruction::CreateParameter(0, f32a100x10, "p"));
   auto map = builder.AddInstruction(
       HloInstruction::CreateMap(f32a100x10, {param0}, add_f32));
   module->AddEntryComputation(builder.Build());
@@ -381,7 +381,7 @@ TEST_F(HloInstructionTest, TrivialReduce) {
   // Builds a parameter and an initial value and feeds them to the reduce.
   HloComputation::Builder builder(TestName());
   auto param0 = builder.AddInstruction(
-      HloInstruction::CreateParameter(0, f32a100x10, ""));
+      HloInstruction::CreateParameter(0, f32a100x10, "p"));
   auto const0 = builder.AddInstruction(
       HloInstruction::CreateConstant(Literal::CreateR0<float>(0.0f)));
   builder.AddInstruction(
diff --git a/tensorflow/compiler/xla/service/hlo_module.cc b/tensorflow/compiler/xla/service/hlo_module.cc
index ab60258677..9c59374b4a 100644
--- a/tensorflow/compiler/xla/service/hlo_module.cc
+++ b/tensorflow/compiler/xla/service/hlo_module.cc
@@ -390,7 +390,7 @@ HloInstruction* HloModule::OutlineExpressionFromComputation(
         // as a parameter in the new function.
         arguments.push_back(old_operand);
         *operand_slot = builder.AddInstruction(HloInstruction::CreateParameter(
-            parameter_count, old_operand->shape(), ""));
+            parameter_count, old_operand->shape(), "p"));
         ++parameter_count;
       }
       TF_CHECK_OK(
diff --git a/tensorflow/compiler/xla/service/hlo_parser.cc b/tensorflow/compiler/xla/service/hlo_parser.cc
index bf1c7b9323..4aa4406292 100644
--- a/tensorflow/compiler/xla/service/hlo_parser.cc
+++ b/tensorflow/compiler/xla/service/hlo_parser.cc
@@ -1148,7 +1148,12 @@ bool HloParser::ParseInstruction(HloComputation::Builder* builder,
                                HloOpcodeString(opcode)));
   }
 
-  instruction->set_name(name);
+  instruction->SetAndSanitizeName(name);
+  if (instruction->name() != name) {
+    return Error(name_loc,
+                 StrCat("illegal instruction name: ", name,
+                        "; suggest renaming to: ", instruction->name()));
+  }
 
   // Add shared attributes like metadata to the instruction, if they were seen.
   if (sharding) {
diff --git a/tensorflow/compiler/xla/service/transpose_folding_test.cc b/tensorflow/compiler/xla/service/transpose_folding_test.cc
index 3139801ea3..cccb8f2fbb 100644
--- a/tensorflow/compiler/xla/service/transpose_folding_test.cc
+++ b/tensorflow/compiler/xla/service/transpose_folding_test.cc
@@ -176,7 +176,7 @@ TEST_F(TransposeFoldingTest, FuseDotWithConstantOperands) {
   HloComputation* entry_computation =
       module->AddEntryComputation(builder.Build(mul));
   HloInstruction* call = module->OutlineExpressionFromComputation(
-      {add, sub, mul}, "", entry_computation);
+      {add, sub, mul}, "entry", entry_computation);
   EXPECT_EQ(call, entry_computation->root_instruction());
   HloComputation* callee_computation = call->to_apply();
   // The arguments to the call should be const1, const2, and const3.
-- 
GitLab


From 0912bc8cc7f491cdcc5b8a74600292c6e810247b Mon Sep 17 00:00:00 2001
From: Ilya Biryukov <ibiryukov@google.com>
Date: Mon, 11 Jun 2018 14:16:30 -0700
Subject: [PATCH 559/610] Fix 'cc_op_gen' to use static storage for constant
 arrays.

Previously, the generator would emit code like this:
  struct Attrs {
    ArraySlice<int> dilations_ = {1, 1, 1, 1};
  };

This code is incorrect, since the array slice references a temporary object
that dies after initialization finishes.

After this change, the generator will produce static functions to
initialize the values:
  struct Attrs {
    ArraySlice<int> dilations_ = Default_dilations();

  private:
    ArraySlice<int> Default_dilations() {
      static int kStorage[] = {1, 1, 1, 1};
      return ArraySlice<int>(kStorage);
    }
  };

Presumably, it used to work because all compilers chose to use static storage
in those cases anyway. However, new versions of clang tend to miscompile this
code, causing test failures. (This error was found when trying to upgrade our
clang revision from r328903 to r331746).

PiperOrigin-RevId: 200110952
---
 tensorflow/cc/framework/cc_op_gen.cc | 71 ++++++++++++++++++++++++----
 1 file changed, 61 insertions(+), 10 deletions(-)

diff --git a/tensorflow/cc/framework/cc_op_gen.cc b/tensorflow/cc/framework/cc_op_gen.cc
index d6a4f141b6..dfdef88945 100644
--- a/tensorflow/cc/framework/cc_op_gen.cc
+++ b/tensorflow/cc/framework/cc_op_gen.cc
@@ -273,6 +273,12 @@ string PrintAttrValue(const string& op, const AttrValue& attr_value) {
   return "<Unknown AttrValue type>";  // Prevent missing return warning
 }
 
+bool IsEmptyList(const AttrValue::ListValue& list) {
+  return list.s_size() == 0 && list.i_size() == 0 && list.f_size() == 0 &&
+         list.b_size() == 0 && list.type_size() == 0 &&
+         list.shape_size() == 0 && list.tensor_size() == 0;
+}
+
 string ToCamelCase(const string& str) {
   string result;
   const char joiner = '_';
@@ -297,9 +303,9 @@ string ToCamelCase(const string& str) {
 // indicate whether to treat the type as const when accepting the C++ type as an
 // argument to a function.
 std::pair<const char*, bool> AttrTypeName(StringPiece attr_type) {
-  static const std::unordered_map<StringPiece, std::pair<const char*, bool>,
-                                  StringPieceHasher>
-      attr_type_map{
+  static const auto* attr_type_map =
+      new std::unordered_map<StringPiece, std::pair<const char*, bool>,
+                             StringPieceHasher>{
           {"string", {"StringPiece", false}},
           {"list(string)", {"gtl::ArraySlice<string>", true}},
           {"int", {"int64", false}},
@@ -317,14 +323,34 @@ std::pair<const char*, bool> AttrTypeName(StringPiece attr_type) {
           {"func", {"NameAttrList", true}},
       };
 
-  auto entry = attr_type_map.find(attr_type);
-  if (entry == attr_type_map.end()) {
+  auto entry = attr_type_map->find(attr_type);
+  if (entry == attr_type_map->end()) {
     LOG(FATAL) << "Unsupported Attr type: " << attr_type;
     return {"", false};
   }
   return entry->second;
 }
 
+const char* ListElementTypeName(StringPiece attr_type) {
+  static const auto* attr_list_type_map =
+      new std::unordered_map<StringPiece, const char*, StringPieceHasher>{
+          {"list(string)", "string"},
+          {"list(int)", "int"},
+          {"list(float)", "float"},
+          {"list(bool)", "bool"},
+          {"list(type)", "DataType"},
+          {"list(shape)", "PartialTensorShape"},
+          {"list(tensor)", "TensorProto"},
+      };
+
+  auto entry = attr_list_type_map->find(attr_type);
+  if (entry == attr_list_type_map->end()) {
+    LOG(FATAL) << "Unsupported or non-list Attr type: " << attr_type;
+    return "";
+  }
+  return entry->second;
+}
+
 bool IsCPPKeyword(StringPiece name) {
   static const std::unordered_set<StringPiece, StringPieceHasher>
       // Keywords obtained from http://en.cppreference.com/w/cpp/keyword
@@ -668,6 +694,7 @@ OpInfo::OpInfo(const OpDef& graph_op_def, const ApiDef& api_def,
 string OpInfo::GetOpAttrStruct() const {
   string struct_fields;
   string setters;
+  string defaults_static_storage;
 
   for (int i = 0; i < graph_op_def.attr_size(); ++i) {
     const auto& attr(graph_op_def.attr(i));
@@ -705,11 +732,32 @@ string OpInfo::GetOpAttrStruct() const {
                        "_ = x;\n");
     strings::StrAppend(&setters, "      return ret;\n    }\n\n");
 
-    strings::StrAppend(
-        &struct_fields, "    ", attr_type_name, " ", api_def_attr.rename_to(),
-        "_ = ",
-        PrintAttrValue(graph_op_def.name(), api_def_attr.default_value()),
-        ";\n");
+    string field_initiliazer;
+    auto& default_value = api_def_attr.default_value();
+    if (default_value.value_case() == AttrValue::kList &&
+        !IsEmptyList(default_value.list())) {
+      // Non-empty lists need static storage for their defaults. Define a
+      // function with static local variable that stores the array.
+      strings::StrAppend(&defaults_static_storage, "    static ",
+                         attr_type_name, " Default_", api_def_attr.rename_to(),
+                         "() {\n");
+      strings::StrAppend(
+          &defaults_static_storage, "      static const ",
+          ListElementTypeName(attr.type()), " kStorage[] = ",
+          PrintAttrValue(graph_op_def.name(), api_def_attr.default_value()),
+          ";\n");
+      strings::StrAppend(&defaults_static_storage, "      return ",
+                         attr_type_name, "(kStorage);\n    }\n");
+      // Set the field_initializer to call the defined function.
+      strings::StrAppend(&field_initiliazer, "Default_",
+                         api_def_attr.rename_to(), "()");
+    } else {
+      field_initiliazer =
+          PrintAttrValue(graph_op_def.name(), api_def_attr.default_value());
+    }
+    strings::StrAppend(&struct_fields, "    ", attr_type_name, " ",
+                       api_def_attr.rename_to(), "_ = ", field_initiliazer,
+                       ";\n");
   }
 
   if (struct_fields.empty()) {
@@ -721,6 +769,9 @@ string OpInfo::GetOpAttrStruct() const {
   string struct_decl = MakeComment(attrs_comment, "  ");
   strings::StrAppend(&struct_decl, "  struct Attrs {\n");
   strings::StrAppend(&struct_decl, setters, struct_fields);
+  if (!defaults_static_storage.empty()) {
+    strings::StrAppend(&struct_decl, "  private:\n", defaults_static_storage);
+  }
   strings::StrAppend(&struct_decl, "  };\n");
 
   return struct_decl;
-- 
GitLab


From 657d601ec735bbe640a3dac3a9b49e77200eafac Mon Sep 17 00:00:00 2001
From: Benjamin Kramer <kramerb@google.com>
Date: Mon, 11 Jun 2018 14:20:09 -0700
Subject: [PATCH 560/610] [XLA:GPU] Fuse scalar constants

This doesn't change codegen directly, but makes dealing with scalar broadcasts
much easier and the graph easier to read. This required changing the dot *
alpha fusion logic quite a bit, but I think for the better.

The emitter change is a bit of a hack. The more I look at this code the more
broken it seems. Need to find a more sustainable way of emitting what is
essentially a memset.

PiperOrigin-RevId: 200111599
---
 .../xla/service/gpu/instruction_fusion.cc     | 25 ++++++++++-----
 .../service/gpu/instruction_fusion_test.cc    | 31 +++++++++++++++++--
 .../xla/service/gpu/ir_emitter_unnested.cc    | 12 +++++--
 3 files changed, 55 insertions(+), 13 deletions(-)

diff --git a/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc b/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc
index 36a1b82a26..6c4519185b 100644
--- a/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc
+++ b/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc
@@ -77,15 +77,14 @@ bool GpuInstructionFusion::ShouldFuse(HloInstruction* consumer,
   HloInstruction* producer = consumer->mutable_operand(operand_index);
 
   // Check if we can use output fusion for (A @ B) * alpha
-  if (consumer->operand_count() == 2 &&
-      (producer->opcode() == HloOpcode::kDot ||
-       (producer->opcode() == HloOpcode::kFusion &&
-        producer->fused_expression_root()->opcode() == HloOpcode::kDot))) {
+  if (producer->opcode() == HloOpcode::kDot ||
+      (producer->opcode() == HloOpcode::kFusion &&
+       producer->fused_expression_root()->opcode() == HloOpcode::kDot)) {
     int64 other_operand_index = 1 - operand_index;
-    const HloInstruction* alpha = consumer->operand(other_operand_index);
     HloInstruction* op1 = nullptr;
     HloInstruction* op2 = nullptr;
-    if (consumer->opcode() == HloOpcode::kFusion &&
+    if (consumer->operand_count() == 1 &&
+        consumer->opcode() == HloOpcode::kFusion &&
         consumer->fusion_kind() == HloInstruction::FusionKind::kLoop &&
         Match(consumer->fused_expression_root(),
               match::Op()
@@ -103,10 +102,12 @@ bool GpuInstructionFusion::ShouldFuse(HloInstruction* consumer,
           op2->opcode() != HloOpcode::kBroadcast) {
         return false;
       }
-      if (IsIEEEFloatingPointScalarConstant(alpha)) {
+      if (IsIEEEFloatingPointScalarConstant(op2->operand(0))) {
         return true;
       }
-    } else if (consumer->opcode() == HloOpcode::kMultiply) {
+    } else if (consumer->operand_count() == 2 &&
+               consumer->opcode() == HloOpcode::kMultiply) {
+      const HloInstruction* alpha = consumer->operand(other_operand_index);
       // Fuse if 'alpha' is a broadcast of a scalar constant.
       if (alpha->opcode() == HloOpcode::kBroadcast &&
           alpha->dimensions().empty() &&
@@ -173,6 +174,14 @@ bool GpuInstructionFusion::ShouldFuse(HloInstruction* consumer,
     return false;
   }
 
+  // Fuse scalar constants into loop fusion nodes, this reduces the number of
+  // parameters and makes matching scalar broadcasts easier.
+  if (ShapeUtil::IsEffectiveScalar(producer->shape()) &&
+      consumer->opcode() == HloOpcode::kFusion &&
+      producer->opcode() == HloOpcode::kConstant) {
+    return true;
+  }
+
   return IsFusile(*producer) && IsFusile(*consumer) &&
          InstructionFusion::ShouldFuse(consumer, operand_index);
 }
diff --git a/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc b/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc
index 426b1d235c..1963d9eef7 100644
--- a/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc
@@ -168,7 +168,7 @@ TEST_F(InstructionFusionTest, BroadcastIntoReduce) {
   HloInstruction* root = module->entry_computation()->root_instruction();
   EXPECT_THAT(root, op::Fusion());
   EXPECT_THAT(root->fused_expression_root(),
-              op::Reduce(op::Broadcast(op::Parameter()), op::Parameter()));
+              op::Reduce(op::Broadcast(op::Constant()), op::Constant()));
 }
 
 TEST_F(InstructionFusionTest, BitcastIntoAdd) {
@@ -255,7 +255,7 @@ TEST_F(InstructionFusionTest, DotOutputFusion) {
   EXPECT_THAT(
       root->fused_expression_root(),
       op::Multiply(op::Dot(op::Parameter(), op::Transpose(op::Parameter())),
-                   op::Broadcast(op::Parameter())));
+                   op::Broadcast(op::Constant())));
 }
 
 // Compute sum(1/p0), where p0 has type f32, twice.  Check that the division is
@@ -339,7 +339,7 @@ TEST_F(InstructionFusionTest, DotOutputFusionImpossible) {
   EXPECT_EQ(root->fusion_kind(), HloInstruction::FusionKind::kLoop);
   EXPECT_THAT(root->fused_expression_root(),
               op::Multiply(op::Multiply(op::Parameter(), op::Parameter()),
-                           op::Broadcast(op::Parameter())));
+                           op::Broadcast(op::Constant())));
 }
 
 // Counts the HLO ops with a given op code in the specified module.
@@ -581,5 +581,30 @@ TEST_F(InstructionFusionTest, FuseIntoInputFusionInstruction) {
       << module->ToString();
 }
 
+TEST_F(InstructionFusionTest, FuseScalarConstant) {
+  auto module = ParseHloString(R"(
+  HloModule test_module
+
+  ENTRY FuseScalarConstant {
+    p0 = f32[] parameter(0)
+    c0 = f32[] constant(1)
+    add1 = f32[] add(p0, c0)
+    b0 = f32[2]{0} broadcast(add1), dimensions={}
+    c1 = f32[2]{0} constant({1, 2})
+    ROOT add2 = f32[2]{0} add(b0, c1)
+  })")
+                    .ValueOrDie();
+
+  EXPECT_TRUE(GpuInstructionFusion(/*may_duplicate=*/true)
+                  .Run(module.get())
+                  .ValueOrDie());
+
+  HloInstruction* root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root, op::Fusion());
+  EXPECT_THAT(root->fused_expression_root(),
+              op::Add(op::Broadcast(op::Add(op::Parameter(), op::Constant())),
+                      op::Parameter()));
+}
+
 }  // namespace gpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
index a3c1c06cbc..726434c3df 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
@@ -2514,7 +2514,9 @@ std::unique_ptr<Thunk> IrEmitterUnnested::BuildGemmThunk(
     if (alpha->opcode() == HloOpcode::kBroadcast) {
       alpha = alpha->operand(0);
     }
-    alpha = inst->operand(alpha->parameter_number());
+    if (alpha->opcode() == HloOpcode::kParameter) {
+      alpha = inst->operand(alpha->parameter_number());
+    }
     // TODO(b/74185543): Remove the following if block once we support fusion
     // with a non-constant as well. Then we will just always use the constant
     // on the device.
@@ -2560,7 +2562,7 @@ StatusOr<std::unique_ptr<Thunk>> IrEmitterUnnested::BuildInitializerThunk(
     const HloInstruction* hlo, const ShapeIndex& index) {
   bool fused = HloOpcode::kFusion == hlo->opcode();
   const HloInstruction* inst = fused ? hlo->fused_expression_root() : hlo;
-  const HloInstruction* init_value = [&] {
+  const HloInstruction* init_value_operand = [&] {
     switch (inst->opcode()) {
       case HloOpcode::kSelectAndScatter:
         return inst->operand(2);
@@ -2580,6 +2582,7 @@ StatusOr<std::unique_ptr<Thunk>> IrEmitterUnnested::BuildInitializerThunk(
     }
   }();
 
+  const HloInstruction* init_value = init_value_operand;
   if (fused && init_value->opcode() == HloOpcode::kParameter) {
     init_value = hlo->operand(init_value->parameter_number());
   }
@@ -2636,6 +2639,11 @@ StatusOr<std::unique_ptr<Thunk>> IrEmitterUnnested::BuildInitializerThunk(
                                 ir_emitter_context_->device_description());
   UpdateLaunchDimensions(launch_dimensions, kernel_thunk.get(),
                          ir_emitter_context_->llvm_module());
+  // If the init_value was fused into this reduce we have to generate it first.
+  if (fused && init_value_operand->opcode() != HloOpcode::kParameter) {
+    CHECK_EQ(HloOpcode::kConstant, init_value_operand->opcode());
+    TF_RETURN_IF_ERROR(HandleConstant(const_cast<HloInstruction*>(init_value)));
+  }
   TF_RETURN_IF_ERROR(ParallelLoopEmitter(
                          [=](const llvm_ir::IrArray::Index& index) {
                            return GetIrArray(*init_value, *hlo)
-- 
GitLab


From ab51450c817674c8ff08a7ae4f8ac50cdc4bed8b Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 11 Jun 2018 14:37:33 -0700
Subject: [PATCH 561/610] Allow adadelta, adagrad, adam, rmsprop, and
 gradient_descent optimizers to take in callable parameters.

PiperOrigin-RevId: 200114810
---
 tensorflow/python/training/adadelta.py        |  17 ++-
 tensorflow/python/training/adadelta_test.py   | 116 +++++++++++-------
 tensorflow/python/training/adagrad.py         |  12 +-
 tensorflow/python/training/adagrad_test.py    |  73 +++++++----
 tensorflow/python/training/adam.py            |  20 ++-
 tensorflow/python/training/adam_test.py       |  18 ++-
 .../python/training/gradient_descent.py       |  15 ++-
 .../python/training/gradient_descent_test.py  |  26 ++++
 tensorflow/python/training/momentum.py        |   4 +-
 tensorflow/python/training/optimizer.py       |   4 +
 tensorflow/python/training/rmsprop.py         |  22 +++-
 tensorflow/python/training/rmsprop_test.py    |  54 +++++++-
 12 files changed, 284 insertions(+), 97 deletions(-)

diff --git a/tensorflow/python/training/adadelta.py b/tensorflow/python/training/adadelta.py
index c08e3cca00..95eca76496 100644
--- a/tensorflow/python/training/adadelta.py
+++ b/tensorflow/python/training/adadelta.py
@@ -46,6 +46,13 @@ class AdadeltaOptimizer(optimizer.Optimizer):
       use_locking: If `True` use locks for update operations.
       name: Optional name prefix for the operations created when applying
         gradients.  Defaults to "Adadelta".
+
+    @compatibility(eager)
+    When eager execution is enabled, `learning_rate`, `rho`, and `epsilon` can
+    each be a callable that takes no arguments and returns the actual value to
+    use. This can be useful for changing these values across different
+    invocations of optimizer functions.
+    @end_compatibility
     """
     super(AdadeltaOptimizer, self).__init__(use_locking, name)
     self._lr = learning_rate
@@ -63,9 +70,13 @@ class AdadeltaOptimizer(optimizer.Optimizer):
       self._zeros_slot(v, "accum_update", self._name)
 
   def _prepare(self):
-    self._lr_t = ops.convert_to_tensor(self._lr, name="lr")
-    self._rho_t = ops.convert_to_tensor(self._rho, name="rho")
-    self._epsilon_t = ops.convert_to_tensor(self._epsilon, name="epsilon")
+    lr = self._call_if_callable(self._lr)
+    rho = self._call_if_callable(self._rho)
+    epsilon = self._call_if_callable(self._epsilon)
+
+    self._lr_t = ops.convert_to_tensor(lr, name="lr")
+    self._rho_t = ops.convert_to_tensor(rho, name="rho")
+    self._epsilon_t = ops.convert_to_tensor(epsilon, name="epsilon")
 
   def _apply_dense(self, grad, var):
     accum = self.get_slot(var, "accum")
diff --git a/tensorflow/python/training/adadelta_test.py b/tensorflow/python/training/adadelta_test.py
index 50f435236b..2678016d24 100644
--- a/tensorflow/python/training/adadelta_test.py
+++ b/tensorflow/python/training/adadelta_test.py
@@ -20,8 +20,10 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import embedding_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
@@ -32,44 +34,52 @@ from tensorflow.python.training import adadelta
 
 class AdadeltaOptimizerTest(test.TestCase):
 
-  def doTestBasic(self, use_resource=False):
+  def doTestBasic(self, use_resource=False, use_callable_params=False):
     num_updates = 4  # number of ADADELTA steps to perform
     for dtype in [dtypes.half, dtypes.float32]:
       for grad in [0.2, 0.1, 0.01]:
         for lr in [1.0, 0.5, 0.1]:
-          with self.test_session():
-            var0_init = [1.0, 2.0]
-            var1_init = [3.0, 4.0]
-            if use_resource:
-              var0 = resource_variable_ops.ResourceVariable(
-                  var0_init, dtype=dtype)
-              var1 = resource_variable_ops.ResourceVariable(
-                  var1_init, dtype=dtype)
-            else:
-              var0 = variables.Variable(var0_init, dtype=dtype)
-              var1 = variables.Variable(var1_init, dtype=dtype)
-
-            grads = constant_op.constant([grad, grad], dtype=dtype)
-
-            accum = 0.0
-            accum_update = 0.0
-
-            # ADADELTA gradient optimizer
-            rho = 0.95
-            epsilon = 1e-8
-            adadelta_opt = adadelta.AdadeltaOptimizer(lr, rho, epsilon)
+          var0_init = [1.0, 2.0]
+          var1_init = [3.0, 4.0]
+          if use_resource:
+            var0 = resource_variable_ops.ResourceVariable(
+                var0_init, dtype=dtype)
+            var1 = resource_variable_ops.ResourceVariable(
+                var1_init, dtype=dtype)
+          else:
+            var0 = variables.Variable(var0_init, dtype=dtype)
+            var1 = variables.Variable(var1_init, dtype=dtype)
+
+          grads = constant_op.constant([grad, grad], dtype=dtype)
+
+          accum = 0.0
+          accum_update = 0.0
+
+          # ADADELTA gradient optimizer
+          rho = 0.95
+          epsilon = 1e-8
+          if use_callable_params:
+            adadelta_opt = adadelta.AdadeltaOptimizer(
+                learning_rate=lambda: lr,  # pylint: disable=cell-var-from-loop
+                rho=lambda: rho,  # pylint: disable=cell-var-from-loop
+                epsilon=lambda: epsilon)  # pylint: disable=cell-var-from-loop
+          else:
+            adadelta_opt = adadelta.AdadeltaOptimizer(
+                learning_rate=lr, rho=rho, epsilon=epsilon)
+          if not context.executing_eagerly():
             adadelta_update = adadelta_opt.apply_gradients(
                 zip([grads, grads], [var0, var1]))
+            self.evaluate(variables.global_variables_initializer())
 
+            # TODO(lxuechen): This is hard to test in eager mode,
+            # since the optimizer is not fully initialized until the first
+            # call to `apply_gradients`
             opt_vars = adadelta_opt.variables()
             self.assertStartsWith(opt_vars[0].name, var0._shared_name)
             self.assertStartsWith(opt_vars[1].name, var0._shared_name)
             self.assertStartsWith(opt_vars[2].name, var1._shared_name)
             self.assertStartsWith(opt_vars[3].name, var1._shared_name)
             self.assertEqual(4, len(opt_vars))
-
-            variables.global_variables_initializer().run()
-
             # Assign slots
             slot = [None] * 2
             slot_update = [None] * 2
@@ -91,36 +101,42 @@ class AdadeltaOptimizerTest(test.TestCase):
             self.assertEquals(slot_update[1].get_shape(), var1.get_shape())
             self.assertFalse(slot_update[1] in variables.trainable_variables())
 
-            # Fetch params to validate initial values
-            self.assertAllClose(var0_init, var0.eval())
-            self.assertAllClose(var1_init, var1.eval())
-
-            update = [None] * num_updates
-            tot_update = 0
-            for step in range(num_updates):
-              # Run adadelta update for comparison
-              adadelta_update.run()
-
-              # Perform initial update without previous accum values
-              accum = accum * rho + (grad**2) * (1 - rho)
-              update[step] = (np.sqrt(accum_update + epsilon) *
-                              (1. / np.sqrt(accum + epsilon)) * grad)
-              accum_update = (accum_update * rho + (update[step]**2) *
-                              (1.0 - rho))
-              tot_update += update[step] * lr
+          # Fetch params to validate initial values
+          self.assertAllClose(var0_init, self.evaluate(var0))
+          self.assertAllClose(var1_init, self.evaluate(var1))
 
+          update = [None] * num_updates
+          tot_update = 0
+          for step in range(num_updates):
+            # Run adadelta update for comparison
+            if not context.executing_eagerly():
+              self.evaluate(adadelta_update)
+            else:
+              adadelta_opt.apply_gradients(zip([grads, grads], [var0, var1]))
+
+            # Perform initial update without previous accum values
+            accum = accum * rho + (grad**2) * (1 - rho)
+            update[step] = (
+                np.sqrt(accum_update + epsilon) *
+                (1. / np.sqrt(accum + epsilon)) * grad)
+            accum_update = (
+                accum_update * rho + (update[step]**2) * (1.0 - rho))
+            tot_update += update[step] * lr
+
+            if not context.executing_eagerly():
               # Check that the accumulators have been updated
+              # TODO(lxuechen): This is hard to test in eager mode
               for slot_idx in range(2):
                 self.assertAllCloseAccordingToType(
                     np.array([accum, accum], dtype=dtype.as_numpy_dtype()),
-                    slot[slot_idx].eval(),
+                    self.evaluate(slot[slot_idx]),
                     rtol=1e-5)
 
                 self.assertAllCloseAccordingToType(
                     np.array(
                         [accum_update, accum_update],
                         dtype=dtype.as_numpy_dtype()),
-                    slot_update[slot_idx].eval(),
+                    self.evaluate(slot_update[slot_idx]),
                     rtol=1e-5)
 
               # Check that the parameters have been updated
@@ -128,22 +144,28 @@ class AdadeltaOptimizerTest(test.TestCase):
                   np.array(
                       [var0_init[0] - tot_update, var0_init[1] - tot_update],
                       dtype=dtype.as_numpy_dtype()),
-                  var0.eval(),
+                  self.evaluate(var0),
                   rtol=1e-5)
 
               self.assertAllCloseAccordingToType(
                   np.array(
                       [var1_init[0] - tot_update, var1_init[1] - tot_update],
                       dtype=dtype.as_numpy_dtype()),
-                  var1.eval(),
+                  self.evaluate(var1),
                   rtol=1e-5)
 
   def testBasic(self):
-    self.doTestBasic(use_resource=False)
+    with self.test_session():
+      self.doTestBasic(use_resource=False)
 
+  @test_util.run_in_graph_and_eager_modes(reset_test=True)
   def testResourceBasic(self):
     self.doTestBasic(use_resource=True)
 
+  def testBasicCallableParams(self):
+    with context.eager_mode():
+      self.doTestBasic(use_resource=True, use_callable_params=True)
+
   def testMinimizeSparseResourceVariable(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
       with self.test_session():
diff --git a/tensorflow/python/training/adagrad.py b/tensorflow/python/training/adagrad.py
index deb4e6f546..6778f3c735 100644
--- a/tensorflow/python/training/adagrad.py
+++ b/tensorflow/python/training/adagrad.py
@@ -51,6 +51,13 @@ class AdagradOptimizer(optimizer.Optimizer):
 
     Raises:
       ValueError: If the `initial_accumulator_value` is invalid.
+
+    @compatibility(eager)
+    When eager execution is enabled, `learning_rate` can be a callable that
+    takes no arguments and returns the actual value to use. This can be useful
+    for changing these values across different invocations of optimizer
+    functions.
+    @end_compatibility
     """
     if initial_accumulator_value <= 0.0:
       raise ValueError("initial_accumulator_value must be positive: %s" %
@@ -78,8 +85,9 @@ class AdagradOptimizer(optimizer.Optimizer):
                                               "accumulator", self._name)
 
   def _prepare(self):
-    self._learning_rate_tensor = ops.convert_to_tensor(self._learning_rate,
-                                                       name="learning_rate")
+    learning_rate = self._call_if_callable(self._learning_rate)
+    self._learning_rate_tensor = ops.convert_to_tensor(
+        learning_rate, name="learning_rate")
 
   def _apply_dense(self, grad, var):
     acc = self.get_slot(var, "accumulator")
diff --git a/tensorflow/python/training/adagrad_test.py b/tensorflow/python/training/adagrad_test.py
index 15b007b46d..c9aec33d09 100644
--- a/tensorflow/python/training/adagrad_test.py
+++ b/tensorflow/python/training/adagrad_test.py
@@ -20,9 +20,11 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import embedding_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
@@ -34,40 +36,63 @@ from tensorflow.python.training import adagrad
 
 class AdagradOptimizerTest(test.TestCase):
 
-  def doTestBasic(self, use_locking=False, use_resource=False):
+  def doTestBasic(self,
+                  use_locking=False,
+                  use_resource=False,
+                  use_callable_params=False):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
-      with self.test_session():
-        if use_resource:
-          var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
-          var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
-        else:
-          var0 = variables.Variable([1.0, 2.0], dtype=dtype)
-          var1 = variables.Variable([3.0, 4.0], dtype=dtype)
-        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
-        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
-        ada_opt = adagrad.AdagradOptimizer(
-            3.0, initial_accumulator_value=0.1, use_locking=use_locking)
+      if use_resource:
+        var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
+        var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
+      else:
+        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+        var1 = variables.Variable([3.0, 4.0], dtype=dtype)
+      grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+      grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+
+      learning_rate = lambda: 3.0
+      if not use_callable_params:
+        learning_rate = learning_rate()
+
+      ada_opt = adagrad.AdagradOptimizer(
+          learning_rate, initial_accumulator_value=0.1, use_locking=use_locking)
+
+      if not context.executing_eagerly():
         ada_update = ada_opt.apply_gradients(
             zip([grads0, grads1], [var0, var1]))
-        variables.global_variables_initializer().run()
-        # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
-        # Run 3 steps of adagrad
-        for _ in range(3):
-          ada_update.run()
-        # Validate updated params
-        self.assertAllCloseAccordingToType(
-            np.array([-1.6026098728179932, -0.6026098728179932]), var0.eval())
-        self.assertAllCloseAccordingToType(
-            np.array([2.715679168701172, 3.715679168701172]), var1.eval())
+        self.evaluate(variables.global_variables_initializer())
+
+      # Fetch params to validate initial values
+      v0_val, v1_val = self.evaluate([var0, var1])
+      self.assertAllClose([1.0, 2.0], v0_val)
+      self.assertAllClose([3.0, 4.0], v1_val)
+
+      # Run 3 steps of adagrad
+      for _ in range(3):
+        if not context.executing_eagerly():
+          self.evaluate(ada_update)
+        else:
+          ada_opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+
+      # Validate updated params
+      v0_val, v1_val = self.evaluate([var0, var1])
+      self.assertAllCloseAccordingToType(
+          np.array([-1.6026098728179932, -0.6026098728179932]), v0_val)
+      self.assertAllCloseAccordingToType(
+          np.array([2.715679168701172, 3.715679168701172]), v1_val)
 
   def testBasic(self):
     self.doTestBasic(use_locking=False)
 
+  @test_util.run_in_graph_and_eager_modes(reset_test=True)
   def testBasicResource(self):
     self.doTestBasic(use_locking=False, use_resource=True)
 
+  def testBasicCallableParams(self):
+    with context.eager_mode():
+      self.doTestBasic(
+          use_locking=False, use_resource=True, use_callable_params=True)
+
   def testBasicLocked(self):
     self.doTestBasic(use_locking=True)
 
diff --git a/tensorflow/python/training/adam.py b/tensorflow/python/training/adam.py
index 6fa3ff6658..b65c88e972 100644
--- a/tensorflow/python/training/adam.py
+++ b/tensorflow/python/training/adam.py
@@ -85,6 +85,13 @@ class AdamOptimizer(optimizer.Optimizer):
       use_locking: If True use locks for update operations.
       name: Optional name for the operations created when applying gradients.
         Defaults to "Adam".
+
+    @compatibility(eager)
+    When eager execution is enabled, `learning_rate`, `beta1`, `beta2`, and
+    `epsilon` can each be a callable that takes no arguments and returns the
+    actual value to use. This can be useful for changing these values across
+    different invocations of optimizer functions.
+    @end_compatibility
     """
     super(AdamOptimizer, self).__init__(use_locking, name)
     self._lr = learning_rate
@@ -128,10 +135,15 @@ class AdamOptimizer(optimizer.Optimizer):
       self._zeros_slot(v, "v", self._name)
 
   def _prepare(self):
-    self._lr_t = ops.convert_to_tensor(self._lr, name="learning_rate")
-    self._beta1_t = ops.convert_to_tensor(self._beta1, name="beta1")
-    self._beta2_t = ops.convert_to_tensor(self._beta2, name="beta2")
-    self._epsilon_t = ops.convert_to_tensor(self._epsilon, name="epsilon")
+    lr = self._call_if_callable(self._lr)
+    beta1 = self._call_if_callable(self._beta1)
+    beta2 = self._call_if_callable(self._beta2)
+    epsilon = self._call_if_callable(self._epsilon)
+
+    self._lr_t = ops.convert_to_tensor(lr, name="learning_rate")
+    self._beta1_t = ops.convert_to_tensor(beta1, name="beta1")
+    self._beta2_t = ops.convert_to_tensor(beta2, name="beta2")
+    self._epsilon_t = ops.convert_to_tensor(epsilon, name="epsilon")
 
   def _apply_dense(self, grad, var):
     m = self.get_slot(var, "m")
diff --git a/tensorflow/python/training/adam_test.py b/tensorflow/python/training/adam_test.py
index bc68f24c6f..ccdc7e384d 100644
--- a/tensorflow/python/training/adam_test.py
+++ b/tensorflow/python/training/adam_test.py
@@ -150,7 +150,7 @@ class AdamOptimizerTest(test.TestCase):
           self.assertAllClose(aggregated_update_var.eval(),
                               repeated_index_update_var.eval())
 
-  def doTestBasic(self, use_resource=False):
+  def doTestBasic(self, use_resource=False, use_callable_params=False):
     for i, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
       with self.test_session(graph=ops.Graph()):
         # Initialize variables for numpy implementation.
@@ -171,7 +171,17 @@ class AdamOptimizerTest(test.TestCase):
         grads0 = constant_op.constant(grads0_np)
         grads1 = constant_op.constant(grads1_np)
 
-        opt = adam.AdamOptimizer()
+        learning_rate = lambda: 0.001
+        beta1 = lambda: 0.9
+        beta2 = lambda: 0.999
+        epsilon = lambda: 1e-8
+        if not use_callable_params:
+          learning_rate = learning_rate()
+          beta1 = beta1()
+          beta2 = beta2()
+          epsilon = epsilon()
+
+        opt = adam.AdamOptimizer(learning_rate=learning_rate)
         update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
         opt_variables = opt.variables()
         beta1_power, beta2_power = opt._get_beta_accumulators()
@@ -221,6 +231,10 @@ class AdamOptimizerTest(test.TestCase):
   def testResourceBasic(self):
     self.doTestBasic(use_resource=True)
 
+  def testBasicCallableParams(self):
+    with context.eager_mode():
+      self.doTestBasic(use_resource=True, use_callable_params=True)
+
   def testTensorLearningRate(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
       with self.test_session():
diff --git a/tensorflow/python/training/gradient_descent.py b/tensorflow/python/training/gradient_descent.py
index a07ad19a6e..ef50f6315d 100644
--- a/tensorflow/python/training/gradient_descent.py
+++ b/tensorflow/python/training/gradient_descent.py
@@ -18,7 +18,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
@@ -41,6 +40,13 @@ class GradientDescentOptimizer(optimizer.Optimizer):
       use_locking: If True use locks for update operations.
       name: Optional name prefix for the operations created when applying
         gradients. Defaults to "GradientDescent".
+
+    @compatibility(eager)
+    When eager execution is enabled, `learning_rate` can be a callable that
+    takes no arguments and returns the actual value to use. This can be useful
+    for changing these values across different invocations of optimizer
+    functions.
+    @end_compatibility
     """
     super(GradientDescentOptimizer, self).__init__(use_locking, name)
     self._learning_rate = learning_rate
@@ -71,7 +77,6 @@ class GradientDescentOptimizer(optimizer.Optimizer):
     return var.scatter_sub(delta, use_locking=self._use_locking)
 
   def _prepare(self):
-    if not context.executing_eagerly() or not isinstance(
-        self._learning_rate_tensor, ops.EagerTensor):
-      self._learning_rate_tensor = ops.convert_to_tensor(self._learning_rate,
-                                                         name="learning_rate")
+    learning_rate = self._call_if_callable(self._learning_rate)
+    self._learning_rate_tensor = ops.convert_to_tensor(
+        learning_rate, name="learning_rate")
diff --git a/tensorflow/python/training/gradient_descent_test.py b/tensorflow/python/training/gradient_descent_test.py
index f89a9c5838..b304e92421 100644
--- a/tensorflow/python/training/gradient_descent_test.py
+++ b/tensorflow/python/training/gradient_descent_test.py
@@ -83,6 +83,32 @@ class GradientDescentOptimizerTest(test.TestCase):
         self.assertAllCloseAccordingToType([3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01],
                                            var1.eval())
 
+  def testBasicCallableParams(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.test_session():
+        var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
+        var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+        lr = lambda: 3.0
+        sgd_op = gradient_descent.GradientDescentOptimizer(lr).apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+        # TODO(apassos) calling initialize_resources on all resources here
+        # doesn't work because the sessions and graph are reused across unit
+        # tests and this would mean trying to reinitialize variables. Figure out
+        # a long-term solution for this.
+        resources.initialize_resources([var0, var1]).run()
+        # Fetch params to validate initial values
+        self.assertAllCloseAccordingToType([1.0, 2.0], var0.eval())
+        self.assertAllCloseAccordingToType([3.0, 4.0], var1.eval())
+        # Run 1 step of sgd
+        sgd_op.run()
+        # Validate updated params
+        self.assertAllCloseAccordingToType([1.0 - 3.0 * 0.1, 2.0 - 3.0 * 0.1],
+                                           var0.eval())
+        self.assertAllCloseAccordingToType([3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01],
+                                           var1.eval())
+
   def testMinimizeResourceVariable(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
       with self.test_session():
diff --git a/tensorflow/python/training/momentum.py b/tensorflow/python/training/momentum.py
index bd9fa79d8f..cb3ec6f053 100644
--- a/tensorflow/python/training/momentum.py
+++ b/tensorflow/python/training/momentum.py
@@ -61,8 +61,8 @@ class MomentumOptimizer(optimizer.Optimizer):
         variable(s) track the values called `theta_t + mu*v_t` in the paper.
 
     @compatibility(eager)
-    When eager execution is enabled, learning_rate and momentum can each be a
-    callable that takes no arguments and returns the actual value to use. This
+    When eager execution is enabled, `learning_rate` and `momentum` can each be
+    a callable that takes no arguments and returns the actual value to use. This
     can be useful for changing these values across different invocations of
     optimizer functions.
     @end_compatibility
diff --git a/tensorflow/python/training/optimizer.py b/tensorflow/python/training/optimizer.py
index a9287a0f0d..cae29eea93 100644
--- a/tensorflow/python/training/optimizer.py
+++ b/tensorflow/python/training/optimizer.py
@@ -1211,3 +1211,7 @@ class Optimizer(
       self._deferred_slot_restorations.setdefault(
           slot_name, {}).setdefault(variable_key, []).append(
               slot_variable_position)
+
+  def _call_if_callable(self, param):
+    """Call the function if param is callable."""
+    return param() if callable(param) else param
diff --git a/tensorflow/python/training/rmsprop.py b/tensorflow/python/training/rmsprop.py
index 341b970c92..f38c9861d6 100644
--- a/tensorflow/python/training/rmsprop.py
+++ b/tensorflow/python/training/rmsprop.py
@@ -92,6 +92,13 @@ class RMSPropOptimizer(optimizer.Optimizer):
         computation and memory. Defaults to False.
       name: Optional name prefix for the operations created when applying
         gradients. Defaults to "RMSProp".
+
+    @compatibility(eager)
+    When eager execution is enabled, `learning_rate`, `decay`, `momentum`, and
+    `epsilon` can each be a callable that takes no arguments and returns the
+    actual value to use. This can be useful for changing these values across
+    different invocations of optimizer functions.
+    @end_compatibility
     """
     super(RMSPropOptimizer, self).__init__(use_locking, name)
     self._learning_rate = learning_rate
@@ -120,12 +127,15 @@ class RMSPropOptimizer(optimizer.Optimizer):
       self._zeros_slot(v, "momentum", self._name)
 
   def _prepare(self):
-    self._learning_rate_tensor = ops.convert_to_tensor(
-        self._learning_rate, name="learning_rate")
-    self._decay_tensor = ops.convert_to_tensor(self._decay, name="decay")
-    self._momentum_tensor = ops.convert_to_tensor(
-        self._momentum, name="momentum")
-    self._epsilon_tensor = ops.convert_to_tensor(self._epsilon, name="epsilon")
+    lr = self._call_if_callable(self._learning_rate)
+    decay = self._call_if_callable(self._decay)
+    momentum = self._call_if_callable(self._momentum)
+    epsilon = self._call_if_callable(self._epsilon)
+
+    self._learning_rate_tensor = ops.convert_to_tensor(lr, name="learning_rate")
+    self._decay_tensor = ops.convert_to_tensor(decay, name="decay")
+    self._momentum_tensor = ops.convert_to_tensor(momentum, name="momentum")
+    self._epsilon_tensor = ops.convert_to_tensor(epsilon, name="epsilon")
 
   def _apply_dense(self, grad, var):
     rms = self.get_slot(var, "rms")
diff --git a/tensorflow/python/training/rmsprop_test.py b/tensorflow/python/training/rmsprop_test.py
index ee5385596c..6043327384 100644
--- a/tensorflow/python/training/rmsprop_test.py
+++ b/tensorflow/python/training/rmsprop_test.py
@@ -24,6 +24,7 @@ import math
 
 import numpy as np
 
+from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -141,7 +142,7 @@ class RMSPropOptimizerTest(test.TestCase):
         self.assertAllClose([3.0, 4.0], var1.eval())
 
         # Run 4 steps of RMSProp
-        for t in range(1, 5):
+        for _ in range(1, 5):
           update.run()
 
           var0_np, mg0_np, rms0_np, mom0_np = self._rmsprop_update_numpy(
@@ -261,7 +262,7 @@ class RMSPropOptimizerTest(test.TestCase):
         self.assertAllClose([3.0, 4.0], var1.eval())
 
         # Run 4 steps of RMSProp
-        for t in range(1, 5):
+        for _ in range(1, 5):
           update.run()
 
           var0_np, mg0_np, rms0_np, mom0_np = self._sparse_rmsprop_update_numpy(
@@ -444,6 +445,55 @@ class RMSPropOptimizerTest(test.TestCase):
                  (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 2e-5)))
             ]), var1.eval())
 
+  def testCallableParams(self):
+    with context.eager_mode():
+      for dtype in [dtypes.half, dtypes.float32]:
+        var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
+        var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+
+        learning_rate = lambda: 2.0
+        decay = lambda: 0.9
+        momentum = lambda: 0.0
+        epsilon = lambda: 1.0
+        opt = rmsprop.RMSPropOptimizer(learning_rate, decay, momentum, epsilon)
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
+        # Step 1: the rms accumulators were 1. So we should see a normal
+        # update: v -= grad * learning_rate
+        opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        # Check the parameters.
+        self.assertAllCloseAccordingToType(
+            np.array([
+                1.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1.0)),
+                2.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1.0))
+            ]), self.evaluate(var0))
+        self.assertAllCloseAccordingToType(
+            np.array([
+                3.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1.0)),
+                4.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1.0))
+            ]), self.evaluate(var1))
+        # Step 2: the root mean square accumulators contain the previous update.
+        opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        # Check the parameters.
+        self.assertAllCloseAccordingToType(
+            np.array([
+                1.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1.0)) -
+                (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001 + 1.0)),
+                2.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1.0)) -
+                (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001 + 1.0))
+            ]), self.evaluate(var0))
+        self.assertAllCloseAccordingToType(
+            np.array([
+                3.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1.0)) -
+                (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 1e-5 + 1.0)),
+                4.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1.0)) -
+                (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 1e-5 + 1.0))
+            ]), self.evaluate(var1))
+
 
 if __name__ == "__main__":
   test.main()
-- 
GitLab


From 3216ba10048efede648054b4a9627ce194aec1d1 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 11 Jun 2018 14:55:15 -0700
Subject: [PATCH 562/610] While the DNN is training, use the DNN logits for
 evaluation.

PiperOrigin-RevId: 200117729
---
 .../estimator_batch/dnn_tree_combined_estimator.py        | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator.py b/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator.py
index 758754feac..911d87fa10 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator.py
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator.py
@@ -232,7 +232,13 @@ def _dnn_tree_combined_model_fn(features,
         return update_op
 
   if predict_with_tree_only:
-    tree_train_logits = tree_logits
+    if mode == model_fn.ModeKeys.TRAIN or mode == model_fn.ModeKeys.PREDICT:
+      tree_train_logits = tree_logits
+    else:
+      tree_train_logits = control_flow_ops.cond(
+          global_step > dnn_steps_to_train,
+          lambda: tree_logits,
+          lambda: dnn_logits)
   else:
     tree_train_logits = dnn_logits + tree_logits
 
-- 
GitLab


From f4f92acbcd0994299882260fe4f4896385e6bff9 Mon Sep 17 00:00:00 2001
From: Suharsh Sivakumar <suharshs@google.com>
Date: Mon, 11 Jun 2018 14:59:42 -0700
Subject: [PATCH 563/610] SpaceToBatchND supports quantization, so make the
 transformation know that.

#19735

PiperOrigin-RevId: 200118450
---
 tensorflow/contrib/lite/toco/graph_transformations/quantize.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc b/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc
index ab24c4f996..d4b5920760 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc
@@ -51,6 +51,7 @@ bool SupportsQuantization(const Operator& op) {
          type == OperatorType::kPadV2 ||
          type == OperatorType::kTensorFlowReshape ||
          type == OperatorType::kTanh || type == OperatorType::kMul ||
+         type == OperatorType::kSpaceToBatchND ||
          type == OperatorType::kSpaceToDepth ||
          type == OperatorType::kStridedSlice ||
          type == OperatorType::kDepthToSpace ||
-- 
GitLab


From dfc2a6bad7d6f8b71bc4fbb65c0373c69f56b7b0 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 11 Jun 2018 15:06:49 -0700
Subject: [PATCH 564/610] Make test_locallyconnected_2d_channels_first run in
 graph and eager modes.

PiperOrigin-RevId: 200119934
---
 tensorflow/python/keras/layers/local_test.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/tensorflow/python/keras/layers/local_test.py b/tensorflow/python/keras/layers/local_test.py
index 9123d449af..8df3f6b7bd 100644
--- a/tensorflow/python/keras/layers/local_test.py
+++ b/tensorflow/python/keras/layers/local_test.py
@@ -118,6 +118,7 @@ class LocallyConnectedLayersTest(test.TestCase):
             },
             input_shape=(num_samples, num_row, num_col, stack_size))
 
+  @tf_test_util.run_in_graph_and_eager_modes()
   def test_locallyconnected_2d_channels_first(self):
     num_samples = 8
     filters = 3
@@ -125,15 +126,14 @@ class LocallyConnectedLayersTest(test.TestCase):
     num_row = 6
     num_col = 10
 
-    with self.test_session():
-      testing_utils.layer_test(
-          keras.layers.LocallyConnected2D,
-          kwargs={
-              'filters': filters,
-              'kernel_size': 3,
-              'data_format': 'channels_first'
-          },
-          input_shape=(num_samples, num_row, num_col, stack_size))
+    testing_utils.layer_test(
+        keras.layers.LocallyConnected2D,
+        kwargs={
+            'filters': filters,
+            'kernel_size': 3,
+            'data_format': 'channels_first'
+        },
+        input_shape=(num_samples, num_row, num_col, stack_size))
 
   def test_locallyconnected_2d_regularization(self):
     num_samples = 8
-- 
GitLab


From 0472c89ed62a46a2e86d608a30e4e57c09c40da1 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 11 Jun 2018 15:18:36 -0700
Subject: [PATCH 565/610] Update ops-related pbtxt files.

PiperOrigin-RevId: 200122052
---
 .../core/ops/compat/ops_history.v1.pbtxt      | 38 +++++++++++++++++++
 tensorflow/core/ops/ops.pbtxt                 |  1 -
 2 files changed, 38 insertions(+), 1 deletion(-)

diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index 8f8c90ee97..b48686d9a3 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -25468,6 +25468,44 @@ op {
     type: "func"
   }
 }
+op {
+  name: "If"
+  input_arg {
+    name: "cond"
+    type_attr: "Tcond"
+  }
+  input_arg {
+    name: "input"
+    type_list_attr: "Tin"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "Tcond"
+    type: "type"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "then_branch"
+    type: "func"
+  }
+  attr {
+    name: "else_branch"
+    type: "func"
+  }
+}
 op {
   name: "Igamma"
   input_arg {
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index d3f3e87dfd..dd3a6cd22c 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -12358,7 +12358,6 @@ op {
     name: "Tin"
     type: "list(type)"
     has_minimum: true
-    minimum: 1
   }
   attr {
     name: "Tout"
-- 
GitLab


From b12f58cfcf37cf8f20d3b6c0c7e9fdfb5ec54614 Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Mon, 11 Jun 2018 15:30:58 -0700
Subject: [PATCH 566/610] [tf.data] Improve the error messages for
 `Dataset.from_generator()`.

In particular:
* Improve the error message when the generator yields something with the wrong
  structure.
* Improve the error message when the generator yields something with the wrong
  element type.

PiperOrigin-RevId: 200124096
---
 .../dataset_from_generator_op_test.py         | 32 +++++++++++++++++--
 tensorflow/python/data/ops/dataset_ops.py     | 24 ++++++++++----
 2 files changed, 46 insertions(+), 10 deletions(-)

diff --git a/tensorflow/python/data/kernel_tests/dataset_from_generator_op_test.py b/tensorflow/python/data/kernel_tests/dataset_from_generator_op_test.py
index 296a76ec88..fb55ae1400 100644
--- a/tensorflow/python/data/kernel_tests/dataset_from_generator_op_test.py
+++ b/tensorflow/python/data/kernel_tests/dataset_from_generator_op_test.py
@@ -259,9 +259,7 @@ class DatasetConstructorTest(test.TestCase):
       sess.run(init_op)
       self.assertAllEqual([1, 2, 3], sess.run(get_next))
       self.assertAllEqual([4, 5, 6], sess.run(get_next))
-      # NOTE(mrry): Type name in message differs between Python 2 (`long`) and
-      # 3 (`int`).
-      with self.assertRaisesOpError(r"invalid literal for"):
+      with self.assertRaisesOpError("The expected type was int64"):
         sess.run(get_next)
       self.assertAllEqual([7, 8, 9], sess.run(get_next))
       with self.assertRaises(errors.OutOfRangeError):
@@ -290,6 +288,34 @@ class DatasetConstructorTest(test.TestCase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
+  def testFromGeneratorStructureError(self):
+    def generator():
+      yield 1, 2
+      yield 3, 4
+      yield 5
+      yield 6, 7, 8
+      yield 9, 10
+
+    iterator = (dataset_ops.Dataset.from_generator(
+        generator, output_types=(dtypes.int64, dtypes.int64))
+                .make_initializable_iterator())
+    init_op = iterator.initializer
+    get_next = iterator.get_next()
+
+    with self.test_session() as sess:
+      sess.run(init_op)
+      self.assertEqual((1, 2), sess.run(get_next))
+      self.assertEqual((3, 4), sess.run(get_next))
+      with self.assertRaisesOpError(
+          r"The expected structure was \(tf\.int64, tf\.int64\)"):
+        sess.run(get_next)
+      with self.assertRaisesOpError(
+          r"The expected structure was \(tf\.int64, tf\.int64\)"):
+        sess.run(get_next)
+      self.assertEqual((9, 10), sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
   def testFromGeneratorHeterogeneous(self):
     def generator():
       yield 1
diff --git a/tensorflow/python/data/ops/dataset_ops.py b/tensorflow/python/data/ops/dataset_ops.py
index 672ce014f6..597f92048e 100644
--- a/tensorflow/python/data/ops/dataset_ops.py
+++ b/tensorflow/python/data/ops/dataset_ops.py
@@ -409,13 +409,23 @@ class Dataset(object):
         # Use the same _convert function from the py_func() implementation to
         # convert the returned values to arrays early, so that we can inspect
         # their values.
-        # pylint: disable=protected-access
-        ret_arrays = [
-            script_ops.FuncRegistry._convert(ret, dtype=dtype.as_numpy_dtype)
-            for ret, dtype in zip(
-                nest.flatten_up_to(output_types, values), flattened_types)
-        ]
-        # pylint: enable=protected-access
+        try:
+          flattened_values = nest.flatten_up_to(output_types, values)
+        except (TypeError, ValueError):
+          raise TypeError(
+              "`generator` yielded an element that did not match the expected "
+              "structure. The expected structure was %s, but the yielded "
+              "element was %s." % (output_types, values))
+        ret_arrays = []
+        for ret, dtype in zip(flattened_values, flattened_types):
+          try:
+            ret_arrays.append(script_ops.FuncRegistry._convert(  # pylint: disable=protected-access
+                ret, dtype=dtype.as_numpy_dtype))
+          except (TypeError, ValueError):
+            raise TypeError(
+                "`generator` yielded an element that could not be converted to "
+                "the expected type. The expected type was %s, but the yielded "
+                "element was %s." % (dtype.name, ret))
 
         # Additional type and shape checking to ensure that the components
         # of the generated element match the `output_types` and `output_shapes`
-- 
GitLab


From 49ed096fb3f89855dbdccf183d10d8068324f1c2 Mon Sep 17 00:00:00 2001
From: Shanqing Cai <cais@google.com>
Date: Mon, 11 Jun 2018 15:57:39 -0700
Subject: [PATCH 567/610] Improve tfdbg's handling of runtime errors

* In some cases the RuntimeError object (tf_error in cli_shared.py) doesn't have
  the op or its name available. Handle that situation properly.
* Previously, we used client graphs in the debugger CLI whenever they were
  available. This has caused issues in which the device names mismatch
  (e.g., "/device:GPU:0" vs "/job:localhost/replica:0/task:0/device:CPU:0").
  This CL fixes that by favoring the runtime graphs on the disk over the client
  graphs. The former have the actual device names.
  Use the latter only if the former aren't available for some reason (e.g.,
  writing the graphs to the disk failed).

PiperOrigin-RevId: 200128582
---
 tensorflow/python/debug/cli/cli_shared.py     | 44 +++++++++++--------
 .../python/debug/cli/cli_shared_test.py       |  5 +++
 .../python/debug/examples/examples_test.sh    |  6 +++
 tensorflow/python/debug/lib/debug_data.py     | 43 +++++++++---------
 4 files changed, 58 insertions(+), 40 deletions(-)

diff --git a/tensorflow/python/debug/cli/cli_shared.py b/tensorflow/python/debug/cli/cli_shared.py
index dea019fef5..6a368682de 100644
--- a/tensorflow/python/debug/cli/cli_shared.py
+++ b/tensorflow/python/debug/cli/cli_shared.py
@@ -451,42 +451,48 @@ def get_error_intro(tf_error):
       sample commands for debugging.
   """
 
-  op_name = tf_error.op.name
+  if hasattr(tf_error, "op") and hasattr(tf_error.op, "name"):
+    op_name = tf_error.op.name
+  else:
+    op_name = None
 
   intro_lines = [
       "--------------------------------------",
       RL("!!! An error occurred during the run !!!", "blink"),
       "",
-      "You may use the following commands to debug:",
   ]
 
   out = debugger_cli_common.rich_text_lines_from_rich_line_list(intro_lines)
 
-  out.extend(
-      _recommend_command("ni -a -d -t %s" % op_name,
-                         "Inspect information about the failing op.",
-                         create_link=True))
-  out.extend(
-      _recommend_command("li -r %s" % op_name,
-                         "List inputs to the failing op, recursively.",
-                         create_link=True))
-
-  out.extend(
-      _recommend_command(
-          "lt",
-          "List all tensors dumped during the failing run() call.",
-          create_link=True))
+  if op_name is not None:
+    out.extend(debugger_cli_common.RichTextLines(
+        ["You may use the following commands to debug:"]))
+    out.extend(
+        _recommend_command("ni -a -d -t %s" % op_name,
+                           "Inspect information about the failing op.",
+                           create_link=True))
+    out.extend(
+        _recommend_command("li -r %s" % op_name,
+                           "List inputs to the failing op, recursively.",
+                           create_link=True))
+
+    out.extend(
+        _recommend_command(
+            "lt",
+            "List all tensors dumped during the failing run() call.",
+            create_link=True))
+  else:
+    out.extend(debugger_cli_common.RichTextLines([
+        "WARNING: Cannot determine the name of the op that caused the error."]))
 
   more_lines = [
       "",
-      "Op name:    " + op_name,
+      "Op name:    %s" % op_name,
       "Error type: " + str(type(tf_error)),
       "",
       "Details:",
       str(tf_error),
       "",
-      "WARNING: Using client GraphDef due to the error, instead of "
-      "executor GraphDefs.",
       "--------------------------------------",
       "",
   ]
diff --git a/tensorflow/python/debug/cli/cli_shared_test.py b/tensorflow/python/debug/cli/cli_shared_test.py
index 3d7939490d..07b364db9f 100644
--- a/tensorflow/python/debug/cli/cli_shared_test.py
+++ b/tensorflow/python/debug/cli/cli_shared_test.py
@@ -372,6 +372,11 @@ class GetErrorIntroTest(test_util.TensorFlowTestCase):
     self.assertEqual("Details:", error_intro.lines[14])
     self.assertStartsWith(error_intro.lines[15], "foo description")
 
+  def testGetErrorIntroForNoOpName(self):
+    tf_error = errors.OpError(None, None, "Fake OpError", -1)
+    error_intro = cli_shared.get_error_intro(tf_error)
+    self.assertIn("Cannot determine the name of the op", error_intro.lines[3])
+
 
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/python/debug/examples/examples_test.sh b/tensorflow/python/debug/examples/examples_test.sh
index 2df6c0b6a2..e9c45a7e6e 100755
--- a/tensorflow/python/debug/examples/examples_test.sh
+++ b/tensorflow/python/debug/examples/examples_test.sh
@@ -69,6 +69,12 @@ run
 exit
 EOF
 
+cat << EOF | ${DEBUG_ERRORS_BIN} --error=uninitialized_variable --debug --ui_type=readline
+run
+ni -a -d -t v/read
+exit
+EOF
+
 cat << EOF | ${DEBUG_MNIST_BIN} --debug --max_steps=1 --fake_data --ui_type=readline
 run -t 1
 run --node_name_filter hidden --op_type_filter MatMul
diff --git a/tensorflow/python/debug/lib/debug_data.py b/tensorflow/python/debug/lib/debug_data.py
index 8a65ad087b..7c96c2878c 100644
--- a/tensorflow/python/debug/lib/debug_data.py
+++ b/tensorflow/python/debug/lib/debug_data.py
@@ -748,7 +748,7 @@ class DebugDumpDir(object):
     return sum(len(self._dump_tensor_data[device_name])
                for device_name in self._dump_tensor_data)
 
-  def _load_partition_graphs(self, partition_graphs, validate):
+  def _load_partition_graphs(self, client_partition_graphs, validate):
     """Load and process partition graphs.
 
     Load the graphs; parse the input and control input structure; obtain the
@@ -757,8 +757,10 @@ class DebugDumpDir(object):
     tensor dumps.
 
     Args:
-      partition_graphs: A repeated field of GraphDefs representing the
-          partition graphs executed by the TensorFlow runtime.
+      client_partition_graphs: A repeated field of GraphDefs representing the
+        partition graphs executed by the TensorFlow runtime, from the Python
+        client. These partition graphs are used only if partition graphs
+        cannot be loaded from the dump directory on the file system.
       validate: (`bool`) Whether the dump files are to be validated against the
         partition graphs.
 
@@ -769,24 +771,23 @@ class DebugDumpDir(object):
     self._debug_graphs = {}
     self._node_devices = {}
 
-    if partition_graphs:
-      partition_graphs_and_device_names = [
-          (partition_graph, None) for partition_graph in partition_graphs]
-    else:
-      partition_graphs_and_device_names = []
-      for device_name in self._device_names:
-        partition_graph = None
-        if device_name in self._dump_graph_file_paths:
-          partition_graph = _load_graph_def_from_event_file(
-              self._dump_graph_file_paths[device_name])
-        else:
-          partition_graph = self._find_partition_graph(partition_graphs,
-                                                       device_name)
-        if partition_graph:
-          partition_graphs_and_device_names.append((partition_graph,
-                                                    device_name))
-        else:
-          logging.warn("Failed to load partition graphs from disk.")
+    partition_graphs_and_device_names = []
+    for device_name in self._device_names:
+      partition_graph = None
+      if device_name in self._dump_graph_file_paths:
+        partition_graph = _load_graph_def_from_event_file(
+            self._dump_graph_file_paths[device_name])
+      else:
+        logging.warn(
+            "Failed to load partition graphs for device %s from disk. "
+            "As a fallback, the client graphs will be used. This "
+            "may cause mismatches in device names." % device_name)
+        partition_graph = self._find_partition_graph(client_partition_graphs,
+                                                     device_name)
+
+      if partition_graph:
+        partition_graphs_and_device_names.append((partition_graph,
+                                                  device_name))
 
     for partition_graph, maybe_device_name in partition_graphs_and_device_names:
       debug_graph = debug_graphs.DebugGraph(partition_graph,
-- 
GitLab


From a1244d61b1bf9db586ad12fb12b65d2db3913e45 Mon Sep 17 00:00:00 2001
From: Akshay Modi <nareshmodi@google.com>
Date: Mon, 11 Jun 2018 16:04:33 -0700
Subject: [PATCH 568/610] Allow silent copies during remote execution.

This is required to do anything useful from python.

PiperOrigin-RevId: 200129777
---
 tensorflow/c/eager/c_api_test.cc              |  81 +++++++-
 .../core/common_runtime/eager/execute.cc      | 191 ++++++++++--------
 2 files changed, 184 insertions(+), 88 deletions(-)

diff --git a/tensorflow/c/eager/c_api_test.cc b/tensorflow/c/eager/c_api_test.cc
index 27ff5f7211..992d1afd5f 100644
--- a/tensorflow/c/eager/c_api_test.cc
+++ b/tensorflow/c/eager/c_api_test.cc
@@ -142,8 +142,10 @@ void TestRemoteExecute(bool async) {
   TFE_ContextOptions* opts = TFE_NewContextOptions();
   TFE_ContextOptionsSetServerDef(opts, serialized.data(), serialized.size(),
                                  status);
-  TFE_ContextOptionsSetAsync(opts, static_cast<unsigned char>(1));
   EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_ContextOptionsSetAsync(opts, static_cast<unsigned char>(1));
+  TFE_ContextOptionsSetDevicePlacementPolicy(opts,
+                                             TFE_DEVICE_PLACEMENT_EXPLICIT);
   TFE_Context* ctx = TFE_NewContext(opts, status);
   EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
   TFE_DeleteContextOptions(opts);
@@ -205,6 +207,83 @@ void TestRemoteExecute(bool async) {
 TEST(CAPI, RemoteExecute) { TestRemoteExecute(false); }
 TEST(CAPI, RemoteExecuteAsync) { TestRemoteExecute(true); }
 
+void TestRemoteExecuteSilentCopies(bool async) {
+  tensorflow::ServerDef server_def = GetServerDef(2);
+
+  // This server def has the task index set to 0.
+  string serialized = server_def.SerializeAsString();
+
+  server_def.set_task_index(1);
+
+  std::unique_ptr<tensorflow::eager::EagerGrpcServer> worker_server;
+  ASSERT_TRUE(
+      tensorflow::eager::EagerGrpcServer::Create(server_def, &worker_server)
+          .ok());
+  ASSERT_TRUE(worker_server->Start().ok());
+
+  TF_Status* status = TF_NewStatus();
+  TFE_ContextOptions* opts = TFE_NewContextOptions();
+  TFE_ContextOptionsSetServerDef(opts, serialized.data(), serialized.size(),
+                                 status);
+  EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_ContextOptionsSetAsync(opts, static_cast<unsigned char>(1));
+  TFE_ContextOptionsSetDevicePlacementPolicy(opts, TFE_DEVICE_PLACEMENT_SILENT);
+  TFE_Context* ctx = TFE_NewContext(opts, status);
+  EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_DeleteContextOptions(opts);
+
+  TFE_TensorHandle* h0_task0 = TestMatrixTensorHandle();
+  TFE_TensorHandle* h1_task0 = TestMatrixTensorHandle();
+  const char remote_device_name[] =
+      "/job:localhost/replica:0/task:1/device:CPU:0";
+
+  // Handles are on task0, but op is on remote (task1).
+  TFE_Op* matmul = MatMulOp(ctx, h0_task0, h1_task0);
+  TFE_OpSetDevice(matmul, remote_device_name, status);
+  EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+
+  TFE_TensorHandle* retvals[1];
+  int num_retvals = 1;
+  TFE_Execute(matmul, &retvals[0], &num_retvals, status);
+  EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+
+  auto* retval_task0 = TFE_TensorHandleCopyToDevice(
+      retvals[0], ctx, "/job:localhost/replica:0/task:0/device:CPU:0", status);
+  ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+
+  TF_Tensor* t = TFE_TensorHandleResolve(retval_task0, status);
+  ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_DeleteTensorHandle(retval_task0);
+  float product[4] = {0};
+  EXPECT_EQ(sizeof(product), TF_TensorByteSize(t));
+  memcpy(&product[0], TF_TensorData(t), TF_TensorByteSize(t));
+  TF_DeleteTensor(t);
+  EXPECT_EQ(7, product[0]);
+  EXPECT_EQ(10, product[1]);
+  EXPECT_EQ(15, product[2]);
+  EXPECT_EQ(22, product[3]);
+
+  TFE_DeleteTensorHandle(h0_task0);
+  TFE_DeleteTensorHandle(h1_task0);
+  TFE_DeleteTensorHandle(retvals[0]);
+
+  TFE_DeleteOp(matmul);
+
+  TFE_ContextAsyncWait(ctx, status);
+  TFE_DeleteContext(ctx, status);
+  EXPECT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+
+  TF_DeleteStatus(status);
+
+  // TODO(nareshmodi): Figure out how to correctly shut the server down.
+  worker_server.release();
+}
+
+TEST(CAPI, RemoteExecuteSilentCopies) { TestRemoteExecuteSilentCopies(false); }
+TEST(CAPI, RemoteExecuteSilentCopiesAsync) {
+  TestRemoteExecuteSilentCopies(true);
+}
+
 TEST(CAPI, TensorHandle) {
   TFE_TensorHandle* h = TestMatrixTensorHandle();
   EXPECT_EQ(TF_FLOAT, TFE_TensorHandleDataType(h));
diff --git a/tensorflow/core/common_runtime/eager/execute.cc b/tensorflow/core/common_runtime/eager/execute.cc
index ce989f4b4e..c619857b78 100644
--- a/tensorflow/core/common_runtime/eager/execute.cc
+++ b/tensorflow/core/common_runtime/eager/execute.cc
@@ -66,6 +66,88 @@ int StepStatsDeviceIndex(StepStats* step_stats, EagerContext* ctx,
   return 0;
 }
 
+// This function expects *handle to point to an existing tensor handle. The
+// function will (maybe) update *handle to point to the newly copied tensor
+// handle.
+//
+// The passed in *handle will be Unreffed if it is replaced.
+Status MaybeCopyInputToExpectedDevice(EagerOperation* op, int i,
+                                      const Device* expected_device,
+                                      RunMetadata* run_metadata,
+                                      TensorHandle** handle) {
+  EagerContext* ctx = op->EagerContext();
+  Device* handle_device = nullptr;
+  TF_RETURN_IF_ERROR((*handle)->Device(&handle_device));
+  const Device* actual_device =
+      handle_device == nullptr ? ctx->HostCPU() : handle_device;
+
+  if (expected_device != actual_device) {
+    switch (ctx->GetDevicePlacementPolicy()) {
+      case DEVICE_PLACEMENT_SILENT_FOR_INT32:
+        // TODO(xpan): See if we could bubble python related error up
+        // to python level.
+        if ((*handle)->dtype == DT_INT32) {
+          // Note: enabling silent copies of int32 tensors to match behavior
+          // of graph mode.
+          break;
+        }
+        TF_FALLTHROUGH_INTENDED;
+      case DEVICE_PLACEMENT_EXPLICIT:
+        return errors::InvalidArgument(
+            "Tensors on conflicting devices:"
+            " cannot compute ",
+            op->Name(), " as input #", i, " was expected to be on ",
+            expected_device->name(), " but is actually on ",
+            actual_device->name(), " (operation running on ",
+            op->Device()->name(), ")",
+            " Tensors can be copied explicitly using .gpu() or .cpu() "
+            "methods,"
+            " or transparently copied by using tf.enable_eager_execution("
+            "device_policy=tfe.DEVICE_PLACEMENT_SILENT). Copying tensors "
+            "between devices"
+            " may slow down your model");
+      case DEVICE_PLACEMENT_WARN:
+        LOG(WARNING) << "before computing " << op->Name() << " input #" << i
+                     << " was expected to be on " << expected_device->name()
+                     << " but is actually on " << actual_device->name()
+                     << " (operation running on " << op->Device()->name()
+                     << "). This triggers a copy which can be a performance "
+                        "bottleneck.";
+        break;
+      case DEVICE_PLACEMENT_SILENT:  // Do nothing.
+        break;
+    }
+    // We are only here if the policy is warn or silent copies, so we should
+    // trigger a copy.
+    auto pre_time = Env::Default()->NowMicros();
+    TensorHandle* result_handle;
+    Status status = EagerCopyToDevice(
+        *handle, ctx, expected_device->name().c_str(), &result_handle);
+    if (run_metadata != nullptr) {
+      auto* step_stats = run_metadata->mutable_step_stats();
+      MaybeInitializeStepStats(step_stats, ctx);
+      // Record the sending on the source device for now.
+      int device_idx = StepStatsDeviceIndex(step_stats, ctx, handle_device);
+      auto* dev_stats = step_stats->mutable_dev_stats(device_idx);
+      auto* node_stats = dev_stats->add_node_stats();
+      node_stats->set_node_name("_Send");
+      node_stats->set_all_start_micros(pre_time);
+      node_stats->set_op_end_rel_micros(Env::Default()->NowMicros() - pre_time);
+    }
+    if (!status.ok()) {
+      if (result_handle != nullptr) result_handle->Unref();
+      return errors::Internal("Failed copying input tensor from ",
+                              actual_device->name(), " to ",
+                              expected_device->name(), " in order to run ",
+                              op->Name(), ": ", status.error_message());
+    }
+
+    (*handle)->Unref();
+    *handle = result_handle;
+  }
+  return Status::OK();
+}
+
 Status ValidateInputTypeAndPlacement(EagerContext* ctx, Device* op_device,
                                      EagerOperation* op, const OpKernel* kernel,
                                      RunMetadata* run_metadata) {
@@ -78,76 +160,9 @@ Status ValidateInputTypeAndPlacement(EagerContext* ctx, Device* op_device,
   for (int i = 0; i < op->Inputs().size(); ++i) {
     const Device* expected_device =
         memtypes[i] == HOST_MEMORY ? host_device : op_device;
-    TensorHandle* handle = op->Inputs()[i];
-    Device* handle_device = nullptr;
-    TF_RETURN_IF_ERROR(handle->Device(&handle_device));
-    const Device* actual_device =
-        handle_device == nullptr ? host_device : handle_device;
-    if (expected_device != actual_device) {
-      switch (ctx->GetDevicePlacementPolicy()) {
-        case DEVICE_PLACEMENT_SILENT_FOR_INT32:
-          // TODO(xpan): See if we could bubble python related error up
-          // to python level.
-          if (handle->dtype == DT_INT32) {
-            // Note: enabling silent copies of int32 tensors to match behavior
-            // of graph mode.
-            break;
-          }
-          TF_FALLTHROUGH_INTENDED;
-        case DEVICE_PLACEMENT_EXPLICIT:
-          return errors::InvalidArgument(
-              "Tensors on conflicting devices:"
-              " cannot compute ",
-              op->Name(), " as input #", i, " was expected to be on ",
-              expected_device->name(), " but is actually on ",
-              actual_device->name(), " (operation running on ",
-              op_device->name(), ")",
-              " Tensors can be copied explicitly using .gpu() or .cpu() "
-              "methods,"
-              " or transparently copied by using tf.enable_eager_execution("
-              "device_policy=tfe.DEVICE_PLACEMENT_SILENT). Copying tensors "
-              "between devices"
-              " may slow down your model");
-        case DEVICE_PLACEMENT_WARN:
-          LOG(WARNING) << "before computing " << op->Name() << " input #" << i
-                       << " was expected to be on " << expected_device->name()
-                       << " but is actually on " << actual_device->name()
-                       << " (operation running on " << op_device->name()
-                       << "). This triggers a copy which can be a performance "
-                          "bottleneck.";
-          break;
-        case DEVICE_PLACEMENT_SILENT:  // Do nothing.
-          break;
-      }
-      // We are only here if the policy is warn or silent copies, so we should
-      // trigger a copy.
-      auto pre_time = Env::Default()->NowMicros();
-      TensorHandle* copied_tensor = nullptr;
-      Status status = EagerCopyToDevice(
-          handle, ctx, expected_device->name().c_str(), &copied_tensor);
-      if (run_metadata != nullptr) {
-        auto* step_stats = run_metadata->mutable_step_stats();
-        MaybeInitializeStepStats(step_stats, ctx);
-        // Record the sending on the source device for now.
-        int device_idx = StepStatsDeviceIndex(step_stats, ctx, handle_device);
-        auto* dev_stats = step_stats->mutable_dev_stats(device_idx);
-        auto* node_stats = dev_stats->add_node_stats();
-        node_stats->set_node_name("_Send");
-        node_stats->set_all_start_micros(pre_time);
-        node_stats->set_op_end_rel_micros(Env::Default()->NowMicros() -
-                                          pre_time);
-      }
-      if (!status.ok()) {
-        if (copied_tensor != nullptr) copied_tensor->Unref();
-        return errors::Internal("Failed copying input tensor from ",
-                                actual_device->name(), " to ",
-                                expected_device->name(), " in order to run ",
-                                op->Name(), ": ", status.error_message());
-      }
-      handle->Unref();
-      handle = copied_tensor;
-      (*op->MutableInputs())[i] = copied_tensor;
-    }
+    TF_RETURN_IF_ERROR(MaybeCopyInputToExpectedDevice(
+        op, i, expected_device, run_metadata, &((*op->MutableInputs())[i])));
+    tensorflow::TensorHandle* handle = op->Inputs()[i];
     if (handle->dtype != kernel->input_type(i)) {
       return errors::InvalidArgument(
           "cannot compute ", op->Name(), " as input #", i,
@@ -192,8 +207,8 @@ Status SelectDevice(const NodeDef& ndef, EagerContext* ctx, Device** device) {
 // Resource4> as the input params to the synthesized function.
 //
 // It populates `const_input_types`, `arg_input_types` and
-// `op_input_to_func_input` based on the reordering results, that the caller can
-// use them to build an XlaLaunch. On error, it returns NULL, and sets
+// `op_input_to_func_input` based on the reordering results, that the caller
+// can use them to build an XlaLaunch. On error, it returns NULL, and sets
 // `status` accordingly.
 const FunctionDef* OpToFunction(TFE_Op* op,
                                 std::vector<TF_DataType>* const_input_types,
@@ -221,8 +236,8 @@ const FunctionDef* OpToFunction(TFE_Op* op,
   const std::unordered_set<string> const_inputs(
       *XlaOpRegistry::CompileTimeConstantInputs(op->operation.Name()));
 
-  // First add place holders for the input args, so that we can refer to them by
-  // position in the next loop. Also tally up the resource inputs.
+  // First add place holders for the input args, so that we can refer to them
+  // by position in the next loop. Also tally up the resource inputs.
   int num_resource_inputs = 0;
   for (int i = 0; i < op_def.input_arg_size(); ++i) {
     if (op_def.input_arg(i).type() == DT_RESOURCE) {
@@ -336,8 +351,9 @@ std::unique_ptr<TFE_Op> BuildXlaLaunch(TFE_Op* op, TF_Status* status) {
                         &op_input_to_func_input, status);
     if (!status.ok()) return nullptr;
   } else {
-    // TODO(hongm): XlaOpRegistry::CompileTimeConstantInputs() does not work for
-    // functions, so we need to find another way to handle constant inputs.
+    // TODO(hongm): XlaOpRegistry::CompileTimeConstantInputs() does not work
+    // for functions, so we need to find another way to handle constant
+    // inputs.
     for (int i = const_input_types.size();
          i < fdef->signature().input_arg_size(); ++i) {
       VLOG(1) << "Adding Targs from input arg " << i;
@@ -348,8 +364,9 @@ std::unique_ptr<TFE_Op> BuildXlaLaunch(TFE_Op* op, TF_Status* status) {
   DCHECK(fdef != nullptr);
 
   // Copy inputs and their devices.
-  // Since input param reordering may have occurred between `op` and `launch_op`
-  // via `op_input_to_func_input`, adjust the actual inputs accordingly.
+  // Since input param reordering may have occurred between `op` and
+  // `launch_op` via `op_input_to_func_input`, adjust the actual inputs
+  // accordingly.
   *launch_op->operation.MutableInputs() = op->operation.Inputs();
   for (TensorHandle* h : launch_op->operation.Inputs()) {
     h->Ref();
@@ -545,24 +562,24 @@ Status EagerLocalExecute(EagerOperation* op,
 Status EagerRemoteExecute(EagerOperation* op, eager::EagerClient* eager_client,
                           uint64 context_id, TensorHandle** retvals,
                           int* num_retvals) {
-  // All tensors must be on the same device.
-  // TODO(nareshmodi): handle silent copies
   eager::EnqueueRequest request;
   eager::EnqueueResponse response;
 
   auto* remote_op = request.add_queue()->mutable_operation();
 
-  for (auto* input : op->Inputs()) {
+  for (int i = 0; i < op->Inputs().size(); i++) {
     tensorflow::Device* input_device;
-    TF_RETURN_IF_ERROR(input->Device(&input_device));
+    TF_RETURN_IF_ERROR(op->Inputs()[i]->Device(&input_device));
     if (op->Device() != input_device) {
-      return tensorflow::errors::InvalidArgument(
-          "Ops and inputs are not on the same device. Use "
-          "TFE_TensorHandleCopyToDevice to get ops on the same "
-          "device. Expected device: ",
-          op->Device()->name(), ", Actual device: ", input_device->name());
+      // TODO(b/110044833): It's possible the same tensor gets copied to the
+      // remote device repeatedly.
+      TF_RETURN_IF_ERROR(MaybeCopyInputToExpectedDevice(
+          op, i, op->Device(), /* run_metadata= */ nullptr,
+          &(*op->MutableInputs())[i]));
     }
 
+    tensorflow::TensorHandle* input = op->Inputs()[i];
+
     tensorflow::uint64 op_id;
     int32 output_num;
     TF_RETURN_IF_ERROR(input->RemoteAddress(&op_id, &output_num));
-- 
GitLab


From 8bf62f8530ed395110dad325b076fd923895fcba Mon Sep 17 00:00:00 2001
From: Akshay Modi <nareshmodi@google.com>
Date: Mon, 11 Jun 2018 16:27:12 -0700
Subject: [PATCH 569/610] Remove memory leak in read variable call, and record
 gradient call.

Fix #19385

PiperOrigin-RevId: 200132949
---
 tensorflow/python/eager/pywrap_tfe_src.cc | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/eager/pywrap_tfe_src.cc b/tensorflow/python/eager/pywrap_tfe_src.cc
index 52b3268903..6c9481c3af 100644
--- a/tensorflow/python/eager/pywrap_tfe_src.cc
+++ b/tensorflow/python/eager/pywrap_tfe_src.cc
@@ -1873,6 +1873,8 @@ PyObject* RecordGradient(PyObject* op_name, PyObject* inputs, PyObject* attrs,
         delete backward_function;
       });
 
+  Py_DECREF(num_inputs);
+
   Py_RETURN_NONE;
 }
 
@@ -1931,8 +1933,10 @@ bool ReadVariableOp(const FastPathOpExecInfo& parent_op_exec_info,
     Py_INCREF(output->get());  // stay alive after since tuple steals.
     PyTuple_SET_ITEM(outputs.get(), 0, output->get());
 
-    if (!RecordGradient(GetPythonObjectFromString("ReadVariableOp"),
-                        inputs.get(), Py_None, outputs.get(), Py_None)) {
+    tensorflow::Safe_PyObjectPtr op_string(
+        GetPythonObjectFromString("ReadVariableOp"));
+    if (!RecordGradient(op_string.get(), inputs.get(), Py_None, outputs.get(),
+                        Py_None)) {
       return false;
     }
   }
-- 
GitLab


From 95345968a2445c75eaeaa22659b7e574aafe25a7 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 11 Jun 2018 16:41:25 -0700
Subject: [PATCH 570/610] Correct generator path

PiperOrigin-RevId: 200135189
---
 tensorflow/contrib/lite/builtin_ops.h                          | 2 +-
 tensorflow/contrib/lite/schema/builtin_ops_header/generator.cc | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/lite/builtin_ops.h b/tensorflow/contrib/lite/builtin_ops.h
index f3b2ac77fb..aef9a92883 100644
--- a/tensorflow/contrib/lite/builtin_ops.h
+++ b/tensorflow/contrib/lite/builtin_ops.h
@@ -17,7 +17,7 @@ limitations under the License.
 #define TENSORFLOW_CONTRIB_LITE_BUILTIN_OPS_H_
 
 // DO NOT EDIT MANUALLY: This file is automatically generated by
-// `schema_builtin_ops_header_generator.py`.
+// `schema/builtin_ops_header/generator.cc`.
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/tensorflow/contrib/lite/schema/builtin_ops_header/generator.cc b/tensorflow/contrib/lite/schema/builtin_ops_header/generator.cc
index 64ab0a9fe2..9dc8daa227 100644
--- a/tensorflow/contrib/lite/schema/builtin_ops_header/generator.cc
+++ b/tensorflow/contrib/lite/schema/builtin_ops_header/generator.cc
@@ -39,7 +39,7 @@ limitations under the License.
 #define TENSORFLOW_CONTRIB_LITE_BUILTIN_OPS_H_
 
 // DO NOT EDIT MANUALLY: This file is automatically generated by
-// `schema_builtin_ops_header_generator.py`.
+// `schema/builtin_ops_header/generator.cc`.
 
 #ifdef __cplusplus
 extern "C" {
-- 
GitLab


From 734ce1d8e5991c8e7b243b0bab37c074864c0eea Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 11 Jun 2018 16:44:29 -0700
Subject: [PATCH 571/610] Split out HloConstantInstruction and
 HloTraceInstruction as subclasses from HloInstruction.

PiperOrigin-RevId: 200135616
---
 tensorflow/compiler/xla/service/BUILD         |   1 +
 .../compiler/xla/service/hlo_graph_dumper.cc  |  20 +--
 .../compiler/xla/service/hlo_instruction.cc   | 141 +++++-------------
 .../compiler/xla/service/hlo_instruction.h    |  40 ++---
 .../compiler/xla/service/hlo_instructions.cc  | 102 +++++++++++++
 .../compiler/xla/service/hlo_instructions.h   |  56 +++++++
 6 files changed, 224 insertions(+), 136 deletions(-)

diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index 6f34703fec..6801012cc9 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -2574,6 +2574,7 @@ cc_library(
     hdrs = ["hlo_graph_dumper.h"],
     deps = [
         ":hlo",
+        ":hlo_casting_utils",
         ":hlo_execution_profile",
         ":hlo_tfgraph_builder",
         "//tensorflow/compiler/xla:literal_util",
diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
index 05aab9a2cd..28fc6c4209 100644
--- a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
+++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
@@ -28,6 +28,8 @@ limitations under the License.
 
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
+#include "tensorflow/compiler/xla/service/hlo_instructions.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_tfgraph_builder.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -723,17 +725,14 @@ string HloDotDumper::DumpRootTag() {
                 to_id, node_body, node_shape, NodeColorAttributes(color));
 }
 
-static const HloInstruction* TryGetFusionParameterConstant(
+static const HloConstantInstruction* TryGetFusionParameterConstant(
     const HloInstruction* instr) {
   if (instr->opcode() != HloOpcode::kParameter || !instr->IsFused()) {
     return nullptr;
   }
   const HloInstruction* fusion = instr->parent()->FusionInstruction();
   const HloInstruction* operand = fusion->operand(instr->parameter_number());
-  if (operand->opcode() == HloOpcode::kConstant) {
-    return operand;
-  }
-  return nullptr;
+  return DynCast<HloConstantInstruction>(operand);
 }
 
 bool HloDotDumper::ShouldMergeIntoUsers(const HloInstruction* instr) const {
@@ -826,7 +825,7 @@ string HloDotDumper::DumpInstruction(const HloInstruction* instr) {
 
 string HloDotDumper::GetInstructionNodeInlinedOperands(
     const HloInstruction* instr) {
-  auto stringify_constant = [](const HloInstruction* constant) {
+  auto stringify_constant = [](const HloConstantInstruction* constant) {
     const auto& shape = constant->shape();
 
     // If the shape has a dimension of size zero, print it as e.g.
@@ -845,7 +844,7 @@ string HloDotDumper::GetInstructionNodeInlinedOperands(
         *elem_count *= dim;
       }
     }
-    if (elem_count.has_value() && *elem_count <= 8 && constant->HasLiteral()) {
+    if (elem_count.has_value() && *elem_count <= 8) {
       return Printf("%s (%s)", constant->literal().ToString(),
                     ShapeUtil::HumanString(constant->shape()));
     }
@@ -864,9 +863,10 @@ string HloDotDumper::GetInstructionNodeInlinedOperands(
   std::vector<string> lines;
   for (int64 i = 0; i < instr->operand_count(); ++i) {
     const HloInstruction* operand = instr->operand(i);
+    const auto* constant_operand = DynCast<HloConstantInstruction>(operand);
     optional<string> operand_str;
-    if (operand->opcode() == HloOpcode::kConstant) {
-      operand_str = stringify_constant(operand);
+    if (constant_operand != nullptr) {
+      operand_str = stringify_constant(constant_operand);
     } else if (ShouldMergeIntoUsers(operand)) {
       // Special case: If the operand is a parameter to a fusion node and it
       // always has a constant value, display it like a regular constant.
@@ -874,7 +874,7 @@ string HloDotDumper::GetInstructionNodeInlinedOperands(
       // For other parameters, use the parameter number rather than the proper
       // name, because that's generally how people think of the node.
       if (operand->opcode() == HloOpcode::kParameter) {
-        if (const HloInstruction* constant =
+        if (const HloConstantInstruction* constant =
                 TryGetFusionParameterConstant(operand)) {
           operand_str = stringify_constant(constant);
         } else {
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc
index c89d836888..9e9bf6361d 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction.cc
@@ -178,6 +178,23 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
                                 slice_limits, slice_strides);
       break;
     }
+    case HloOpcode::kConstant: {
+      CHECK(proto.has_literal());
+      TF_ASSIGN_OR_RETURN(auto literal,
+                          Literal::CreateFromProto(proto.literal()));
+      instruction = CreateConstant(std::move(literal));
+      break;
+    }
+    case HloOpcode::kTrace: {
+      TF_RET_CHECK(proto.operand_ids_size() == 1)
+          << "Trace instruction should have 1 operand but sees "
+          << proto.operand_ids_size();
+      CHECK(proto.has_literal());
+      TF_ASSIGN_OR_RETURN(auto literal,
+                          Literal::CreateFromProto(proto.literal()));
+      instruction = CreateTrace(literal->GetR1U8AsString(), operands(0));
+      break;
+    }
     default: {
       instruction = WrapUnique(new HloInstruction(opcode, proto.shape()));
       for (const int64 operand_id : proto.operand_ids()) {
@@ -223,22 +240,11 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
     instruction->called_computations_.push_back(fused_computation);
   }
 
-  if (instruction->opcode() == HloOpcode::kTrace) {
-    TF_RET_CHECK(instruction->operands().size() == 1)
-        << "Trace instruction should have 1 operand but sees "
-        << instruction->operands().size();
-    instruction->mutable_operand(0)->set_tracing(instruction.get());
-  }
-
   TF_RET_CHECK(!proto.name().empty());
   instruction->SetAndSanitizeName(proto.name());
 
   instruction->metadata_ = proto.metadata();
   instruction->backend_config_ = proto.backend_config();
-  if (proto.has_literal()) {
-    TF_ASSIGN_OR_RETURN(instruction->literal_,
-                        Literal::CreateFromProto(proto.literal()));
-  }
   instruction->parameter_number_ = proto.parameter_number();
 
   instruction->tuple_index_ = proto.tuple_index();
@@ -301,20 +307,12 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
 
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateTrace(
     const string& tag, HloInstruction* operand) {
-  auto instruction =
-      WrapUnique(new HloInstruction(HloOpcode::kTrace, ShapeUtil::MakeNil()));
-  instruction->operands_.push_back(operand);
-  instruction->literal_ = Literal::CreateR1U8(tag);
-  operand->set_tracing(instruction.get());
-  return instruction;
+  return MakeUnique<HloTraceInstruction>(tag, operand);
 }
 
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateConstant(
     std::unique_ptr<Literal> literal) {
-  auto instruction =
-      WrapUnique(new HloInstruction(HloOpcode::kConstant, literal->shape()));
-  instruction->literal_ = std::move(literal);
-  return instruction;
+  return MakeUnique<HloConstantInstruction>(std::move(literal));
 }
 
 /* static */ std::unique_ptr<HloInstruction>
@@ -1321,6 +1319,8 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
     case HloOpcode::kBroadcast:
     case HloOpcode::kMap:
     case HloOpcode::kSlice:
+    case HloOpcode::kConstant:
+    case HloOpcode::kTrace:
       clone = CloneWithNewOperandsImpl(shape, new_operands, context);
       break;
     // Unary ops.
@@ -1470,9 +1470,6 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
       clone =
           CreateWhile(shape, while_condition(), while_body(), new_operands[0]);
       break;
-    case HloOpcode::kConstant:
-      clone = CreateConstant(literal_->CloneToUnique());
-      break;
     case HloOpcode::kFusion: {
       HloModule* module = context != nullptr ? context->module() : GetModule();
       HloComputation* new_fused_computation = nullptr;
@@ -1520,8 +1517,6 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
     case HloOpcode::kGenerateToken:
       clone = CreateGenerateToken(new_operands);
       break;
-    case HloOpcode::kTrace:
-      LOG(FATAL) << "Not yet implemented, clone: " << HloOpcodeString(opcode_);
   }
   SetupDerivedInstruction(clone.get());
   clone->set_parent(parent_);
@@ -1602,13 +1597,6 @@ const HloInstruction* HloInstruction::LatestNonGteAncestor() const {
   return hlo;
 }
 
-const Literal& HloInstruction::literal() const {
-  CHECK_EQ(HloOpcode::kConstant, opcode_);
-  return *literal_;
-}
-
-bool HloInstruction::HasLiteral() const { return literal_ != nullptr; }
-
 int64 HloInstruction::tuple_index() const {
   CHECK_EQ(HloOpcode::kGetTupleElement, opcode_);
   return tuple_index_;
@@ -1702,10 +1690,6 @@ void HloInstruction::AddUser(HloInstruction* user) {
   }
 }
 
-bool HloInstruction::IsConstant() const {
-  return opcode_ == HloOpcode::kConstant;
-}
-
 bool HloInstruction::HasConstantOperand() const {
   for (const HloInstruction* operand : operands_) {
     if (operand->IsConstant()) {
@@ -1782,7 +1766,6 @@ bool HloInstruction::IdenticalSlowPath(
     // These opcodes have complex or special behavior so just return false.
     case HloOpcode::kDomain:
     case HloOpcode::kRng:
-    case HloOpcode::kTrace:
     case HloOpcode::kWhile:
     case HloOpcode::kGenerateToken:
       return false;
@@ -1790,10 +1773,6 @@ bool HloInstruction::IdenticalSlowPath(
     case HloOpcode::kParameter:
       return parameter_number() == other.parameter_number();
 
-    // A constant is defined by the value in the literal.
-    case HloOpcode::kConstant:
-      return literal() == other.literal();
-
     // A reduce-precision operation is determined by the bit sizes.
     case HloOpcode::kReducePrecision:
       return exponent_bits() == other.exponent_bits() &&
@@ -1878,6 +1857,8 @@ bool HloInstruction::IdenticalSlowPath(
     case HloOpcode::kBroadcast:
     case HloOpcode::kMap:
     case HloOpcode::kSlice:
+    case HloOpcode::kConstant:
+    case HloOpcode::kTrace:
       LOG(FATAL) << "Base class impl called for opcode with subclass: "
                  << opcode();
   }
@@ -2172,34 +2153,7 @@ string HloInstruction::OperandsToStringWithCanonicalNameMap(
     const HloPrintOptions& options,
     CanonicalNameMap* canonical_name_map) const {
   string operands;
-  if (opcode() == HloOpcode::kConstant) {
-    // For constants, show the actual value in place of an empty operand list.
-    //
-    // In HloInstruction, sometimes a constant literal is not constructed due
-    // to its size. Skip the printing in this case.
-    if (HasLiteral() && ((!ShapeUtil::IsTuple(shape()) &&
-                          ShapeUtil::ElementsIn(shape()) <= 10) ||
-                         options.print_large_constants())) {
-      // Literal::ToString emits multidimensional arrays over multiple
-      // lines. Compact this into one line by stripping out white space.
-      string tmp = literal().ToString();
-      std::replace(tmp.begin(), tmp.end(), '\n', ' ');
-      std::vector<string> v = tensorflow::str_util::Split(tmp, ' ');
-      bool first = true;
-      // Concatenate elements in "v" with spaces separating them, but ignoring
-      // empty entries.
-      for (const auto& s : v) {
-        if (s.empty()) {
-          continue;
-        }
-        StrAppend(&operands, (first ? "" : " "), s);
-        first = false;
-      }
-    } else {
-      // Do not show large constants or tuples.
-      operands = "{...}";
-    }
-  } else if (opcode() == HloOpcode::kParameter) {
+  if (opcode() == HloOpcode::kParameter) {
     StrAppend(&operands, parameter_number_);
   } else {
     tensorflow::gtl::ArraySlice<HloInstruction*> slice(operands_);
@@ -2410,9 +2364,6 @@ HloInstructionProto HloInstruction::ToProto() const {
 
   *proto.mutable_metadata() = metadata_;
   proto.set_backend_config(backend_config_);
-  if (literal_ != nullptr) {
-    *proto.mutable_literal() = literal_->ToProto();
-  }
   proto.set_parameter_number(parameter_number_);
   if (opcode() == HloOpcode::kFusion) {
     proto.set_fusion_kind(xla::ToString(fusion_kind()));
@@ -2518,12 +2469,6 @@ void HloInstruction::set_tracing(HloInstruction* trace_instruction) {
   trace_instruction_ = trace_instruction;
 }
 
-string HloInstruction::TracingTag() const {
-  CHECK_EQ(HloOpcode::kTrace, opcode());
-  CHECK(literal_ != nullptr);
-  return literal_->GetR1U8AsString();
-}
-
 bool HloInstruction::IsFused() const { return parent_->IsFusionComputation(); }
 
 bool HloInstruction::IsFusable() const {
@@ -3035,10 +2980,6 @@ bool HloInstruction::IsElementwiseBinary() const {
 
 bool HloInstruction::IsElementwise() const {
   switch (opcode_) {
-    // Nullary elementwise operations.
-    case HloOpcode::kConstant:
-      return true;
-
     // Unary elementwise operations.
     case HloOpcode::kAbs:
     case HloOpcode::kRoundNearestAfz:
@@ -3500,23 +3441,6 @@ void HloInstruction::set_outer_dimension_partitions(
   outer_dimension_partitions_ = outer_dimension_partitions;
 }
 
-void HloInstruction::RelayoutConstant(const Layout& new_layout,
-                                      const ShapeIndex& shape_index) {
-  CHECK_EQ(opcode(), HloOpcode::kConstant);
-  Shape* mutable_array_subshape =
-      ShapeUtil::GetMutableSubshape(mutable_shape(), shape_index);
-  CHECK(ShapeUtil::IsArray(*mutable_array_subshape));
-
-  // Normally array_subshape will always have a layout, but this invariant is
-  // temporarily broken in LayoutAssignment::AssignLayouts.
-
-  if (!mutable_array_subshape->has_layout() ||
-      !LayoutUtil::Equal(mutable_array_subshape->layout(), new_layout)) {
-    literal_ = literal_->Relayout(new_layout, shape_index);
-    *mutable_array_subshape->mutable_layout() = new_layout;
-  }
-}
-
 // TODO(b/80131774): Remove these temporary methods after transition.
 int64 HloInstruction::feature_index() const {
   return Cast<HloBatchNormInstruction>(this)->feature_index();
@@ -3574,4 +3498,21 @@ const std::vector<int64>& HloInstruction::slice_strides() const {
 bool HloInstruction::IsInPlaceSlice() const {
   return Cast<HloSliceInstruction>(this)->IsInPlaceSlice();
 }
+
+const Literal& HloInstruction::literal() const {
+  return Cast<HloConstantInstruction>(this)->literal();
+}
+
+bool HloInstruction::IsConstant() const {
+  return DynCast<HloConstantInstruction>(this) != nullptr;
+}
+
+void HloInstruction::RelayoutConstant(const Layout& new_layout,
+                                      const ShapeIndex& shape_index) {
+  Cast<HloConstantInstruction>(this)->RelayoutConstant(new_layout, shape_index);
+}
+
+string HloInstruction::TracingTag() const {
+  return Cast<HloTraceInstruction>(this)->TracingTag();
+}
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h
index ae1c563b56..05662ef01b 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.h
+++ b/tensorflow/compiler/xla/service/hlo_instruction.h
@@ -875,14 +875,6 @@ class HloInstruction {
   template <typename HloInstructionPtr>
   Status Visit(DfsHloVisitorBase<HloInstructionPtr>* visitor);
 
-  // Returns the literal associated with this instruction.
-  //
-  // Note: only constant and parameter opcodes have an associated literal.
-  const Literal& literal() const;
-
-  // Returns whether there is literal associated with this instruction.
-  bool HasLiteral() const;
-
   // Returns the parameter number associated with this instruction.
   //
   // Note: only parameter opcodes have an associated parameter number.
@@ -1014,14 +1006,6 @@ class HloInstruction {
   string infeed_config() const { return infeed_config_; }
   void set_infeed_config(const string& config) { infeed_config_ = config; }
 
-  // Returns a tag to be used in tracing.
-  //
-  // Precondition: opcode() == HloOpcode::kTrace
-  string TracingTag() const;
-
-  // Returns whether the instruction is a constant.
-  bool IsConstant() const;
-
   // Returns true if this instruction is fused, ie contained within a fusion
   // instruction.
   bool IsFused() const;
@@ -1452,12 +1436,6 @@ class HloInstruction {
   void set_outer_dimension_partitions(
       const std::vector<int64>& outer_dimension_partitions);
 
-  // Change the layout for an Constant Hlo instruction to match new_layout.  For
-  // tuple shaped constants shape_index is the path to the internal array
-  // subshape whose layout needs to be changed.
-  void RelayoutConstant(const Layout& new_layout,
-                        const ShapeIndex& shape_index = {});
-
   // Old methods kept for smooth subclassing transition BEGIN.
   // TODO(b/80131774): Remove this code.
 
@@ -1504,6 +1482,19 @@ class HloInstruction {
 
   // Delegates to HloSliceInstruction::IsInPlaceSlice.
   bool IsInPlaceSlice() const;
+
+  // Returns the literal associated with this instruction.
+  const Literal& literal() const;
+
+  // Returns whether the instruction is a constant.
+  bool IsConstant() const;
+
+  // Delegates to HloConstantInstruction::RelayoutConstant.
+  void RelayoutConstant(const Layout& new_layout,
+                        const ShapeIndex& shape_index = {});
+
+  // Delegates to HloTraceInstruction::TracingTag.
+  string TracingTag() const;
   // Old methods kept for smooth subclassing transition END.
 
  protected:
@@ -1544,7 +1535,7 @@ class HloInstruction {
       CanonicalNameMap* canonical_name_map) const;
 
   // Prints an operand to a string.
-  string OperandsToStringWithCanonicalNameMap(
+  virtual string OperandsToStringWithCanonicalNameMap(
       const HloPrintOptions& options,
       CanonicalNameMap* canonical_name_map) const;
 
@@ -1639,9 +1630,6 @@ class HloInstruction {
   // Result shape of this instruction.
   Shape shape_;
 
-  // Literal, only present for kConstant.
-  std::unique_ptr<Literal> literal_;
-
   // Constant index, only present for kGetTupleElement.
   int64 tuple_index_ = -1;
 
diff --git a/tensorflow/compiler/xla/service/hlo_instructions.cc b/tensorflow/compiler/xla/service/hlo_instructions.cc
index 56792f8b1b..1815bf1b16 100644
--- a/tensorflow/compiler/xla/service/hlo_instructions.cc
+++ b/tensorflow/compiler/xla/service/hlo_instructions.cc
@@ -20,6 +20,7 @@ limitations under the License.
 namespace xla {
 
 using ::tensorflow::str_util::Join;
+using ::tensorflow::strings::StrAppend;
 using ::tensorflow::strings::StrCat;
 
 HloBatchNormInstruction::HloBatchNormInstruction(
@@ -586,4 +587,105 @@ std::unique_ptr<HloInstruction> HloSliceInstruction::CloneWithNewOperandsImpl(
   return MakeUnique<HloSliceInstruction>(shape, new_operands[0], slice_starts_,
                                          slice_limits_, slice_strides_);
 }
+
+HloConstantInstruction::HloConstantInstruction(std::unique_ptr<Literal> literal)
+    : HloInstruction(HloOpcode::kConstant, CHECK_NOTNULL(literal)->shape()),
+      literal_(std::move(literal)) {}
+
+HloInstructionProto HloConstantInstruction::ToProto() const {
+  HloInstructionProto proto = HloInstruction::ToProto();
+  *proto.mutable_literal() = literal_->ToProto();
+  return proto;
+}
+
+bool HloConstantInstruction::IsElementwise() const { return true; }
+
+void HloConstantInstruction::RelayoutConstant(const Layout& new_layout,
+                                              const ShapeIndex& shape_index) {
+  Shape* mutable_array_subshape =
+      ShapeUtil::GetMutableSubshape(mutable_shape(), shape_index);
+  CHECK(ShapeUtil::IsArray(*mutable_array_subshape));
+
+  // Normally array_subshape will always have a layout, but this invariant is
+  // temporarily broken in LayoutAssignment::AssignLayouts.
+
+  if (!mutable_array_subshape->has_layout() ||
+      !LayoutUtil::Equal(mutable_array_subshape->layout(), new_layout)) {
+    literal_ = literal_->Relayout(new_layout, shape_index);
+    *mutable_array_subshape->mutable_layout() = new_layout;
+  }
+}
+
+bool HloConstantInstruction::IdenticalSlowPath(
+    const HloInstruction& other,
+    const std::function<bool(const HloComputation*, const HloComputation*)>&
+        eq_computations) const {
+  const auto& rhs = static_cast<const HloConstantInstruction&>(other);
+  return literal() == rhs.literal();  // Compare the literal values.
+}
+
+std::unique_ptr<HloInstruction>
+HloConstantInstruction::CloneWithNewOperandsImpl(
+    const Shape& shape,
+    tensorflow::gtl::ArraySlice<HloInstruction*> new_operands,
+    HloCloneContext* context) const {
+  return MakeUnique<HloConstantInstruction>(literal_->CloneToUnique());
+}
+
+string HloConstantInstruction::OperandsToStringWithCanonicalNameMap(
+    const HloPrintOptions& options,
+    CanonicalNameMap* canonical_name_map) const {
+  string operands;
+  // For constants, show the actual value in place of an empty operand list.
+  if ((!ShapeUtil::IsTuple(shape()) && ShapeUtil::ElementsIn(shape()) <= 10) ||
+      options.print_large_constants()) {
+    // Literal::ToString emits multidimensional arrays over multiple
+    // lines. Compact this into one line by stripping out white space.
+    string tmp = literal().ToString();
+    std::replace(tmp.begin(), tmp.end(), '\n', ' ');
+    std::vector<string> v = tensorflow::str_util::Split(tmp, ' ');
+    bool first = true;
+    // Concatenate elements in "v" with spaces separating them, but ignoring
+    // empty entries.
+    for (const auto& s : v) {
+      if (s.empty()) {
+        continue;
+      }
+      StrAppend(&operands, (first ? "" : " "), s);
+      first = false;
+    }
+  } else {
+    // Do not show large constants or tuples.
+    operands = "{...}";
+  }
+  return operands;
+}
+
+HloTraceInstruction::HloTraceInstruction(const string& tag,
+                                         HloInstruction* operand)
+    : HloInstruction(HloOpcode::kTrace, ShapeUtil::MakeNil()),
+      literal_(Literal::CreateR1U8(tag)) {
+  AppendOperand(operand);
+  operand->set_tracing(this);
+}
+
+HloInstructionProto HloTraceInstruction::ToProto() const {
+  HloInstructionProto proto = HloInstruction::ToProto();
+  *proto.mutable_literal() = literal_->ToProto();
+  return proto;
+}
+
+bool HloTraceInstruction::IdenticalSlowPath(
+    const HloInstruction& other,
+    const std::function<bool(const HloComputation*, const HloComputation*)>&
+        eq_computations) const {
+  return false;
+}
+
+std::unique_ptr<HloInstruction> HloTraceInstruction::CloneWithNewOperandsImpl(
+    const Shape& shape,
+    tensorflow::gtl::ArraySlice<HloInstruction*> new_operands,
+    HloCloneContext* context) const {
+  LOG(FATAL) << "Not yet implemented, clone: " << HloOpcodeString(opcode());
+}
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_instructions.h b/tensorflow/compiler/xla/service/hlo_instructions.h
index 18e786d8b6..ecd4a31912 100644
--- a/tensorflow/compiler/xla/service/hlo_instructions.h
+++ b/tensorflow/compiler/xla/service/hlo_instructions.h
@@ -433,6 +433,62 @@ class HloSliceInstruction : public HloInstruction {
   // Describes whether the slice can be lowered to an offset into the operand.
   bool is_in_place_slice_ = false;
 };
+
+class HloConstantInstruction : public HloInstruction {
+ public:
+  explicit HloConstantInstruction(std::unique_ptr<Literal> literal);
+  // Returns the literal associated with this instruction.
+  const Literal& literal() const { return *literal_; }
+  // Returns a serialized representation of this instruction.
+  HloInstructionProto ToProto() const override;
+  // Returns true if this instruction is elementwise on all its operands.
+  bool IsElementwise() const override;
+
+  // Change the layout for a Constant Hlo instruction to match new_layout.  For
+  // tuple shaped constants shape_index is the path to the internal array
+  // subshape whose layout needs to be changed.
+  void RelayoutConstant(const Layout& new_layout,
+                        const ShapeIndex& shape_index = {});
+
+ private:
+  bool IdenticalSlowPath(
+      const HloInstruction& other,
+      const std::function<bool(const HloComputation*, const HloComputation*)>&
+          eq_computations) const override;
+  string OperandsToStringWithCanonicalNameMap(
+      const HloPrintOptions& options,
+      CanonicalNameMap* canonical_name_map) const override;
+  // Implementation for non-common logic of CloneWithNewOperands.
+  std::unique_ptr<HloInstruction> CloneWithNewOperandsImpl(
+      const Shape& shape,
+      tensorflow::gtl::ArraySlice<HloInstruction*> new_operands,
+      HloCloneContext* context) const override;
+  // TODO(b/36360764): Remove unique_ptr wrapping.
+  std::unique_ptr<Literal> literal_;
+};
+
+class HloTraceInstruction : public HloInstruction {
+ public:
+  explicit HloTraceInstruction(const string& tag, HloInstruction* operand);
+  // Returns a tag to be used in tracing.
+  string TracingTag() const { return literal_->GetR1U8AsString(); }
+  // Returns a serialized representation of this instruction.
+  HloInstructionProto ToProto() const override;
+
+ private:
+  bool IdenticalSlowPath(
+      const HloInstruction& other,
+      const std::function<bool(const HloComputation*, const HloComputation*)>&
+          eq_computations) const override;
+  // Implementation for non-common logic of CloneWithNewOperands.
+  std::unique_ptr<HloInstruction> CloneWithNewOperandsImpl(
+      const Shape& shape,
+      tensorflow::gtl::ArraySlice<HloInstruction*> new_operands,
+      HloCloneContext* context) const override;
+  // TODO(b/36360764): Remove unique_ptr wrapping.
+  std::unique_ptr<Literal> literal_;
+};
+
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_INSTRUCTIONS_H_
-- 
GitLab


From 4f912021b04f5f82b0d1a6bba5b32a24d7cb9fca Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 11 Jun 2018 16:53:28 -0700
Subject: [PATCH 572/610] Add support for 8bit ResizeBilinear and Slice op to
 tflite and toco

PiperOrigin-RevId: 200136934
---
 .../contrib/lite/kernels/internal/BUILD       |   5 +-
 .../internal/optimized/optimized_ops.h        |  84 +++++++
 .../internal/reference/reference_ops.h        |  37 ++-
 ..._float_test.cc => resize_bilinear_test.cc} |  60 ++++-
 .../contrib/lite/kernels/resize_bilinear.cc   |  23 +-
 .../lite/kernels/resize_bilinear_test.cc      | 235 ++++++++++++++----
 .../graph_transformations/hardcode_min_max.cc |   2 +
 .../toco/graph_transformations/quantize.cc    |   3 +-
 8 files changed, 358 insertions(+), 91 deletions(-)
 rename tensorflow/contrib/lite/kernels/internal/{resize_bilinear_float_test.cc => resize_bilinear_test.cc} (60%)

diff --git a/tensorflow/contrib/lite/kernels/internal/BUILD b/tensorflow/contrib/lite/kernels/internal/BUILD
index 0a5223b235..75298b995d 100644
--- a/tensorflow/contrib/lite/kernels/internal/BUILD
+++ b/tensorflow/contrib/lite/kernels/internal/BUILD
@@ -474,8 +474,9 @@ cc_test(
 )
 
 cc_test(
-    name = "resize_bilinear_float_test",
-    srcs = ["resize_bilinear_float_test.cc"],
+    name = "resize_bilinear_test",
+    srcs = ["resize_bilinear_test.cc"],
+    tags = ["tflite_not_portable"],
     deps = [
         ":optimized_base",
         ":reference_base",
diff --git a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
index d2bee2cd70..8115a072d5 100644
--- a/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/optimized/optimized_ops.h
@@ -5722,6 +5722,46 @@ inline void ResizeBilinearGeneric(const float* input_data,
   }
 }
 
+template <typename T>
+inline void ResizeBilinearGenericSmallChannel(
+    const T* input_data, const Dims<4>& input_dims, T* output_data,
+    const Dims<4>& output_dims, int32 batches, int32 input_height,
+    int32 input_width, int32 depth, int32 output_height, int32 output_width,
+    float height_scale, float width_scale) {
+  memset(output_data, 0,
+         batches * output_height * output_width * depth * sizeof(T));
+
+  T* output_ptr = &output_data[0];
+  for (int b = 0; b < batches; ++b) {
+    for (int y = 0; y < output_height; ++y) {
+      float input_y = y * height_scale;
+      int32 y0 = static_cast<int32>(std::floor(input_y));
+      int32 y1 = std::min(y0 + 1, input_height - 1);
+      for (int x = 0; x < output_width; ++x) {
+        float input_x = x * width_scale;
+        int32 x0 = static_cast<int32>(input_x);
+        int32 x1 = std::min(x0 + 1, input_width - 1);
+
+        int32 input_offset[4] = {
+            Offset(input_dims, 0, x0, y0, b), Offset(input_dims, 0, x1, y0, b),
+            Offset(input_dims, 0, x0, y1, b), Offset(input_dims, 0, x1, y1, b)};
+        float scale[4] = {(1 - (input_y - y0)) * (1 - (input_x - x0)),
+                          (1 - (input_y - y0)) * (input_x - x0),
+                          (input_y - y0) * (1 - (input_x - x0)),
+                          (input_y - y0) * (input_x - x0)};
+
+        for (int d = 0; d < depth; d++) {
+          const T* input_ptr = &input_data[d];
+          *output_ptr++ = static_cast<T>(input_ptr[input_offset[0]] * scale[0] +
+                                         input_ptr[input_offset[1]] * scale[1] +
+                                         input_ptr[input_offset[2]] * scale[2] +
+                                         input_ptr[input_offset[3]] * scale[3]);
+        }
+      }
+    }
+  }
+}
+
 inline void ResizeBilinear(const float* input_data, const Dims<4>& input_dims,
                            const int32* output_size_data,
                            const Dims<4>& output_size_dims, float* output_data,
@@ -5762,6 +5802,41 @@ inline void ResizeBilinear(const float* input_data, const Dims<4>& input_dims,
   }
 }
 
+// TODO(prabhumk): This is not a real quantized bilinear. It does not use int8
+// or int16 arithmetic.
+inline void ResizeBilinear(const uint8* input_data, const Dims<4>& input_dims,
+                           const int32* output_size_data,
+                           const Dims<4>& output_size_dims, uint8* output_data,
+                           const Dims<4>& output_dims, bool align_corners) {
+  gemmlowp::ScopedProfilingLabel label("ResizeBilinear");
+  int32 batches = MatchingArraySize(input_dims, 3, output_dims, 3);
+  int32 input_height = ArraySize(input_dims, 2);
+  int32 input_width = ArraySize(input_dims, 1);
+  int32 depth = MatchingArraySize(input_dims, 0, output_dims, 0);
+
+  TFLITE_DCHECK_EQ(ArraySize(output_size_dims, 3), 1);
+  TFLITE_DCHECK_EQ(ArraySize(output_size_dims, 2), 1);
+  TFLITE_DCHECK_EQ(ArraySize(output_size_dims, 1), 1);
+  TFLITE_DCHECK_EQ(ArraySize(output_size_dims, 0), 2);
+  int32 output_height = output_size_data[Offset(output_size_dims, 0, 0, 0, 0)];
+  int32 output_width = output_size_data[Offset(output_size_dims, 1, 0, 0, 0)];
+
+  float height_scale =
+      (align_corners && output_height > 1)
+          ? (static_cast<float>(input_height - 1) / (output_height - 1))
+          : (static_cast<float>(input_height) / output_height);
+
+  float width_scale =
+      (align_corners && output_width > 1)
+          ? (static_cast<float>(input_width - 1) / (output_width - 1))
+          : (static_cast<float>(input_width) / output_width);
+
+  ResizeBilinearGenericSmallChannel<uint8>(
+      input_data, input_dims, output_data, output_dims, batches, input_height,
+      input_width, depth, output_height, output_width, height_scale,
+      width_scale);
+}
+
 // legacy, for compatibility with old checked-in code
 inline void ResizeBilinear(const float* input_data, const Dims<4>& input_dims,
                            const int32* output_size_data,
@@ -5771,6 +5846,15 @@ inline void ResizeBilinear(const float* input_data, const Dims<4>& input_dims,
                  output_data, output_dims, /*align_corners=*/false);
 }
 
+// legacy, for compatibility with old checked-in code
+inline void ResizeBilinear(const uint8* input_data, const Dims<4>& input_dims,
+                           const int32* output_size_data,
+                           const Dims<4>& output_size_dims, uint8* output_data,
+                           const Dims<4>& output_dims) {
+  ResizeBilinear(input_data, input_dims, output_size_data, output_size_dims,
+                 output_data, output_dims, /*align_corners=*/false);
+}
+
 template <typename T>
 inline void SpaceToBatchND(const T* input_data, const Dims<4>& input_dims,
                            const int32* block_shape_data,
diff --git a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
index c3f645bdf1..9a3dae5cde 100644
--- a/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
+++ b/tensorflow/contrib/lite/kernels/internal/reference/reference_ops.h
@@ -3202,9 +3202,10 @@ inline void Gather(const T* input_data, const Dims<4>& input_dims,
   }
 }
 
-inline void ResizeBilinear(const float* input_data, const Dims<4>& input_dims,
+template <typename T>
+inline void ResizeBilinear(const T* input_data, const Dims<4>& input_dims,
                            const int32* output_size_data,
-                           const Dims<4>& output_size_dims, float* output_data,
+                           const Dims<4>& output_size_dims, T* output_data,
                            const Dims<4>& output_dims, bool align_corners) {
   int32 batches = MatchingArraySize(input_dims, 3, output_dims, 3);
   int32 input_height = ArraySize(input_dims, 2);
@@ -3236,15 +3237,15 @@ inline void ResizeBilinear(const float* input_data, const Dims<4>& input_dims,
         int32 x0 = static_cast<int32>(std::floor(input_x));
         int32 x1 = std::min(x0 + 1, input_width - 1);
         for (int c = 0; c < depth; ++c) {
-          float interpolation = input_data[Offset(input_dims, c, x0, y0, b)] *
-                                    (1 - (input_y - y0)) *
-                                    (1 - (input_x - x0)) +
-                                input_data[Offset(input_dims, c, x0, y1, b)] *
-                                    (input_y - y0) * (1 - (input_x - x0)) +
-                                input_data[Offset(input_dims, c, x1, y0, b)] *
-                                    (1 - (input_y - y0)) * (input_x - x0) +
-                                input_data[Offset(input_dims, c, x1, y1, b)] *
-                                    (input_y - y0) * (input_x - x0);
+          T interpolation =
+              static_cast<T>(input_data[Offset(input_dims, c, x0, y0, b)] *
+                                 (1 - (input_y - y0)) * (1 - (input_x - x0)) +
+                             input_data[Offset(input_dims, c, x0, y1, b)] *
+                                 (input_y - y0) * (1 - (input_x - x0)) +
+                             input_data[Offset(input_dims, c, x1, y0, b)] *
+                                 (1 - (input_y - y0)) * (input_x - x0) +
+                             input_data[Offset(input_dims, c, x1, y1, b)] *
+                                 (input_y - y0) * (input_x - x0));
           output_data[Offset(output_dims, c, x, y, b)] = interpolation;
         }
       }
@@ -3257,8 +3258,18 @@ inline void ResizeBilinear(const float* input_data, const Dims<4>& input_dims,
                            const int32* output_size_data,
                            const Dims<4>& output_size_dims, float* output_data,
                            const Dims<4>& output_dims) {
-  ResizeBilinear(input_data, input_dims, output_size_data, output_size_dims,
-                 output_data, output_dims, /*align_corners=*/false);
+  ResizeBilinear<float>(input_data, input_dims, output_size_data,
+                        output_size_dims, output_data, output_dims,
+                        /*align_corners=*/false);
+}
+
+inline void ResizeBilinear(const uint8* input_data, const Dims<4>& input_dims,
+                           const int32* output_size_data,
+                           const Dims<4>& output_size_dims, uint8* output_data,
+                           const Dims<4>& output_dims) {
+  ResizeBilinear<uint8>(input_data, input_dims, output_size_data,
+                        output_size_dims, output_data, output_dims,
+                        /*align_corners=*/false);
 }
 
 template <typename T>
diff --git a/tensorflow/contrib/lite/kernels/internal/resize_bilinear_float_test.cc b/tensorflow/contrib/lite/kernels/internal/resize_bilinear_test.cc
similarity index 60%
rename from tensorflow/contrib/lite/kernels/internal/resize_bilinear_float_test.cc
rename to tensorflow/contrib/lite/kernels/internal/resize_bilinear_test.cc
index c1c50dff4d..3d8765f11b 100644
--- a/tensorflow/contrib/lite/kernels/internal/resize_bilinear_float_test.cc
+++ b/tensorflow/contrib/lite/kernels/internal/resize_bilinear_test.cc
@@ -24,9 +24,10 @@ limitations under the License.
 
 namespace tflite {
 namespace {
+template <typename T>
 void TestOneResizeBilinear(int batch, int depth, int input_width,
                            int input_height, int output_width,
-                           int output_height) {
+                           int output_height, float error_threshold) {
   Dims<4> input_dims_inference =
       MakeDimsForInference(depth, input_width, input_height, batch);
   Dims<4> output_dims_inference =
@@ -36,14 +37,15 @@ void TestOneResizeBilinear(int batch, int depth, int input_width,
   const int output_buffer_size =
       RequiredBufferSizeForDims(output_dims_inference);
 
-  std::vector<float> input_data(input_buffer_size, 0);
-  std::vector<float> reference_output_data(output_buffer_size, 0);
+  std::vector<T> input_data(input_buffer_size, 0);
+  std::vector<T> reference_output_data(output_buffer_size, 0);
   // Initialize the output data with something other than zero, so we can catch
   // issue with kernels failing to initialize the output.
-  std::vector<float> output_data(output_buffer_size, 3.1415);
+  std::vector<T> output_data(output_buffer_size, 3);
 
-  const float input_amplitude = 1.f;
-  FillRandom(&input_data, -input_amplitude, input_amplitude);
+  const T min_amplitude = static_cast<T>(0);
+  const T max_amplitude = static_cast<T>(255);
+  FillRandom(&input_data, min_amplitude, max_amplitude);
 
   Dims<4> output_size_dims = MakeDimsForInference(2, 1, 1, 1);
   std::vector<int32> output_size_data = {output_height, output_width};
@@ -58,14 +60,46 @@ void TestOneResizeBilinear(int batch, int depth, int input_width,
   double sum_diff = 0;
   float max_abs_val = 0;
   for (int i = 0; i < output_buffer_size; i++) {
-    sum_diff += std::abs(output_data[i] - reference_output_data[i]);
-    max_abs_val = std::max(max_abs_val, std::abs(reference_output_data[i]));
+    sum_diff += std::abs(static_cast<float>(output_data[i]) -
+                         static_cast<float>(reference_output_data[i]));
+    max_abs_val = std::max(
+        max_abs_val, std::abs(static_cast<float>(reference_output_data[i])));
   }
 
   if (sum_diff != 0.f) {
     const float mean_diff = static_cast<float>(sum_diff / output_buffer_size);
     const float relative_error = std::abs(mean_diff) / max_abs_val;
-    ASSERT_LT(relative_error, 1e-5f);
+    ASSERT_LT(relative_error, error_threshold);
+  }
+}
+
+TEST(ResizeBilinear, TestResizeBilinear8Bit) {
+  const int kTestsToRun = 100 * 1000;
+  for (int i = 0; i < kTestsToRun; i++) {
+    const int batch = ExponentialRandomPositiveInt(0.9f, 3, 20);
+    const int depth = ExponentialRandomPositiveInt(0.9f, 6, 50);
+    const int input_width = ExponentialRandomPositiveInt(0.9f, 20, 200);
+    const int input_height = ExponentialRandomPositiveInt(0.9f, 20, 200);
+    const int output_width = ExponentialRandomPositiveInt(0.9f, 20, 200);
+    const int output_height = ExponentialRandomPositiveInt(0.9f, 20, 200);
+
+    TestOneResizeBilinear<uint8>(batch, depth, input_width, input_height,
+                                 output_width, output_height, 0.025);
+  }
+}
+
+TEST(ResizeBilinear2x2, TestResizeBilinear8Bit) {
+  const int kTestsToRun = 100 * 1000;
+  for (int i = 0; i < kTestsToRun; i++) {
+    const int batch = ExponentialRandomPositiveInt(0.9f, 3, 20);
+    const int depth = ExponentialRandomPositiveInt(0.9f, 6, 50);
+    const int input_width = ExponentialRandomPositiveInt(0.9f, 20, 200);
+    const int input_height = ExponentialRandomPositiveInt(0.9f, 20, 200);
+    const int output_width = input_width * 2;
+    const int output_height = input_height * 2;
+
+    TestOneResizeBilinear<uint8>(batch, depth, input_width, input_height,
+                                 output_width, output_height, 1e-5);
   }
 }
 
@@ -79,8 +113,8 @@ TEST(ResizeBilinear, TestResizeBilinear) {
     const int output_width = ExponentialRandomPositiveInt(0.9f, 20, 200);
     const int output_height = ExponentialRandomPositiveInt(0.9f, 20, 200);
 
-    TestOneResizeBilinear(batch, depth, input_width, input_height, output_width,
-                          output_height);
+    TestOneResizeBilinear<float>(batch, depth, input_width, input_height,
+                                 output_width, output_height, 1e-5);
   }
 }
 
@@ -94,8 +128,8 @@ TEST(ResizeBilinear2x2, TestResizeBilinear) {
     const int output_width = input_width * 2;
     const int output_height = input_height * 2;
 
-    TestOneResizeBilinear(batch, depth, input_width, input_height, output_width,
-                          output_height);
+    TestOneResizeBilinear<float>(batch, depth, input_width, input_height,
+                                 output_width, output_height, 1e-5);
   }
 }
 }  // namespace
diff --git a/tensorflow/contrib/lite/kernels/resize_bilinear.cc b/tensorflow/contrib/lite/kernels/resize_bilinear.cc
index f2092eaa36..86c4cd3ee8 100644
--- a/tensorflow/contrib/lite/kernels/resize_bilinear.cc
+++ b/tensorflow/contrib/lite/kernels/resize_bilinear.cc
@@ -61,12 +61,10 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE_EQ(context, NumDimensions(input), 4);
   TF_LITE_ENSURE_EQ(context, NumDimensions(size), 1);
 
-  // TODO(ahentz): Our current implementations only support float32.
-  TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32);
   TF_LITE_ENSURE_EQ(context, size->type, kTfLiteInt32);
   // ResizeBilinear creates a float tensor even when the input is made of
   // integers.
-  output->type = kTfLiteFloat32;
+  output->type = input->type;
 
   if (!IsConstantTensor(size)) {
     SetTensorToDynamic(output);
@@ -90,17 +88,24 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   }
 
   if (output->type == kTfLiteFloat32) {
-#define TF_LITE_RESIZE_BILINEAR(type)                                       \
-  type::ResizeBilinear(GetTensorData<float>(input), GetTensorDims(input),   \
-                       GetTensorData<int32>(size), GetTensorDims(size),     \
-                       GetTensorData<float>(output), GetTensorDims(output), \
+#define TF_LITE_RESIZE_BILINEAR(type, datatype)                                \
+  type::ResizeBilinear(GetTensorData<datatype>(input), GetTensorDims(input),   \
+                       GetTensorData<int32>(size), GetTensorDims(size),        \
+                       GetTensorData<datatype>(output), GetTensorDims(output), \
                        params->align_corners)
 
     if (kernel_type == kReference) {
-      TF_LITE_RESIZE_BILINEAR(reference_ops);
+      TF_LITE_RESIZE_BILINEAR(reference_ops, float);
     }
     if (kernel_type == kGenericOptimized || kernel_type == kNeonOptimized) {
-      TF_LITE_RESIZE_BILINEAR(optimized_ops);
+      TF_LITE_RESIZE_BILINEAR(optimized_ops, float);
+    }
+  } else if (output->type == kTfLiteUInt8) {
+    if (kernel_type == kReference) {
+      TF_LITE_RESIZE_BILINEAR(reference_ops, uint8_t);
+    }
+    if (kernel_type == kGenericOptimized || kernel_type == kNeonOptimized) {
+      TF_LITE_RESIZE_BILINEAR(optimized_ops, uint8_t);
     }
 #undef TF_LITE_RESIZE_BILINEAR
   } else {
diff --git a/tensorflow/contrib/lite/kernels/resize_bilinear_test.cc b/tensorflow/contrib/lite/kernels/resize_bilinear_test.cc
index 4e03f3820a..10caffea03 100644
--- a/tensorflow/contrib/lite/kernels/resize_bilinear_test.cc
+++ b/tensorflow/contrib/lite/kernels/resize_bilinear_test.cc
@@ -22,6 +22,7 @@ namespace tflite {
 namespace {
 
 using ::testing::ElementsAreArray;
+using uint8 = std::uint8_t;
 
 class ResizeBilinearOpModel : public SingleOpModel {
  public:
@@ -34,7 +35,7 @@ class ResizeBilinearOpModel : public SingleOpModel {
     } else {
       size_ = AddInput({TensorType_INT32, {2}});
     }
-    output_ = AddOutput(TensorType_FLOAT32);  // Always float.
+    output_ = AddOutput(input.type);
     SetBuiltinOp(BuiltinOperator_RESIZE_BILINEAR,
                  BuiltinOptions_ResizeBilinearOptions,
                  CreateResizeBilinearOptions(builder_).Union());
@@ -45,12 +46,16 @@ class ResizeBilinearOpModel : public SingleOpModel {
     }
   }
 
-  void SetInput(std::initializer_list<float> data) {
+  template <typename T>
+  void SetInput(std::initializer_list<T> data) {
     PopulateTensor(input_, data);
   }
   void SetSize(std::initializer_list<int> data) { PopulateTensor(size_, data); }
 
-  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+  template <typename T>
+  std::vector<T> GetOutput() {
+    return ExtractVector<T>(output_);
+  }
 
  private:
   int input_;
@@ -60,60 +65,121 @@ class ResizeBilinearOpModel : public SingleOpModel {
 
 TEST(ResizeBilinearOpTest, HorizontalResize) {
   ResizeBilinearOpModel m({TensorType_FLOAT32, {1, 1, 2, 1}});
-  m.SetInput({3, 6});
+  m.SetInput<float>({3, 6});
   m.SetSize({1, 3});
   m.Invoke();
-  EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear({3, 5, 6})));
+  EXPECT_THAT(m.GetOutput<float>(),
+              ElementsAreArray(ArrayFloatNear({3, 5, 6})));
 
   ResizeBilinearOpModel const_m({TensorType_FLOAT32, {1, 1, 2, 1}}, {1, 3});
-  const_m.SetInput({3, 6});
+  const_m.SetInput<float>({3, 6});
+  const_m.Invoke();
+  EXPECT_THAT(const_m.GetOutput<float>(),
+              ElementsAreArray(ArrayFloatNear({3, 5, 6})));
+}
+
+TEST(ResizeBilinearOpTest, HorizontalResize8Bit) {
+  ResizeBilinearOpModel m({TensorType_UINT8, {1, 1, 2, 1}});
+  m.SetInput<uint8>({3, 6});
+  m.SetSize({1, 3});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<uint8>(),
+              ElementsAreArray(ArrayFloatNear({3, 5, 6})));
+
+  ResizeBilinearOpModel const_m({TensorType_UINT8, {1, 1, 2, 1}}, {1, 3});
+  const_m.SetInput<uint8>({3, 6});
   const_m.Invoke();
-  EXPECT_THAT(const_m.GetOutput(), ElementsAreArray(ArrayFloatNear({3, 5, 6})));
+  EXPECT_THAT(const_m.GetOutput<uint8>(),
+              ElementsAreArray(ArrayFloatNear({3, 5, 6})));
 }
 
 TEST(ResizeBilinearOpTest, VerticalResize) {
   ResizeBilinearOpModel m({TensorType_FLOAT32, {1, 2, 1, 1}});
-  m.SetInput({3, 9});
+  m.SetInput<float>({3, 9});
   m.SetSize({3, 1});
   m.Invoke();
-  EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear({3, 7, 9})));
+  EXPECT_THAT(m.GetOutput<float>(),
+              ElementsAreArray(ArrayFloatNear({3, 7, 9})));
 
   ResizeBilinearOpModel const_m({TensorType_FLOAT32, {1, 2, 1, 1}}, {3, 1});
-  const_m.SetInput({3, 9});
+  const_m.SetInput<float>({3, 9});
+  const_m.Invoke();
+  EXPECT_THAT(const_m.GetOutput<float>(),
+              ElementsAreArray(ArrayFloatNear({3, 7, 9})));
+}
+
+TEST(ResizeBilinearOpTest, VerticalResize8Bit) {
+  ResizeBilinearOpModel m({TensorType_UINT8, {1, 2, 1, 1}});
+  m.SetInput<uint8>({3, 9});
+  m.SetSize({3, 1});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<uint8>(),
+              ElementsAreArray(ArrayFloatNear({3, 7, 9})));
+
+  ResizeBilinearOpModel const_m({TensorType_UINT8, {1, 2, 1, 1}}, {3, 1});
+  const_m.SetInput<uint8>({3, 9});
   const_m.Invoke();
-  EXPECT_THAT(const_m.GetOutput(), ElementsAreArray(ArrayFloatNear({3, 7, 9})));
+  EXPECT_THAT(const_m.GetOutput<uint8>(),
+              ElementsAreArray(ArrayFloatNear({3, 7, 9})));
 }
 
 TEST(ResizeBilinearOpTest, TwoDimensionalResize) {
   ResizeBilinearOpModel m({TensorType_FLOAT32, {1, 2, 2, 1}});
-  m.SetInput({
+  m.SetInput<float>({
       3, 6,  //
       9, 12  //
   });
   m.SetSize({3, 3});
   m.Invoke();
-  EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear({
-                                 3, 5, 6,    //
-                                 7, 9, 10,   //
-                                 9, 11, 12,  //
-                             })));
+  EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray(ArrayFloatNear({
+                                        3, 5, 6,    //
+                                        7, 9, 10,   //
+                                        9, 11, 12,  //
+                                    })));
 
   ResizeBilinearOpModel const_m({TensorType_FLOAT32, {1, 2, 2, 1}}, {3, 3});
-  const_m.SetInput({
+  const_m.SetInput<float>({
       3, 6,  //
       9, 12  //
   });
   const_m.Invoke();
-  EXPECT_THAT(const_m.GetOutput(), ElementsAreArray(ArrayFloatNear({
-                                       3, 5, 6,    //
-                                       7, 9, 10,   //
-                                       9, 11, 12,  //
-                                   })));
+  EXPECT_THAT(const_m.GetOutput<float>(), ElementsAreArray(ArrayFloatNear({
+                                              3, 5, 6,    //
+                                              7, 9, 10,   //
+                                              9, 11, 12,  //
+                                          })));
+}
+
+TEST(ResizeBilinearOpTest, TwoDimensionalResize8Bit) {
+  ResizeBilinearOpModel m({TensorType_UINT8, {1, 2, 2, 1}});
+  m.SetInput<uint8>({
+      3, 6,  //
+      9, 12  //
+  });
+  m.SetSize({3, 3});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<uint8>(), ElementsAreArray(ArrayFloatNear({
+                                        3, 5, 6,    //
+                                        7, 9, 10,   //
+                                        9, 11, 12,  //
+                                    })));
+
+  ResizeBilinearOpModel const_m({TensorType_UINT8, {1, 2, 2, 1}}, {3, 3});
+  const_m.SetInput<uint8>({
+      3, 6,  //
+      9, 12  //
+  });
+  const_m.Invoke();
+  EXPECT_THAT(const_m.GetOutput<uint8>(), ElementsAreArray(ArrayFloatNear({
+                                              3, 5, 6,    //
+                                              7, 9, 10,   //
+                                              9, 11, 12,  //
+                                          })));
 }
 
 TEST(ResizeBilinearOpTest, TwoDimensionalResizeWithTwoBatches) {
   ResizeBilinearOpModel m({TensorType_FLOAT32, {2, 2, 2, 1}});
-  m.SetInput({
+  m.SetInput<float>({
       3, 6,   //
       9, 12,  //
       4, 10,  //
@@ -121,60 +187,123 @@ TEST(ResizeBilinearOpTest, TwoDimensionalResizeWithTwoBatches) {
   });
   m.SetSize({3, 3});
   m.Invoke();
-  EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear({
-                                 3, 5, 6,     //
-                                 7, 9, 10,    //
-                                 9, 11, 12,   //
-                                 4, 8, 10,    //
-                                 8, 12, 14,   //
-                                 10, 14, 16,  //
-                             })));
+  EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray(ArrayFloatNear({
+                                        3, 5, 6,     //
+                                        7, 9, 10,    //
+                                        9, 11, 12,   //
+                                        4, 8, 10,    //
+                                        8, 12, 14,   //
+                                        10, 14, 16,  //
+                                    })));
 
   ResizeBilinearOpModel const_m({TensorType_FLOAT32, {2, 2, 2, 1}}, {3, 3});
-  const_m.SetInput({
+  const_m.SetInput<float>({
       3, 6,   //
       9, 12,  //
       4, 10,  //
       10, 16  //
   });
   const_m.Invoke();
-  EXPECT_THAT(const_m.GetOutput(), ElementsAreArray(ArrayFloatNear({
-                                       3, 5, 6,     //
-                                       7, 9, 10,    //
-                                       9, 11, 12,   //
-                                       4, 8, 10,    //
-                                       8, 12, 14,   //
-                                       10, 14, 16,  //
-                                   })));
+  EXPECT_THAT(const_m.GetOutput<float>(), ElementsAreArray(ArrayFloatNear({
+                                              3, 5, 6,     //
+                                              7, 9, 10,    //
+                                              9, 11, 12,   //
+                                              4, 8, 10,    //
+                                              8, 12, 14,   //
+                                              10, 14, 16,  //
+                                          })));
 }
 
 TEST(ResizeBilinearOpTest, ThreeDimensionalResize) {
   ResizeBilinearOpModel m({TensorType_FLOAT32, {1, 2, 2, 2}});
-  m.SetInput({
+  m.SetInput<float>({
       3, 4, 6, 10,    //
       9, 10, 12, 16,  //
   });
   m.SetSize({3, 3});
   m.Invoke();
-  EXPECT_THAT(m.GetOutput(), ElementsAreArray(ArrayFloatNear({
-                                 3, 4, 5, 8, 6, 10,      //
-                                 7, 8, 9, 12, 10, 14,    //
-                                 9, 10, 11, 14, 12, 16,  //
-                             })));
+  EXPECT_THAT(m.GetOutput<float>(), ElementsAreArray(ArrayFloatNear({
+                                        3, 4, 5, 8, 6, 10,      //
+                                        7, 8, 9, 12, 10, 14,    //
+                                        9, 10, 11, 14, 12, 16,  //
+                                    })));
 
   ResizeBilinearOpModel const_m({TensorType_FLOAT32, {1, 2, 2, 2}}, {3, 3});
-  const_m.SetInput({
+  const_m.SetInput<float>({
       3, 4, 6, 10,    //
       9, 10, 12, 16,  //
   });
   const_m.Invoke();
-  EXPECT_THAT(const_m.GetOutput(), ElementsAreArray(ArrayFloatNear({
-                                       3, 4, 5, 8, 6, 10,      //
-                                       7, 8, 9, 12, 10, 14,    //
-                                       9, 10, 11, 14, 12, 16,  //
-                                   })));
+  EXPECT_THAT(const_m.GetOutput<float>(), ElementsAreArray(ArrayFloatNear({
+                                              3, 4, 5, 8, 6, 10,      //
+                                              7, 8, 9, 12, 10, 14,    //
+                                              9, 10, 11, 14, 12, 16,  //
+                                          })));
+}
+
+TEST(ResizeBilinearOpTest, TwoDimensionalResizeWithTwoBatches8Bit) {
+  ResizeBilinearOpModel m({TensorType_UINT8, {2, 2, 2, 1}});
+  m.SetInput<uint8>({
+      3, 6,   //
+      9, 12,  //
+      4, 10,  //
+      10, 16  //
+  });
+  m.SetSize({3, 3});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<uint8>(), ElementsAreArray(ArrayFloatNear({
+                                        3, 5, 6,     //
+                                        7, 9, 10,    //
+                                        9, 11, 12,   //
+                                        4, 8, 10,    //
+                                        8, 12, 14,   //
+                                        10, 13, 16,  //
+                                    })));
+
+  ResizeBilinearOpModel const_m({TensorType_UINT8, {2, 2, 2, 1}}, {3, 3});
+  const_m.SetInput<uint8>({
+      3, 6,   //
+      9, 12,  //
+      4, 10,  //
+      10, 16  //
+  });
+  const_m.Invoke();
+  EXPECT_THAT(const_m.GetOutput<uint8>(), ElementsAreArray(ArrayFloatNear({
+                                              3, 5, 6,     //
+                                              7, 9, 10,    //
+                                              9, 11, 12,   //
+                                              4, 8, 10,    //
+                                              8, 12, 14,   //
+                                              10, 13, 16,  //
+                                          })));
 }
 
+TEST(ResizeBilinearOpTest, ThreeDimensionalResize8Bit) {
+  ResizeBilinearOpModel m({TensorType_UINT8, {1, 2, 2, 2}});
+  m.SetInput<uint8>({
+      3, 4, 6, 10,    //
+      9, 10, 12, 16,  //
+  });
+  m.SetSize({3, 3});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput<uint8>(), ElementsAreArray(ArrayFloatNear({
+                                        3, 4, 5, 8, 6, 10,      //
+                                        7, 8, 9, 12, 10, 14,    //
+                                        9, 10, 11, 13, 12, 16,  //
+                                    })));
+
+  ResizeBilinearOpModel const_m({TensorType_UINT8, {1, 2, 2, 2}}, {3, 3});
+  const_m.SetInput<uint8>({
+      3, 4, 6, 10,    //
+      9, 10, 12, 16,  //
+  });
+  const_m.Invoke();
+  EXPECT_THAT(const_m.GetOutput<uint8>(), ElementsAreArray(ArrayFloatNear({
+                                              3, 4, 5, 8, 6, 10,      //
+                                              7, 8, 9, 12, 10, 14,    //
+                                              9, 10, 11, 13, 12, 16,  //
+                                          })));
+}
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/hardcode_min_max.cc b/tensorflow/contrib/lite/toco/graph_transformations/hardcode_min_max.cc
index d63ee7c951..bda6dce22b 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/hardcode_min_max.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/hardcode_min_max.cc
@@ -362,6 +362,8 @@ bool HardcodeMinMax::Run(Model* model, std::size_t op_index) {
       changed = HardcodeMinMaxForAverageOrMaxPool(model, op);
       break;
 
+    case OperatorType::kResizeBilinear:
+    case OperatorType::kSlice:
     case OperatorType::kStridedSlice:
     case OperatorType::kSqueeze:
     case OperatorType::kTensorFlowReshape:
diff --git a/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc b/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc
index d4b5920760..eca2c701f8 100644
--- a/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc
+++ b/tensorflow/contrib/lite/toco/graph_transformations/quantize.cc
@@ -45,7 +45,8 @@ bool SupportsQuantization(const Operator& op) {
          type == OperatorType::kTensorFlowMinimum ||
          type == OperatorType::kTensorFlowMaximum ||
          type == OperatorType::kLogistic || type == OperatorType::kSoftmax ||
-         type == OperatorType::kLogSoftmax ||
+         type == OperatorType::kLogSoftmax || type == OperatorType::kSlice ||
+         type == OperatorType::kResizeBilinear ||
          type == OperatorType::kTensorFlowSplit || type == OperatorType::kSub ||
          type == OperatorType::kSqueeze || type == OperatorType::kPad ||
          type == OperatorType::kPadV2 ||
-- 
GitLab


From 61bcb1e21b304255f6ad1faddb9b4487cc2424d8 Mon Sep 17 00:00:00 2001
From: David Norman <DavidNorman@users.noreply.github.com>
Date: Mon, 11 Jun 2018 17:14:54 -0700
Subject: [PATCH 573/610] [XLA] Allow the tuple simplifier to operate on only
 subcomputations (#19769)

* Allow the tuple simplifier to operate on only subcomputations

* Remove unnecessary trace

* Add a test for the tuple simplifier

Summary: Adding a test for the tuple simplifier following review of public Pull Request

Test Plan: ran this specific test, and all existing poplar tests

Reviewers: jamesn

Reviewed By: jamesn

Differential Revision: https://phabricator.sourcevertex.net/D4548

* Add comment to the parameter in the default constructor

* Correct clang-tidy linting issue
---
 .../compiler/xla/service/tuple_simplifier.cc  |  7 ++
 .../compiler/xla/service/tuple_simplifier.h   |  9 ++-
 .../xla/service/tuple_simplifier_test.cc      | 77 +++++++++++++++++++
 3 files changed, 92 insertions(+), 1 deletion(-)

diff --git a/tensorflow/compiler/xla/service/tuple_simplifier.cc b/tensorflow/compiler/xla/service/tuple_simplifier.cc
index e536c8afbf..77bdcc9de0 100644
--- a/tensorflow/compiler/xla/service/tuple_simplifier.cc
+++ b/tensorflow/compiler/xla/service/tuple_simplifier.cc
@@ -30,10 +30,17 @@ limitations under the License.
 
 namespace xla {
 
+TupleSimplifier::TupleSimplifier(bool exclude_entry_computation) :
+    exclude_entry_computation_(exclude_entry_computation) {}
+
 StatusOr<bool> TupleSimplifier::Run(HloModule* module) {
   // Initially add all GTE and Tuple instructions to the worklist.
   std::queue<HloInstruction*> worklist;
   for (auto* computation : module->computations()) {
+    if (exclude_entry_computation_ &&
+        computation == module->entry_computation()) {
+      continue;
+    }
     for (auto* instruction : computation->instructions()) {
       if (instruction->opcode() == HloOpcode::kTuple ||
           instruction->opcode() == HloOpcode::kGetTupleElement) {
diff --git a/tensorflow/compiler/xla/service/tuple_simplifier.h b/tensorflow/compiler/xla/service/tuple_simplifier.h
index e5e9b10b5b..7509501883 100644
--- a/tensorflow/compiler/xla/service/tuple_simplifier.h
+++ b/tensorflow/compiler/xla/service/tuple_simplifier.h
@@ -27,13 +27,20 @@ namespace xla {
 // the module.
 class TupleSimplifier : public HloPassInterface {
  public:
-  TupleSimplifier() {}
+  TupleSimplifier() : TupleSimplifier(/*exclude_entry_computation=*/false) {}
+  explicit TupleSimplifier(bool exclude_entry_computation);
   ~TupleSimplifier() override {}
   tensorflow::StringPiece name() const override { return "tuple-simplifier"; }
 
   // Run tuple simplification on the given computation. Returns whether the
   // computation was changed.
   StatusOr<bool> Run(HloModule* module) override;
+
+ private:
+  // When set, this pipeline stage will perform optimization of all computations
+  // apart from the module's entry computation. This is used by Graphcore's
+  // backend.
+  bool exclude_entry_computation_;
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/tuple_simplifier_test.cc b/tensorflow/compiler/xla/service/tuple_simplifier_test.cc
index ca9ae91281..d3635eae81 100644
--- a/tensorflow/compiler/xla/service/tuple_simplifier_test.cc
+++ b/tensorflow/compiler/xla/service/tuple_simplifier_test.cc
@@ -42,6 +42,12 @@ class TupleSimplifierTest : public HloTestBase {
     TF_ASSERT_OK(changed_status.status());
     EXPECT_EQ(change_expected, changed_status.ValueOrDie());
   }
+  void Run(HloModule* module, bool change_expected, bool exclude_entry) {
+    TupleSimplifier simplifier(exclude_entry);
+    auto changed_status = simplifier.Run(module);
+    TF_ASSERT_OK(changed_status.status());
+    EXPECT_EQ(change_expected, changed_status.ValueOrDie());
+  }
 
   const Shape scalar_shape_ = ShapeUtil::MakeShape(F32, {});
   const Shape tuple_shape_ = ShapeUtil::MakeTupleShape(
@@ -211,5 +217,76 @@ TEST_F(TupleSimplifierTest, IncompatibleTuples) {
   EXPECT_THAT(computation->root_instruction(), tuple);
 }
 
+TEST_F(TupleSimplifierTest, CanExcludeEntryComputation) {
+  // Verify that the entry computation can be excluded.
+  auto module = CreateNewModule();
+
+  HloInstruction* p0;
+  HloInstruction* p1;
+  HloComputation* c0;
+  HloComputation* c1;
+  HloComputation* entry;
+
+  {
+    HloComputation::Builder builder(TestName() + "_1");
+    p0 = builder.AddInstruction(
+        HloInstruction::CreateParameter(0, tuple_shape_, "param"));
+    HloInstruction* gte0 = builder.AddInstruction(
+        HloInstruction::CreateGetTupleElement(scalar_shape_, p0, 0));
+    HloInstruction* gte1 = builder.AddInstruction(
+        HloInstruction::CreateGetTupleElement(scalar_shape_, p0, 1));
+    HloInstruction* gte2 = builder.AddInstruction(
+        HloInstruction::CreateGetTupleElement(scalar_shape_, p0, 2));
+
+    builder.AddInstruction(HloInstruction::CreateTuple({gte0, gte1, gte2}));
+
+    c0 = module->AddEmbeddedComputation(builder.Build());
+  }
+  {
+    HloComputation::Builder builder(TestName() + "_2");
+    p1 = builder.AddInstruction(
+        HloInstruction::CreateParameter(0, tuple_shape_, "param"));
+    HloInstruction* gte0 = builder.AddInstruction(
+        HloInstruction::CreateGetTupleElement(scalar_shape_, p1, 0));
+    HloInstruction* gte1 = builder.AddInstruction(
+        HloInstruction::CreateGetTupleElement(scalar_shape_, p1, 1));
+    HloInstruction* gte2 = builder.AddInstruction(
+        HloInstruction::CreateGetTupleElement(scalar_shape_, p1, 2));
+
+    builder.AddInstruction(HloInstruction::CreateTuple({gte0, gte1, gte2}));
+
+    c1 = module->AddEmbeddedComputation(builder.Build());
+  }
+  {
+    HloComputation::Builder builder(TestName() + "_Entry");
+    HloInstruction* tuple_param = builder.AddInstruction(
+        HloInstruction::CreateParameter(0, tuple_shape_, "param"));
+    HloInstruction* call0 = builder.AddInstruction(
+        HloInstruction::CreateCall(tuple_shape_, {tuple_param}, c0));
+    HloInstruction* call1 = builder.AddInstruction(
+        HloInstruction::CreateCall(tuple_shape_, {tuple_param}, c1));
+    HloInstruction* gte0 = builder.AddInstruction(
+        HloInstruction::CreateGetTupleElement(scalar_shape_, call0, 0));
+    HloInstruction* gte1 = builder.AddInstruction(
+        HloInstruction::CreateGetTupleElement(scalar_shape_, call1, 1));
+    HloInstruction* tuple0 =
+        builder.AddInstruction(HloInstruction::CreateTuple({gte0, gte1}));
+    HloInstruction* gte2 = builder.AddInstruction(
+        HloInstruction::CreateGetTupleElement(scalar_shape_, tuple0, 0));
+    HloInstruction* gte3 = builder.AddInstruction(
+        HloInstruction::CreateGetTupleElement(scalar_shape_, tuple0, 1));
+
+    builder.AddInstruction(HloInstruction::CreateTuple({gte2, gte3}));
+
+    entry = module->AddEntryComputation(builder.Build());
+  }
+
+  Run(module.get(), /*change_expected=*/true, /*exclude_entry=*/ true);
+
+  EXPECT_THAT(c0->root_instruction(), p0);
+  EXPECT_THAT(c1->root_instruction(), p1);
+  EXPECT_THAT(entry->instruction_count(), 9);
+}
+
 }  // namespace
 }  // namespace xla
-- 
GitLab


From c8980fd1b4d3a74de0214690f810d0c93da2558f Mon Sep 17 00:00:00 2001
From: Yu-Cheng Ling <ycling@google.com>
Date: Mon, 11 Jun 2018 17:12:31 -0700
Subject: [PATCH 574/610] Minor refactoring - Put together the ops with no
 option structs.

PiperOrigin-RevId: 200139790
---
 tensorflow/contrib/lite/model.cc | 96 +++++++++++++-------------------
 1 file changed, 38 insertions(+), 58 deletions(-)

diff --git a/tensorflow/contrib/lite/model.cc b/tensorflow/contrib/lite/model.cc
index 4fb1ada9fd..039f32b38e 100644
--- a/tensorflow/contrib/lite/model.cc
+++ b/tensorflow/contrib/lite/model.cc
@@ -322,12 +322,6 @@ TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type,
 
   *builtin_data = nullptr;
   switch (op_type) {
-    case BuiltinOperator_CALL:
-      // TODO(aselle): Implement call in BuiltinOptions, but nullptrs are
-      // ok for now, since there is no call implementation either.
-      break;
-    case BuiltinOperator_CUSTOM:
-      break;
     case BuiltinOperator_CONV_2D: {
       TfLiteConvParams* params = MallocPOD<TfLiteConvParams>();
       if (auto* conv_params = op->builtin_options_as_Conv2DOptions()) {
@@ -343,22 +337,6 @@ TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type,
       *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
-    case BuiltinOperator_TANH:
-    case BuiltinOperator_LOGISTIC:
-    case BuiltinOperator_RELU:
-    case BuiltinOperator_RELU_N1_TO_1:
-    case BuiltinOperator_RELU6:
-    case BuiltinOperator_CONCAT_EMBEDDINGS:
-    case BuiltinOperator_EXP:
-    case BuiltinOperator_TOPK_V2:
-    case BuiltinOperator_LOG_SOFTMAX:
-    case BuiltinOperator_DEQUANTIZE:
-    case BuiltinOperator_PRELU:
-    case BuiltinOperator_FLOOR:
-    case BuiltinOperator_NEG:
-    case BuiltinOperator_SIN:
-    case BuiltinOperator_LOG:
-      break;
     case BuiltinOperator_CAST: {
       TfLiteCastParams* params = MallocPOD<TfLiteCastParams>();
       if (auto* schema_params = op->builtin_options_as_CastOptions()) {
@@ -446,9 +424,6 @@ TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type,
       *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
-    case BuiltinOperator_EMBEDDING_LOOKUP:
-      // no-op.
-      break;
     case BuiltinOperator_EMBEDDING_LOOKUP_SPARSE: {
       TfLiteEmbeddingLookupSparseParams* params =
           MallocPOD<TfLiteEmbeddingLookupSparseParams>();
@@ -580,12 +555,6 @@ TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type,
       *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
-    case BuiltinOperator_PAD: {
-      break;
-    }
-    case BuiltinOperator_PADV2: {
-      break;
-    }
     case BuiltinOperator_RESHAPE: {
       auto* params = MallocPOD<TfLiteReshapeParams>();
       if (auto* schema_params = op->builtin_options_as_ReshapeOptions()) {
@@ -625,15 +594,6 @@ TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type,
       *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
-    case BuiltinOperator_SPACE_TO_BATCH_ND: {
-      break;
-    }
-    case BuiltinOperator_BATCH_TO_SPACE_ND: {
-      break;
-    }
-    case BuiltinOperator_TRANSPOSE: {
-      break;
-    }
     case BuiltinOperator_MEAN: {
       auto* params = MallocPOD<TfLiteMeanParams>();
       if (auto* schema_params = op->builtin_options_as_MeanOptions()) {
@@ -673,10 +633,6 @@ TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type,
       *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
-    case BuiltinOperator_MAXIMUM:
-    case BuiltinOperator_MINIMUM: {
-      break;
-    }
     case BuiltinOperator_ARG_MAX: {
       auto* params = MallocPOD<TfLiteArgMaxParams>();
       if (auto* schema_params = op->builtin_options_as_ArgMaxOptions()) {
@@ -686,18 +642,6 @@ TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type,
       *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
-    case BuiltinOperator_GREATER:
-    case BuiltinOperator_GREATER_EQUAL:
-    case BuiltinOperator_LESS:
-    case BuiltinOperator_LESS_EQUAL:
-    case BuiltinOperator_EQUAL:
-    case BuiltinOperator_NOT_EQUAL:
-    case BuiltinOperator_SELECT: {
-      break;
-    }
-    case BuiltinOperator_SLICE: {
-      break;
-    }
     case BuiltinOperator_TRANSPOSE_CONV: {
       TfLiteTransposeConvParams* params =
           MallocPOD<TfLiteTransposeConvParams>();
@@ -725,10 +669,46 @@ TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type,
       error_reporter->Report("DELEGATE op shouldn't exist in model.");
       return kTfLiteError;
     }
+
+    // Below are the ops with no builtin_data structure.
+    case BuiltinOperator_BATCH_TO_SPACE_ND:
+    // TODO(aselle): Implement call in BuiltinOptions, but nullptrs are
+    // ok for now, since there is no call implementation either.
+    case BuiltinOperator_CALL:
+    case BuiltinOperator_CONCAT_EMBEDDINGS:
+    case BuiltinOperator_CUSTOM:
+    case BuiltinOperator_DEQUANTIZE:
+    case BuiltinOperator_EMBEDDING_LOOKUP:
+    case BuiltinOperator_EQUAL:
+    case BuiltinOperator_EXP:
     case BuiltinOperator_EXPAND_DIMS:
-    case BuiltinOperator_TILE: {
+    case BuiltinOperator_FLOOR:
+    case BuiltinOperator_GREATER:
+    case BuiltinOperator_GREATER_EQUAL:
+    case BuiltinOperator_LESS:
+    case BuiltinOperator_LESS_EQUAL:
+    case BuiltinOperator_LOG:
+    case BuiltinOperator_LOGISTIC:
+    case BuiltinOperator_LOG_SOFTMAX:
+    case BuiltinOperator_MAXIMUM:
+    case BuiltinOperator_MINIMUM:
+    case BuiltinOperator_NEG:
+    case BuiltinOperator_NOT_EQUAL:
+    case BuiltinOperator_PAD:
+    case BuiltinOperator_PADV2:
+    case BuiltinOperator_PRELU:
+    case BuiltinOperator_RELU:
+    case BuiltinOperator_RELU6:
+    case BuiltinOperator_RELU_N1_TO_1:
+    case BuiltinOperator_SELECT:
+    case BuiltinOperator_SIN:
+    case BuiltinOperator_SLICE:
+    case BuiltinOperator_SPACE_TO_BATCH_ND:
+    case BuiltinOperator_TANH:
+    case BuiltinOperator_TILE:
+    case BuiltinOperator_TOPK_V2:
+    case BuiltinOperator_TRANSPOSE:
       break;
-    }
   }
   return kTfLiteOk;
 }
-- 
GitLab


From c169282cfe03e146350d2e17f79be4bf759c4146 Mon Sep 17 00:00:00 2001
From: Clayne Robison <clayne.b.robison@intel.com>
Date: Mon, 11 Jun 2018 17:15:38 -0700
Subject: [PATCH 575/610] [Intel MKL] Remove use of absl::string_view (#19869)

* Remove use of absl::string_view

* Using tensorflow::StringPiece

* Revert const string& to google formatting style.
---
 tensorflow/core/util/mkl_util.h | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h
index 7fc9d69a9f..90b6533690 100644
--- a/tensorflow/core/util/mkl_util.h
+++ b/tensorflow/core/util/mkl_util.h
@@ -42,6 +42,7 @@ limitations under the License.
 
 #ifndef INTEL_MKL_ML
 #include "mkldnn.hpp"
+#include "tensorflow/core/lib/core/stringpiece.h"
 
 using mkldnn::engine;
 using mkldnn::memory;
@@ -1876,7 +1877,7 @@ class FactoryKeyCreator {
   template <typename T>
   void AddAsKey(const T data) {
     auto buffer = reinterpret_cast<const char *>(&data);
-    Append(absl::string_view(buffer, sizeof(T)));
+    Append(StringPiece(buffer, sizeof(T)));
   }
 
   std::string GetKey() {
@@ -1887,8 +1888,8 @@ class FactoryKeyCreator {
   string key_;
   const char delimiter = 'x';
   const int kMaxKeyLength = 256;
-  void Append(absl::string_view s) {
-    key_.append(string(s));
+  void Append(StringPiece s) {
+    key_.append(s.ToString());
     key_.append(1, delimiter);
   }
 };
-- 
GitLab


From bbee0c4c26d94aa7f0115f984116167052afa11e Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 11 Jun 2018 17:13:19 -0700
Subject: [PATCH 576/610] Checking that TPUEstimator model function features
 have static shapes.

PiperOrigin-RevId: 200139880
---
 .../contrib/tpu/python/tpu/tpu_estimator.py   | 47 +++++++++++++++++++
 1 file changed, 47 insertions(+)

diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
index 64ae35dfc5..2521522752 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
@@ -1343,8 +1343,55 @@ class _ModelFnWrapper(object):
                 key, tensor))
     return predictions
 
+  def _validate_model_features_and_labels(self,
+                                          features,
+                                          labels,
+                                          is_export_mode):
+    """Validates that the features and labels for the model function are valid.
+
+    A valid features/labels object is the one with:
+    - Type: Tensor or a dictionary of Tensors
+    - Static shape if is_export_mode is False.
+
+    Args:
+      features: the features that would be input to the model function.
+      labels: the labels that would be input to the model function.
+      is_export_mode: boolean value specifying if in export mode.
+
+    Raises:
+      TypeError: If features/labels are not of the correct type.
+      ValueError: If features/labels have dynamic shape.
+    """
+
+    def validate(obj, obj_name):
+      """Helper validate function."""
+      if not isinstance(obj, ops.Tensor) and not isinstance(obj, dict):
+        raise TypeError(
+            'The {} to the model returned by input_fn must be either a Tensor '
+            'or a dictionary of Tensors. {}: {}'.format(obj_name, obj_name,
+                                                        obj))
+      if is_export_mode:
+        return
+      if isinstance(obj, ops.Tensor):
+        if not obj.get_shape().is_fully_defined():
+          raise ValueError(
+              'The {} to the model returned by input_fn must have static shape.'
+              ' Tensor: {}'.format(obj_name, obj))
+      else:
+        for (key, tensor) in obj.items():
+          if not tensor.get_shape().is_fully_defined():
+            raise ValueError(
+                'The {} to the model returned by input_fn must have static '
+                'shape. Key: \'{}\', Tensor: {}'.format(
+                    obj_name, key, tensor))
+
+    validate(features, 'features')
+    if labels is not None:
+      validate(labels, 'labels')
+
   def _call_model_fn(self, features, labels, is_export_mode=False):
     """Calls the model_fn with required parameters."""
+    self._validate_model_features_and_labels(features, labels, is_export_mode)
     model_fn_args = function_utils.fn_args(self._model_fn)
     kwargs = {}
 
-- 
GitLab


From 5ebfc750447fd100e1b1c3bd747b87f460b50a81 Mon Sep 17 00:00:00 2001
From: Anna R <annarev@google.com>
Date: Mon, 11 Jun 2018 17:21:06 -0700
Subject: [PATCH 577/610] Add module docstrings that have been missing since
 new API generation was added.

PiperOrigin-RevId: 200140810
---
 tensorflow/tools/api/generator/BUILD          | 24 ++++++
 .../tools/api/generator/create_python_api.py  | 52 ++++++++++--
 tensorflow/tools/api/generator/doc_srcs.py    | 65 +++++++++++++++
 .../tools/api/generator/doc_srcs_test.py      | 80 +++++++++++++++++++
 4 files changed, 215 insertions(+), 6 deletions(-)
 create mode 100644 tensorflow/tools/api/generator/doc_srcs.py
 create mode 100644 tensorflow/tools/api/generator/doc_srcs_test.py

diff --git a/tensorflow/tools/api/generator/BUILD b/tensorflow/tools/api/generator/BUILD
index f0c5877a90..3a28153e52 100644
--- a/tensorflow/tools/api/generator/BUILD
+++ b/tensorflow/tools/api/generator/BUILD
@@ -5,12 +5,21 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
+load("//tensorflow/tools/api/generator:api_gen.bzl", "TENSORFLOW_API_INIT_FILES")
+
+py_library(
+    name = "doc_srcs",
+    srcs = ["doc_srcs.py"],
+    srcs_version = "PY2AND3",
+)
+
 py_binary(
     name = "create_python_api",
     srcs = ["create_python_api.py"],
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
     deps = [
+        ":doc_srcs",
         "//tensorflow/python:no_contrib",
     ],
 )
@@ -24,3 +33,18 @@ py_test(
         "//tensorflow/python:client_testlib",
     ],
 )
+
+py_test(
+    name = "tensorflow_doc_srcs_test",
+    srcs = ["doc_srcs_test.py"],
+    args = [
+        "--package=tensorflow.python",
+    ] + TENSORFLOW_API_INIT_FILES,
+    main = "doc_srcs_test.py",
+    srcs_version = "PY2AND3",
+    deps = [
+        ":doc_srcs",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:no_contrib",
+    ],
+)
diff --git a/tensorflow/tools/api/generator/create_python_api.py b/tensorflow/tools/api/generator/create_python_api.py
index 972bdc84ae..24e3c784d5 100644
--- a/tensorflow/tools/api/generator/create_python_api.py
+++ b/tensorflow/tools/api/generator/create_python_api.py
@@ -26,6 +26,7 @@ import sys
 
 from tensorflow.python.util import tf_decorator
 from tensorflow.python.util import tf_export
+from tensorflow.tools.api.generator import doc_srcs
 
 API_ATTRS = tf_export.API_ATTRS
 
@@ -36,10 +37,9 @@ _SYMBOLS_TO_SKIP_EXPLICITLY = {
     # would have side effects.
     'tensorflow.python.platform.flags.FLAGS'
 }
-_GENERATED_FILE_HEADER = """\"\"\"Imports for Python API.
-
-This file is MACHINE GENERATED! Do not edit.
-Generated by: tensorflow/tools/api/generator/create_python_api.py script.
+_GENERATED_FILE_HEADER = """# This file is MACHINE GENERATED! Do not edit.
+# Generated by: tensorflow/tools/api/generator/create_python_api.py script.
+\"\"\"%s
 \"\"\"
 """
 
@@ -247,6 +247,44 @@ def get_module(dir_path, relative_to_dir):
   return dir_path.replace('/', '.').strip('.')
 
 
+def get_module_docstring(module_name, package):
+  """Get docstring for the given module.
+
+  This method looks for docstring in the following order:
+  1. Checks if module has a docstring specified in doc_srcs.
+  2. Checks if module has a docstring source module specified
+     in doc_srcs. If it does, gets docstring from that module.
+  3. Checks if module with module_name exists under base package.
+     If it does, gets docstring from that module.
+  4. Returns a default docstring.
+
+  Args:
+    module_name: module name relative to tensorflow
+      (excluding 'tensorflow.' prefix) to get a docstring for.
+    package: Base python package containing python modules with target
+      tf_export decorators.
+
+  Returns:
+    Docstring describing the module; may span multiple lines.
+  """
+  # Module under base package to get a docstring from.
+  docstring_module_name = module_name
+
+  if module_name in doc_srcs.TENSORFLOW_DOC_SOURCES:
+    docsrc = doc_srcs.TENSORFLOW_DOC_SOURCES[module_name]
+    if docsrc.docstring:
+      return docsrc.docstring
+    if docsrc.docstring_module_name:
+      docstring_module_name = docsrc.docstring_module_name
+
+  docstring_module_name = package + '.' + docstring_module_name
+  if (docstring_module_name in sys.modules and
+      sys.modules[docstring_module_name].__doc__):
+    return sys.modules[docstring_module_name].__doc__
+
+  return 'Public API for tf.%s namespace.' % module_name
+
+
 def create_api_files(
     output_files, package, root_init_template, output_dir, api_name):
   """Creates __init__.py files for the Python API.
@@ -290,7 +328,9 @@ def create_api_files(
       continue
     contents = ''
     if module or not root_init_template:
-      contents = _GENERATED_FILE_HEADER + text
+      contents = (
+          _GENERATED_FILE_HEADER %
+          get_module_docstring(module, package) + text)
     else:
       # Read base init file
       with open(root_init_template, 'r') as root_init_template_file:
@@ -303,7 +343,7 @@ def create_api_files(
     raise ValueError(
         'Missing outputs for python_api_gen genrule:\n%s.'
         'Make sure all required outputs are in the '
-        'tensorflow/tools/api/generator/BUILD file.' %
+        'tensorflow/tools/api/generator/api_gen.bzl file.' %
         ',\n'.join(sorted(missing_output_files)))
 
 
diff --git a/tensorflow/tools/api/generator/doc_srcs.py b/tensorflow/tools/api/generator/doc_srcs.py
new file mode 100644
index 0000000000..74f6db98fd
--- /dev/null
+++ b/tensorflow/tools/api/generator/doc_srcs.py
@@ -0,0 +1,65 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Specifies sources of doc strings for API modules."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+
+
+# Specifies docstring source for a module.
+# Only one of docstring or docstring_module_name should be set.
+# * If docstring is set, then we will use this docstring
+#   for the module.
+# * If docstring_module_name is set, then we will copy the docstring
+#   from the docstring source module.
+DocSource = collections.namedtuple(
+    'DocSource', ['docstring', 'docstring_module_name'])
+# Each attribute of DocSource is optional.
+DocSource.__new__.__defaults__ = (None,) * len(DocSource._fields)
+
+TENSORFLOW_DOC_SOURCES = {
+    'app': DocSource(docstring_module_name='platform.app'),
+    'compat': DocSource(docstring_module_name='util.compat'),
+    'distributions': DocSource(
+        docstring_module_name='ops.distributions.distributions'),
+    'bitwise': DocSource(docstring_module_name='ops.bitwise_ops'),
+    'errors': DocSource(docstring_module_name='framework.errors'),
+    'gfile': DocSource(docstring_module_name='platform.gfile'),
+    'graph_util': DocSource(docstring_module_name='framework.graph_util'),
+    'image': DocSource(docstring_module_name='ops.image_ops'),
+    'keras.estimator': DocSource(docstring_module_name='estimator.keras'),
+    'linalg': DocSource(docstring_module_name='ops.linalg_ops'),
+    'logging': DocSource(docstring_module_name='ops.logging_ops'),
+    'losses': DocSource(docstring_module_name='ops.losses.losses'),
+    'manip': DocSource(docstring_module_name='ops.manip_ops'),
+    'math': DocSource(docstring_module_name='ops.math_ops'),
+    'metrics': DocSource(docstring_module_name='ops.metrics'),
+    'nn': DocSource(docstring_module_name='ops.nn_ops'),
+    'nn.rnn_cell': DocSource(docstring_module_name='ops.rnn_cell'),
+    'python_io': DocSource(docstring_module_name='lib.io.python_io'),
+    'resource_loader': DocSource(
+        docstring_module_name='platform.resource_loader'),
+    'sets': DocSource(docstring_module_name='ops.sets'),
+    'sparse': DocSource(docstring_module_name='ops.sparse_ops'),
+    'spectral': DocSource(docstring_module_name='ops.spectral_ops'),
+    'strings': DocSource(docstring_module_name='ops.string_ops'),
+    'sysconfig': DocSource(docstring_module_name='platform.sysconfig'),
+    'test': DocSource(docstring_module_name='platform.test'),
+    'train': DocSource(docstring_module_name='training.training'),
+    'train.queue_runner': DocSource(
+        docstring_module_name='training.queue_runner'),
+}
diff --git a/tensorflow/tools/api/generator/doc_srcs_test.py b/tensorflow/tools/api/generator/doc_srcs_test.py
new file mode 100644
index 0000000000..9ba95a3439
--- /dev/null
+++ b/tensorflow/tools/api/generator/doc_srcs_test.py
@@ -0,0 +1,80 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+"""Tests for tensorflow.tools.api.generator.doc_srcs."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import importlib
+import sys
+
+from tensorflow.python.platform import test
+from tensorflow.tools.api.generator import doc_srcs
+
+
+FLAGS = None
+
+
+class DocSrcsTest(test.TestCase):
+
+  def testModulesAreValidAPIModules(self):
+    for module_name in doc_srcs.TENSORFLOW_DOC_SOURCES:
+      # Convert module_name to corresponding __init__.py file path.
+      file_path = module_name.replace('.', '/')
+      if file_path:
+        file_path += '/'
+      file_path += '__init__.py'
+
+      if file_path not in FLAGS.outputs:
+        self.fail('%s is not a valid API module' % module_name)
+
+  def testHaveDocstringOrDocstringModule(self):
+    for module_name, docsrc in doc_srcs.TENSORFLOW_DOC_SOURCES.items():
+      if docsrc.docstring and docsrc.docstring_module_name:
+        self.fail(
+            '%s contains DocSource has both a docstring and a '
+            'docstring_module_name. '
+            'Only one of "docstring" or "docstring_module_name" should be set.'
+            % (module_name))
+
+  def testDocstringModulesAreValidModules(self):
+    for docsrc in doc_srcs.TENSORFLOW_DOC_SOURCES.values():
+      if docsrc.docstring_module_name:
+        doc_module_name = '.'.join([
+            FLAGS.package, docsrc.docstring_module_name])
+        if doc_module_name not in sys.modules:
+          self.fail(
+              'docsources_module %s is not a valid module under %s.' %
+              (docsrc.docstring_module_name, FLAGS.package))
+
+
+if __name__ == '__main__':
+  parser = argparse.ArgumentParser()
+  parser.add_argument(
+      'outputs', metavar='O', type=str, nargs='+',
+      help='create_python_api output files.')
+  parser.add_argument(
+      '--package', type=str,
+      help='Base package that imports modules containing the target tf_export '
+           'decorators.')
+  FLAGS, unparsed = parser.parse_known_args()
+
+  importlib.import_module(FLAGS.package)
+
+  # Now update argv, so that unittest library does not get confused.
+  sys.argv = [sys.argv[0]] + unparsed
+  test.main()
-- 
GitLab


From 8c5d37c3b96cdbcb8a3b657144d4fb63fb3dc100 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 11 Jun 2018 17:24:32 -0700
Subject: [PATCH 578/610] Add `move_dimension` utility to move a single
 dimension within a Tensor.

PiperOrigin-RevId: 200141207
---
 .../kernel_tests/distribution_util_test.py    | 48 +++++++++++
 .../python/ops/distribution_util.py           | 79 +++++++++++++++++++
 2 files changed, 127 insertions(+)

diff --git a/tensorflow/contrib/distributions/python/kernel_tests/distribution_util_test.py b/tensorflow/contrib/distributions/python/kernel_tests/distribution_util_test.py
index 31d24aa9ea..bbbec2103a 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/distribution_util_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/distribution_util_test.py
@@ -29,7 +29,9 @@ from tensorflow.contrib.distributions.python.ops import mvn_diag
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import random_ops
 from tensorflow.python.ops.distributions import categorical
 from tensorflow.python.ops.distributions import normal
 from tensorflow.python.ops.linalg import linear_operator_diag
@@ -540,5 +542,51 @@ class PadDynamicTest(_PadTest, test.TestCase):
     return False
 
 
+class TestMoveDimension(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_move_dimension_static_shape(self):
+    """Checks result shapes when the input has a fully static shape."""
+    x = random_ops.random_normal(shape=[200, 30, 4, 1, 6])
+
+    x_perm = distribution_util.move_dimension(x, 1, 1)
+    self.assertAllEqual(x_perm.shape.as_list(), [200, 30, 4, 1, 6])
+
+    x_perm = distribution_util.move_dimension(x, 0, 3)
+    self.assertAllEqual(x_perm.shape.as_list(), [30, 4, 1, 200, 6])
+
+    x_perm = distribution_util.move_dimension(x, 0, -2)
+    self.assertAllEqual(x_perm.shape.as_list(), [30, 4, 1, 200, 6])
+
+    x_perm = distribution_util.move_dimension(x, 4, 2)
+    self.assertAllEqual(x_perm.shape.as_list(), [200, 30, 6, 4, 1])
+
+  @test_util.run_in_graph_and_eager_modes()
+  def test_move_dimension_dynamic_shape(self):
+    """Checks result shapes when the input's static shape is unknown."""
+    x_ = random_ops.random_normal(shape=[200, 30, 4, 1, 6])
+    x = array_ops.placeholder_with_default(input=x_, shape=None)
+
+    x_perm = distribution_util.move_dimension(x, 1, 1)
+    self.assertAllEqual(self.evaluate(array_ops.shape(x_perm)),
+                        [200, 30, 4, 1, 6])
+
+    x_perm = distribution_util.move_dimension(x, 0, 3)
+    self.assertAllEqual(self.evaluate(array_ops.shape(x_perm)),
+                        [30, 4, 1, 200, 6])
+
+    x_perm = distribution_util.move_dimension(x, 0, -2)
+    self.assertAllEqual(self.evaluate(array_ops.shape(x_perm)),
+                        [30, 4, 1, 200, 6])
+
+    x_perm = distribution_util.move_dimension(x, 4, 2)
+    self.assertAllEqual(self.evaluate(array_ops.shape(x_perm)),
+                        [200, 30, 6, 4, 1])
+
+    x_perm = distribution_util.move_dimension(x, -1, 2)
+    self.assertAllEqual(self.evaluate(array_ops.shape(x_perm)),
+                        [200, 30, 6, 4, 1])
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/distributions/python/ops/distribution_util.py b/tensorflow/contrib/distributions/python/ops/distribution_util.py
index 289e1d50e1..6959b3e877 100644
--- a/tensorflow/contrib/distributions/python/ops/distribution_util.py
+++ b/tensorflow/contrib/distributions/python/ops/distribution_util.py
@@ -21,12 +21,19 @@ from __future__ import print_function
 from tensorflow.contrib import linalg
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import smart_cond
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.distributions import distribution as distribution_lib
+
+# The following two lines are redundant, in a sense. The first enables
+# good coding practice  *within* this file (`util.prefer_static_value`
+# rather than  `prefer_static_value`). The  second ensures  that users
+# also get the core utils when they import this file.
+from tensorflow.python.ops.distributions import util
 from tensorflow.python.ops.distributions.util import *  # pylint: disable=wildcard-import
 
 
@@ -484,3 +491,75 @@ def pad_mixture_dimensions(x, mixture_distribution, categorical_distribution,
 def static_value(x):
   """Returns the static value of a `Tensor` or `None`."""
   return tensor_util.constant_value(ops.convert_to_tensor(x))
+
+
+def move_dimension(x, source_idx, dest_idx):
+  """Move a single tensor dimension within its shape.
+
+  This is a special case of `tf.transpose()`, which applies
+  arbitrary permutations to tensor dimensions.
+
+  Args:
+    x: Tensor of rank `ndims`.
+    source_idx: Integer index into `x.shape` (negative indexing is
+      supported).
+    dest_idx: Integer index into `x.shape` (negative indexing is
+      supported).
+
+  Returns:
+    x_perm: Tensor of rank `ndims`, in which the dimension at original
+      index `source_idx` has been moved to new index `dest_idx`, with
+      all other dimensions retained in their original order.
+
+  Example:
+
+  ```python
+  x = tf.placeholder(tf.float32, shape=[200, 30, 4, 1, 6])
+  x_perm = move_dimension(x, 1, 1)  # no-op
+  x_perm = move_dimension(x, 0, 3)  # result shape [30, 4, 1, 200, 6]
+  x_perm = move_dimension(x, 0, -2)  # equivalent to previous
+  x_perm = move_dimension(x, 4, 2)  # result shape [200, 30, 6, 4, 1]
+  ```
+  """
+  ndims = util.prefer_static_rank(x)
+  if isinstance(source_idx, int):
+    dtype = dtypes.int32
+  else:
+    dtype = dtypes.as_dtype(source_idx.dtype)
+
+  # Handle negative indexing. Since ndims might be dynamic, this makes
+  # source_idx and dest_idx also possibly dynamic.
+  if source_idx < 0:
+    source_idx = ndims + source_idx
+  if dest_idx < 0:
+    dest_idx = ndims + dest_idx
+
+  # Construct the appropriate permutation of dimensions, depending
+  # on whether the source is before or after the destination.
+  def move_left_permutation():
+    return util.prefer_static_value(
+        array_ops.concat([
+            math_ops.range(0, dest_idx, dtype=dtype),
+            [source_idx],
+            math_ops.range(dest_idx, source_idx, dtype=dtype),
+            math_ops.range(source_idx+1, ndims, dtype=dtype)], axis=0))
+
+  def move_right_permutation():
+    return util.prefer_static_value(
+        array_ops.concat([
+            math_ops.range(0, source_idx, dtype=dtype),
+            math_ops.range(source_idx+1, dest_idx+1, dtype=dtype),
+            [source_idx],
+            math_ops.range(dest_idx+1, ndims, dtype=dtype)], axis=0))
+
+  def x_permuted():
+    return array_ops.transpose(
+        x, perm=smart_cond.smart_cond(source_idx < dest_idx,
+                                      move_right_permutation,
+                                      move_left_permutation))
+
+  # One final conditional to handle the special case where source
+  # and destination indices are equal.
+  return smart_cond.smart_cond(math_ops.equal(source_idx, dest_idx),
+                               lambda: x,
+                               x_permuted)
-- 
GitLab


From b5fa781337ad8becaab893d001b04f2b995575b5 Mon Sep 17 00:00:00 2001
From: Suharsh Sivakumar <suharshs@google.com>
Date: Mon, 11 Jun 2018 18:41:48 -0700
Subject: [PATCH 579/610] TFLite should allow values of 0 for
 default_ranges_{min,max}.

PiperOrigin-RevId: 200149066
---
 tensorflow/contrib/lite/python/tflite_convert.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/lite/python/tflite_convert.py b/tensorflow/contrib/lite/python/tflite_convert.py
index 32ad84ec3c..f497533bed 100644
--- a/tensorflow/contrib/lite/python/tflite_convert.py
+++ b/tensorflow/contrib/lite/python/tflite_convert.py
@@ -116,7 +116,8 @@ def _convert_model(flags):
                        "tensors in order to map between names and "
                        "values.".format(",".join(input_arrays)))
     converter.quantized_input_stats = dict(zip(input_arrays, quant_stats))
-  if flags.default_ranges_min and flags.default_ranges_max:
+  if (flags.default_ranges_min is not None) and (flags.default_ranges_max is
+                                                 not None):
     converter.default_ranges_stats = (flags.default_ranges_min,
                                       flags.default_ranges_max)
 
@@ -195,7 +196,7 @@ def _check_flags(flags, unparsed):
       raise ValueError("--std_dev_values, --mean_values must have the same "
                        "number of items")
 
-  if bool(flags.default_ranges_min) != bool(flags.default_ranges_max):
+  if (flags.default_ranges_min is None) != (flags.default_ranges_max is None):
     raise ValueError("--default_ranges_min and --default_ranges_max must be "
                      "used together")
 
-- 
GitLab


From 5f4be37bebe0343736e800884387cc2147bc55cb Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 11 Jun 2018 19:09:42 -0700
Subject: [PATCH 580/610] Re-enable trainer TPU test.

PiperOrigin-RevId: 200151330
---
 .../compiler/xla/service/hlo_module_group_metadata.cc    | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/tensorflow/compiler/xla/service/hlo_module_group_metadata.cc b/tensorflow/compiler/xla/service/hlo_module_group_metadata.cc
index 4f1715e4ca..bf33640db1 100644
--- a/tensorflow/compiler/xla/service/hlo_module_group_metadata.cc
+++ b/tensorflow/compiler/xla/service/hlo_module_group_metadata.cc
@@ -127,9 +127,14 @@ Status HloModuleGroupMetadata::VerifyCompanionSets() const {
     for (HloInstruction* instruction : *companions) {
       // Go through all the communicating instructions (send, recv) of the given
       // companion, and record their device.
+      auto it = tracked_instructions_comms_.find(instruction);
+      if (it == tracked_instructions_comms_.end()) {
+        // Companions can be added even if they have no communicating
+        // instructions, if they are parent of companions.
+        continue;
+      }
       std::unordered_set<int64> comm_devices;
-      for (HloInstruction* comm_instruction :
-           tracked_instructions_comms_.at(instruction)) {
+      for (HloInstruction* comm_instruction : it->second) {
         auto device = GetInstructionDevice(*comm_instruction);
         TF_RET_CHECK(device) << "Instruction " << comm_instruction->ToString()
                              << " does not have a device";
-- 
GitLab


From 39c18ead40f4b998b857d07629317675fbf5d035 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 11 Jun 2018 19:45:19 -0700
Subject: [PATCH 581/610] Use activation in MUL and ADD operations

PiperOrigin-RevId: 200153612
---
 tensorflow/contrib/lite/nnapi_delegate.cc | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/tensorflow/contrib/lite/nnapi_delegate.cc b/tensorflow/contrib/lite/nnapi_delegate.cc
index 99cb40e967..999c31d4bf 100644
--- a/tensorflow/contrib/lite/nnapi_delegate.cc
+++ b/tensorflow/contrib/lite/nnapi_delegate.cc
@@ -234,7 +234,10 @@ void AddOpsAndParams(tflite::Interpreter* interpreter,
           next_id++;
         };
 
-    auto add_add_params = [&add_scalar_int32]() { add_scalar_int32(0); };
+    auto add_add_params = [&add_scalar_int32](void* data) {
+      auto* builtin = reinterpret_cast<TfLiteAddParams*>(data);
+      add_scalar_int32(builtin->activation);
+    };
 
     auto add_pooling_params = [&add_scalar_int32](void* data) {
       auto builtin = reinterpret_cast<TfLitePoolParams*>(data);
@@ -345,11 +348,11 @@ void AddOpsAndParams(tflite::Interpreter* interpreter,
     switch (builtin) {
       case tflite::BuiltinOperator_ADD:
         nn_op_type = ANEURALNETWORKS_ADD;
-        add_add_params();
+        add_add_params(node.builtin_data);
         break;
       case tflite::BuiltinOperator_MUL:
         nn_op_type = ANEURALNETWORKS_MUL;
-        add_add_params();
+        add_add_params(node.builtin_data);
         break;
       case tflite::BuiltinOperator_AVERAGE_POOL_2D:
         add_pooling_params(node.builtin_data);
-- 
GitLab


From 5357d13d2bdca2fcd3779d0e8ea4aab5d2e73c21 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 11 Jun 2018 20:06:25 -0700
Subject: [PATCH 582/610] Rollback of changelist checking for static shapes for
 model function. END_PUBLIC

BEGIN_PUBLIC
Automated g4 rollback of changelist 200139880

PiperOrigin-RevId: 200155130
---
 .../contrib/tpu/python/tpu/tpu_estimator.py   | 47 -------------------
 1 file changed, 47 deletions(-)

diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
index 2521522752..64ae35dfc5 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
@@ -1343,55 +1343,8 @@ class _ModelFnWrapper(object):
                 key, tensor))
     return predictions
 
-  def _validate_model_features_and_labels(self,
-                                          features,
-                                          labels,
-                                          is_export_mode):
-    """Validates that the features and labels for the model function are valid.
-
-    A valid features/labels object is the one with:
-    - Type: Tensor or a dictionary of Tensors
-    - Static shape if is_export_mode is False.
-
-    Args:
-      features: the features that would be input to the model function.
-      labels: the labels that would be input to the model function.
-      is_export_mode: boolean value specifying if in export mode.
-
-    Raises:
-      TypeError: If features/labels are not of the correct type.
-      ValueError: If features/labels have dynamic shape.
-    """
-
-    def validate(obj, obj_name):
-      """Helper validate function."""
-      if not isinstance(obj, ops.Tensor) and not isinstance(obj, dict):
-        raise TypeError(
-            'The {} to the model returned by input_fn must be either a Tensor '
-            'or a dictionary of Tensors. {}: {}'.format(obj_name, obj_name,
-                                                        obj))
-      if is_export_mode:
-        return
-      if isinstance(obj, ops.Tensor):
-        if not obj.get_shape().is_fully_defined():
-          raise ValueError(
-              'The {} to the model returned by input_fn must have static shape.'
-              ' Tensor: {}'.format(obj_name, obj))
-      else:
-        for (key, tensor) in obj.items():
-          if not tensor.get_shape().is_fully_defined():
-            raise ValueError(
-                'The {} to the model returned by input_fn must have static '
-                'shape. Key: \'{}\', Tensor: {}'.format(
-                    obj_name, key, tensor))
-
-    validate(features, 'features')
-    if labels is not None:
-      validate(labels, 'labels')
-
   def _call_model_fn(self, features, labels, is_export_mode=False):
     """Calls the model_fn with required parameters."""
-    self._validate_model_features_and_labels(features, labels, is_export_mode)
     model_fn_args = function_utils.fn_args(self._model_fn)
     kwargs = {}
 
-- 
GitLab


From 51f2b9e2867dd3ddb736a093f36b786cec3217c5 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 11 Jun 2018 20:11:36 -0700
Subject: [PATCH 583/610] Exposes toco_flags and model_flags as optional
 parameters to allow fine grained control of conversion.

PiperOrigin-RevId: 200155520
---
 tensorflow/contrib/lite/python/convert.py | 72 +++++++++++++++--------
 tensorflow/contrib/lite/python/lite.py    |  1 +
 2 files changed, 48 insertions(+), 25 deletions(-)

diff --git a/tensorflow/contrib/lite/python/convert.py b/tensorflow/contrib/lite/python/convert.py
index fce8ffb54a..c038c88945 100644
--- a/tensorflow/contrib/lite/python/convert.py
+++ b/tensorflow/contrib/lite/python/convert.py
@@ -111,29 +111,27 @@ def tensor_name(x):
   return x.name.split(":")[0]
 
 
-def toco_convert(input_data,
-                 input_tensors,
-                 output_tensors,
-                 inference_type=lite_constants.FLOAT,
-                 inference_input_type=None,
-                 input_format=lite_constants.TENSORFLOW_GRAPHDEF,
-                 output_format=lite_constants.TFLITE,
-                 quantized_input_stats=None,
-                 default_ranges_stats=None,
-                 drop_control_dependency=True,
-                 reorder_across_fake_quant=False,
-                 allow_custom_ops=False,
-                 change_concat_input_ranges=False,
-                 quantize_weights=False,
-                 dump_graphviz_dir=None,
-                 dump_graphviz_video=False):
-  """Convert a model using TOCO from `input_format` to `output_format`.
+def build_toco_convert_protos(input_tensors,
+                              output_tensors,
+                              inference_type=lite_constants.FLOAT,
+                              inference_input_type=None,
+                              input_format=lite_constants.TENSORFLOW_GRAPHDEF,
+                              output_format=lite_constants.TFLITE,
+                              quantized_input_stats=None,
+                              default_ranges_stats=None,
+                              drop_control_dependency=True,
+                              reorder_across_fake_quant=False,
+                              allow_custom_ops=False,
+                              change_concat_input_ranges=False,
+                              quantize_weights=False,
+                              dump_graphviz_dir=None,
+                              dump_graphviz_video=False):
+  """Builds protocol buffers describing a conversion of a model using TOCO.
 
   Typically this is to convert from TensorFlow GraphDef to TFLite, in which
   case the default `input_format` and `output_format` are sufficient.
 
   Args:
-    input_data: Input data (i.e. often `sess.graph_def`).
     input_tensors: List of input tensors. Type and shape are computed using
       `foo.get_shape()` and `foo.dtype`.
     output_tensors: List of output tensors (only .name is used from this).
@@ -180,8 +178,8 @@ def toco_convert(input_data,
       every graph transformation. (default False)
 
   Returns:
-    The converted data. For example if TFLite was the destination, then
-    this will be a tflite flatbuffer in a bytes array.
+    model_flags, toco_flags: two protocol buffers describing the conversion
+    process.
 
   Raises:
     ValueError: If the input tensor type is unknown
@@ -204,7 +202,6 @@ def toco_convert(input_data,
   if dump_graphviz_dir:
     toco.dump_graphviz_dir = dump_graphviz_dir
   toco.dump_graphviz_include_video = dump_graphviz_video
-
   model = _model_flags_pb2.ModelFlags()
   model.change_concat_input_ranges = change_concat_input_ranges
   for idx, input_tensor in enumerate(input_tensors):
@@ -233,10 +230,35 @@ def toco_convert(input_data,
 
   for output_tensor in output_tensors:
     model.output_arrays.append(tensor_name(output_tensor))
+  return model, toco
+
+
+def toco_convert(input_data, input_tensors, output_tensors, *args, **kwargs):
+  """Convert a model using TOCO.
 
-  # TODO(aselle): Consider handling the case of allowing quantized
-  # inputs to be converted to float (via the toco.inference_input_type field).
-  data = toco_convert_protos(model.SerializeToString(),
-                             toco.SerializeToString(),
+  Typically this function is used to convert from TensorFlow GraphDef to TFLite.
+  Conversion can be customized by providing arguments that are forwarded to
+  `build_toco_convert_protos` (see documentation for details).
+
+  Args:
+    input_data: Input data (i.e. often `sess.graph_def`).
+    input_tensors: List of input tensors. Type and shape are computed using
+      `foo.get_shape()` and `foo.dtype`.
+    output_tensors: List of output tensors (only .name is used from this).
+    *args: See `build_toco_convert_protos`.
+    **kwargs: See `build_toco_convert_protos`.
+
+  Returns:
+    The converted data. For example if TFLite was the destination, then
+    this will be a tflite flatbuffer in a bytes array.
+
+  Raises:
+    Defined in `build_toco_convert_protos`.
+  """
+  model_flags, toco_flags = build_toco_convert_protos(input_tensors,
+                                                      output_tensors,
+                                                      *args, **kwargs)
+  data = toco_convert_protos(model_flags.SerializeToString(),
+                             toco_flags.SerializeToString(),
                              input_data.SerializeToString())
   return data
diff --git a/tensorflow/contrib/lite/python/lite.py b/tensorflow/contrib/lite/python/lite.py
index 4fb88c1ad6..6b63c0ccef 100644
--- a/tensorflow/contrib/lite/python/lite.py
+++ b/tensorflow/contrib/lite/python/lite.py
@@ -36,6 +36,7 @@ from __future__ import print_function
 from google.protobuf import text_format as _text_format
 from google.protobuf.message import DecodeError
 from tensorflow.contrib.lite.python import lite_constants as constants
+from tensorflow.contrib.lite.python.convert import build_toco_convert_protos  # pylint: disable=unused-import
 from tensorflow.contrib.lite.python.convert import tensor_name
 from tensorflow.contrib.lite.python.convert import toco_convert
 from tensorflow.contrib.lite.python.convert import toco_convert_protos  # pylint: disable=unused-import
-- 
GitLab


From f9ae897fdcba9d1f7aa4ed8e0514022f8e5e70f3 Mon Sep 17 00:00:00 2001
From: Benjamin Kramer <kramerb@google.com>
Date: Tue, 12 Jun 2018 01:34:20 -0700
Subject: [PATCH 584/610] [XLA:GPU] Check the reduce input shape when
 multi-output fusing reduces

Otherwise we can end up in a situation where incompatible reduces that happen
to have the same output shape are fused.

PiperOrigin-RevId: 200180013
---
 .../xla/service/gpu/multi_output_fusion.cc    |  8 +++--
 .../service/gpu/multi_output_fusion_test.cc   | 33 +++++++++++++++++++
 2 files changed, 39 insertions(+), 2 deletions(-)

diff --git a/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc b/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc
index 86c5c4fb6f..942c254533 100644
--- a/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc
+++ b/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc
@@ -47,12 +47,16 @@ bool GpuMultiOutputFusion::ShapesCompatibleForFusion(HloInstruction* instr1,
         element_instr = fused_expression_root;
       }
     }
+    // Special handling of kReduce instructions -- the fusion
+    // applies to the first operand.
+    if (element_instr->opcode() == HloOpcode::kReduce) {
+      return element_instr->operand(0)->shape();
+    }
     return element_instr->shape();
   };
 
   // The elementwise output shapes must be the same (including layout)
-  return ShapeUtil::ShapeUtil::Equal(get_element_shape(instr1),
-                                     get_element_shape(instr2));
+  return ShapeUtil::Equal(get_element_shape(instr1), get_element_shape(instr2));
 }
 
 bool GpuMultiOutputFusion::IsProfitableOperand(HloInstruction* instr) {
diff --git a/tensorflow/compiler/xla/service/gpu/multi_output_fusion_test.cc b/tensorflow/compiler/xla/service/gpu/multi_output_fusion_test.cc
index d0b4c88487..5170cbc7e3 100644
--- a/tensorflow/compiler/xla/service/gpu/multi_output_fusion_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/multi_output_fusion_test.cc
@@ -36,6 +36,11 @@ const char kModulePrefix[] = R"(
       scalar_lhs = f32[] parameter(0)
       scalar_rhs = f32[] parameter(1)
       ROOT add = f32[] add(scalar_lhs, scalar_rhs)
+    }
+    scalar_mul_computation {
+      scalar_lhs = f32[] parameter(0)
+      scalar_rhs = f32[] parameter(1)
+      ROOT mul = f32[] multiply(scalar_lhs, scalar_rhs)
     })";
 
 TEST_F(InstructionFusionTest, MultiOutputFusionSiblingReduceAndReduceFusion) {
@@ -67,6 +72,34 @@ TEST_F(InstructionFusionTest, MultiOutputFusionSiblingReduceAndReduceFusion) {
               op::Tuple(op::Reduce(), op::Reduce()));
 }
 
+TEST_F(InstructionFusionTest, MultiOutputFusionDifferentReduceInputShapes) {
+  auto module = ParseHloString(tensorflow::strings::StrCat(kModulePrefix, R"(
+    fused_computation_1 {
+      p1.1 = f32[6400]{0} parameter(1)
+      mul = f32[6400]{0} multiply(p1.1, p1.1)
+      const.1 = f32[] parameter(0)
+      ROOT reduce.1 = f32[] reduce(p1.1, const.1), dimensions={0}, to_apply=scalar_add_computation
+    }
+
+    fused_computation_2 {
+      p1.2 = f32[6400]{0} parameter(1)
+      r1 = f32[64,100]{0,1} reshape(p1.2)
+      const.2 = f32[] parameter(0)
+      ROOT reduce.2 = f32[] reduce(r1, const.2), dimensions={1,0}, to_apply=scalar_mul_computation
+    }
+
+    ENTRY entry {
+      p0 = f32[] parameter(0)
+      p1 = f32[6400]{0} parameter(1)
+      const.2 = f32[] constant(1)
+      fusion.1 = f32[] fusion(p0, p1), kind=kInput, calls=fused_computation_1
+      fusion.2 = f32[] fusion(p0, p1), kind=kInput, calls=fused_computation_2
+      ROOT root = (f32[], f32[]) tuple(fusion.1, fusion.2)
+    })"))
+                    .ValueOrDie();
+  ASSERT_FALSE(GpuMultiOutputFusion().Run(module.get()).ValueOrDie());
+}
+
 TEST_F(InstructionFusionTest, MultiOutputFusionSiblingReduceFusions) {
   // Two sibling fusions with reduce instruction roots sharing the same input
   // param.
-- 
GitLab


From da88bfa02f6fb7071a41ff065ec9a918b1e0b1d6 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 12 Jun 2018 01:52:52 -0700
Subject: [PATCH 585/610] Fixes documentation of multi_label_head to render
 accepted labels as markdown list

PiperOrigin-RevId: 200181836
---
 tensorflow/contrib/estimator/python/estimator/head.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/contrib/estimator/python/estimator/head.py b/tensorflow/contrib/estimator/python/estimator/head.py
index b798769d2c..9594e5132f 100644
--- a/tensorflow/contrib/estimator/python/estimator/head.py
+++ b/tensorflow/contrib/estimator/python/estimator/head.py
@@ -529,6 +529,7 @@ def multi_label_head(n_classes,
   applications, the shape is `[batch_size, n_classes]`.
 
   Labels can be:
+
   * A multi-hot tensor of shape `[D0, D1, ... DN, n_classes]`
   * An integer `SparseTensor` of class indices. The `dense_shape` must be
     `[D0, D1, ... DN, ?]` and the values within `[0, n_classes)`.
-- 
GitLab


From 433ac81400c788557001789f0a0c5a76a9b7e29c Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 12 Jun 2018 02:33:46 -0700
Subject: [PATCH 586/610] Modified Bessel functions of order zero and one.

The functions are tf.math.bessel_i0(x), tf.math.bessel_i0e(x), tf.math.bessel_i1(x) and tf.math.bessel_i1e(x). The exponentially scaled versions tf.math.bessel_i0e(x) and tf.math.bessel_i1e(x) are more numerically stable. This code wraps the implementation that was recently added to Eigen.

PiperOrigin-RevId: 200186968
---
 .../api_def/base_api/api_def_BesselI0e.pbtxt  | 10 +++
 .../api_def/base_api/api_def_BesselI1e.pbtxt  | 10 +++
 .../python_api/api_def_BesselI0e.pbtxt        |  4 ++
 .../python_api/api_def_BesselI1e.pbtxt        |  4 ++
 tensorflow/core/kernels/cwise_op_bessel.cc    | 29 +++++++++
 tensorflow/core/kernels/cwise_op_bessel.cu.cc | 27 ++++++++
 tensorflow/core/kernels/cwise_ops.h           |  6 ++
 tensorflow/core/ops/math_ops.cc               |  4 ++
 .../python/kernel_tests/cwise_ops_test.py     | 24 ++++++++
 tensorflow/python/ops/math_grad.py            | 29 +++++++++
 tensorflow/python/ops/math_ops.py             | 61 +++++++++++++++++++
 tensorflow/python/ops/special_math_ops.py     | 48 +++++++++++++++
 .../python/ops/special_math_ops_test.py       | 28 +++++++++
 .../tools/api/golden/tensorflow.math.pbtxt    | 16 +++++
 14 files changed, 300 insertions(+)
 create mode 100644 tensorflow/core/api_def/base_api/api_def_BesselI0e.pbtxt
 create mode 100644 tensorflow/core/api_def/base_api/api_def_BesselI1e.pbtxt
 create mode 100644 tensorflow/core/api_def/python_api/api_def_BesselI0e.pbtxt
 create mode 100644 tensorflow/core/api_def/python_api/api_def_BesselI1e.pbtxt
 create mode 100644 tensorflow/core/kernels/cwise_op_bessel.cc
 create mode 100644 tensorflow/core/kernels/cwise_op_bessel.cu.cc

diff --git a/tensorflow/core/api_def/base_api/api_def_BesselI0e.pbtxt b/tensorflow/core/api_def/base_api/api_def_BesselI0e.pbtxt
new file mode 100644
index 0000000000..08313cebb9
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BesselI0e.pbtxt
@@ -0,0 +1,10 @@
+op {
+  graph_op_name: "BesselI0e"
+  summary: "Computes the Bessel i0e function of `x` element-wise."
+  description: <<END
+Exponentially scaled modified Bessel function of order 0 defined as
+`bessel_i0e(x) = exp(-abs(x)) bessel_i0(x)`.
+
+This function is faster and numerically stabler than `bessel_i0(x)`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_BesselI1e.pbtxt b/tensorflow/core/api_def/base_api/api_def_BesselI1e.pbtxt
new file mode 100644
index 0000000000..3e46a9506f
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_BesselI1e.pbtxt
@@ -0,0 +1,10 @@
+op {
+  graph_op_name: "BesselI1e"
+  summary: "Computes the Bessel i1e function of `x` element-wise."
+  description: <<END
+Exponentially scaled modified Bessel function of order 1 defined as
+`bessel_i1e(x) = exp(-abs(x)) bessel_i1(x)`.
+
+This function is faster and numerically stabler than `bessel_i1(x)`.
+END
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_BesselI0e.pbtxt b/tensorflow/core/api_def/python_api/api_def_BesselI0e.pbtxt
new file mode 100644
index 0000000000..7965af4916
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_BesselI0e.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "BesselI0e"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_BesselI1e.pbtxt b/tensorflow/core/api_def/python_api/api_def_BesselI1e.pbtxt
new file mode 100644
index 0000000000..dffd296f6d
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_BesselI1e.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "BesselI1e"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/kernels/cwise_op_bessel.cc b/tensorflow/core/kernels/cwise_op_bessel.cc
new file mode 100644
index 0000000000..4372f56408
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_bessel.cc
@@ -0,0 +1,29 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/cwise_ops_common.h"
+
+namespace tensorflow {
+REGISTER3(UnaryOp, CPU, "BesselI0e", functor::bessel_i0e, Eigen::half, float,
+          double);
+REGISTER3(UnaryOp, CPU, "BesselI1e", functor::bessel_i1e, Eigen::half, float,
+          double);
+#if GOOGLE_CUDA
+REGISTER3(UnaryOp, GPU, "BesselI0e", functor::bessel_i0e, Eigen::half, float,
+          double);
+REGISTER3(UnaryOp, GPU, "BesselI1e", functor::bessel_i1e, Eigen::half, float,
+          double);
+#endif
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_bessel.cu.cc b/tensorflow/core/kernels/cwise_op_bessel.cu.cc
new file mode 100644
index 0000000000..30de8b1fdc
--- /dev/null
+++ b/tensorflow/core/kernels/cwise_op_bessel.cu.cc
@@ -0,0 +1,27 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+
+#include "tensorflow/core/kernels/cwise_ops_gpu_common.cu.h"
+
+namespace tensorflow {
+namespace functor {
+DEFINE_UNARY3(bessel_i0e, Eigen::half, float, double);
+DEFINE_UNARY3(bessel_i1e, Eigen::half, float, double);
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/cwise_ops.h b/tensorflow/core/kernels/cwise_ops.h
index a80905d145..8b015df4e1 100644
--- a/tensorflow/core/kernels/cwise_ops.h
+++ b/tensorflow/core/kernels/cwise_ops.h
@@ -616,6 +616,12 @@ struct acos : base<T, Eigen::internal::scalar_acos_op<T>> {};
 template <typename T>
 struct atan : base<T, Eigen::internal::scalar_atan_op<T>> {};
 
+template <typename T>
+struct bessel_i0e : base<T, Eigen::internal::scalar_i0e_op<T>> {};
+
+template <typename T>
+struct bessel_i1e : base<T, Eigen::internal::scalar_i1e_op<T>> {};
+
 struct logical_not : base<bool, Eigen::internal::scalar_boolean_not_op<bool>> {
 };
 
diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc
index 8c0b073ce4..1740fa152c 100644
--- a/tensorflow/core/ops/math_ops.cc
+++ b/tensorflow/core/ops/math_ops.cc
@@ -239,6 +239,10 @@ REGISTER_OP("Acos").UNARY();
 
 REGISTER_OP("Atan").UNARY();
 
+REGISTER_OP("BesselI0e").UNARY_REAL();
+
+REGISTER_OP("BesselI1e").UNARY_REAL();
+
 #undef UNARY
 #undef UNARY_REAL
 #undef UNARY_COMPLEX
diff --git a/tensorflow/python/kernel_tests/cwise_ops_test.py b/tensorflow/python/kernel_tests/cwise_ops_test.py
index 1128cd7a63..8a3e64b174 100644
--- a/tensorflow/python/kernel_tests/cwise_ops_test.py
+++ b/tensorflow/python/kernel_tests/cwise_ops_test.py
@@ -241,6 +241,12 @@ class UnaryOpTest(test.TestCase):
                       math_ops.lgamma)
     self._compareBoth(x, np.vectorize(math.erf), math_ops.erf)
     self._compareBoth(x, np.vectorize(math.erfc), math_ops.erfc)
+    try:
+      from scipy import special  # pylint: disable=g-import-not-at-top
+      self._compareBoth(x, special.i0e, math_ops.bessel_i0e)
+      self._compareBoth(x, special.i1e, math_ops.bessel_i1e)
+    except ImportError as e:
+      tf_logging.warn("Cannot test special functions: %s" % str(e))
 
     self._compareBothSparse(x, np.abs, math_ops.abs)
     self._compareBothSparse(x, np.negative, math_ops.negative)
@@ -286,6 +292,12 @@ class UnaryOpTest(test.TestCase):
     self._compareBoth(x, np.arcsin, math_ops.asin)
     self._compareBoth(x, np.arccos, math_ops.acos)
     self._compareBoth(x, np.arctan, math_ops.atan)
+    try:
+      from scipy import special  # pylint: disable=g-import-not-at-top
+      self._compareBoth(x, special.i0e, math_ops.bessel_i0e)
+      self._compareBoth(x, special.i1e, math_ops.bessel_i1e)
+    except ImportError as e:
+      tf_logging.warn("Cannot test special functions: %s" % str(e))
 
     self._compareBothSparse(x, np.abs, math_ops.abs)
     self._compareBothSparse(x, np.negative, math_ops.negative)
@@ -334,6 +346,12 @@ class UnaryOpTest(test.TestCase):
     self._compareBoth(k, np.arcsin, math_ops.asin)
     self._compareBoth(k, np.arccos, math_ops.acos)
     self._compareBoth(k, np.tan, math_ops.tan)
+    try:
+      from scipy import special  # pylint: disable=g-import-not-at-top
+      self._compareBoth(x, special.i0e, math_ops.bessel_i0e)
+      self._compareBoth(x, special.i1e, math_ops.bessel_i1e)
+    except ImportError as e:
+      tf_logging.warn("Cannot test special functions: %s" % str(e))
 
     self._compareBothSparse(x, np.abs, math_ops.abs)
     self._compareBothSparse(x, np.negative, math_ops.negative)
@@ -370,6 +388,12 @@ class UnaryOpTest(test.TestCase):
                       math_ops.lgamma)
     self._compareBoth(x, np.vectorize(math.erf), math_ops.erf)
     self._compareBoth(x, np.vectorize(math.erfc), math_ops.erfc)
+    try:
+      from scipy import special  # pylint: disable=g-import-not-at-top
+      self._compareBoth(x, special.i0e, math_ops.bessel_i0e)
+      self._compareBoth(x, special.i1e, math_ops.bessel_i1e)
+    except ImportError as e:
+      tf_logging.warn("Cannot test special functions: %s" % str(e))
 
     self._compareBothSparse(x, np.abs, math_ops.abs)
     self._compareBothSparse(x, np.negative, math_ops.negative)
diff --git a/tensorflow/python/ops/math_grad.py b/tensorflow/python/ops/math_grad.py
index 563c0b3ab3..a48b3c9395 100644
--- a/tensorflow/python/ops/math_grad.py
+++ b/tensorflow/python/ops/math_grad.py
@@ -620,6 +620,35 @@ def _DigammaGrad(op, grad):
     return grad * math_ops.polygamma(array_ops.constant(1, dtype=x.dtype), x)
 
 
+@ops.RegisterGradient("BesselI0e")
+def _BesselI0eGrad(op, grad):
+  """Compute gradient of bessel_i0e(x) with respect to its argument."""
+  x = op.inputs[0]
+  y = op.outputs[0]
+  with ops.control_dependencies([grad]):
+    return grad * (math_ops.bessel_i1e(x) - math_ops.sign(x) * y)
+
+
+@ops.RegisterGradient("BesselI1e")
+def _BesselI1eGrad(op, grad):
+  """Compute gradient of bessel_i1e(x) with respect to its argument."""
+  x = op.inputs[0]
+  y = op.outputs[0]
+  with ops.control_dependencies([grad]):
+    # For x = 0, the correct gradient is 0.5.
+    # However, the main branch gives NaN because of the division by x, so
+    # we impute the gradient manually.
+    # An alternative solution is to express the gradient via bessel_i0e and
+    # bessel_i2e, but the latter is not yet implemented in Eigen.
+    eps = np.finfo(x.dtype.as_numpy_dtype).eps
+    zeros = array_ops.zeros_like(x)
+    x_is_not_tiny = math_ops.abs(x) > eps
+    safe_x = array_ops.where(x_is_not_tiny, x, eps + zeros)
+    dy_dx = math_ops.bessel_i0e(safe_x) - y * (
+        math_ops.sign(safe_x) + math_ops.reciprocal(safe_x))
+    return grad * array_ops.where(x_is_not_tiny, dy_dx, 0.5 + zeros)
+
+
 @ops.RegisterGradient("Igamma")
 def _IgammaGrad(op, grad):
   """Returns gradient of igamma(a, x) with respect to x."""
diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py
index b4cedb1d46..e40481f3a7 100644
--- a/tensorflow/python/ops/math_ops.py
+++ b/tensorflow/python/ops/math_ops.py
@@ -2954,6 +2954,67 @@ def polyval(coeffs, x, name=None):
       p = c + p * x
     return p
 
+
+@tf_export("math.bessel_i0e")
+def bessel_i0e(x, name=None):
+  """Computes the Bessel i0e function of `x` element-wise.
+
+  Exponentially scaled modified Bessel function of order 0 defined as
+  `bessel_i0e(x) = exp(-abs(x)) bessel_i0(x)`.
+
+  This function is faster and numerically stabler than `bessel_i0(x)`.
+
+  Args:
+    x: A `Tensor` or `SparseTensor`. Must be one of the following types: `half`,
+      `float32`, `float64`.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor` or `SparseTensor`, respectively. Has the same type as `x`.
+
+  @compatibility(scipy)
+  Equivalent to scipy.special.i0e
+  @end_compatibility
+  """
+  with ops.name_scope(name, "bessel_i0e", [x]) as name:
+    if isinstance(x, sparse_tensor.SparseTensor):
+      x_i0e = gen_math_ops.bessel_i0e(x.values, name=name)
+      return sparse_tensor.SparseTensor(
+          indices=x.indices, values=x_i0e, dense_shape=x.dense_shape)
+    else:
+      return gen_math_ops.bessel_i0e(x, name=name)
+
+
+@tf_export("math.bessel_i1e")
+def bessel_i1e(x, name=None):
+  """Computes the Bessel i1e function of `x` element-wise.
+
+  Exponentially scaled modified Bessel function of order 1 defined as
+  `bessel_i1e(x) = exp(-abs(x)) bessel_i1(x)`.
+
+  This function is faster and numerically stabler than `bessel_i1(x)`.
+
+  Args:
+    x: A `Tensor` or `SparseTensor`. Must be one of the following types: `half`,
+      `float32`, `float64`.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor` or `SparseTensor`, respectively. Has the same type as `x`.
+
+  @compatibility(scipy)
+  Equivalent to scipy.special.i1e
+  @end_compatibility
+  """
+  with ops.name_scope(name, "bessel_i1e", [x]) as name:
+    if isinstance(x, sparse_tensor.SparseTensor):
+      x_i1e = gen_math_ops.bessel_i1e(x.values, name=name)
+      return sparse_tensor.SparseTensor(
+          indices=x.indices, values=x_i1e, dense_shape=x.dense_shape)
+    else:
+      return gen_math_ops.bessel_i1e(x, name=name)
+
+
 # FFT ops were moved to tf.spectral. tf.fft symbols were part of the TensorFlow
 # 1.0 API so we leave these here for backwards compatibility.
 fft = gen_spectral_ops.fft
diff --git a/tensorflow/python/ops/special_math_ops.py b/tensorflow/python/ops/special_math_ops.py
index 6204adef3b..6d3a85e3fd 100644
--- a/tensorflow/python/ops/special_math_ops.py
+++ b/tensorflow/python/ops/special_math_ops.py
@@ -82,6 +82,54 @@ def lbeta(x, name='lbeta'):
     return result
 
 
+@tf_export('math.bessel_i0')
+def bessel_i0(x, name='bessel_i0'):
+  """Computes the Bessel i0 function of `x` element-wise.
+
+  Modified Bessel function of order 0.
+
+  Prefer the numerically stabler function `bessel_i0e(x)` where possible.
+
+  Args:
+    x: A `Tensor` or `SparseTensor`. Must be one of the following types: `half`,
+      `float32`, `float64`.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor` or `SparseTensor`, respectively. Has the same type as `x`.
+
+  @compatibility(scipy)
+  Equivalent to scipy.special.i0
+  @end_compatibility
+  """
+  with ops.name_scope(name, 'bessel_i0', [x]):
+    return math_ops.exp(math_ops.abs(x)) * math_ops.bessel_i0e(x)
+
+
+@tf_export('math.bessel_i1')
+def bessel_i1(x, name='bessel_i1'):
+  """Computes the Bessel i1 function of `x` element-wise.
+
+  Modified Bessel function of order 1.
+
+  Prefer the numerically stabler function `bessel_i1e(x)` where possible.
+
+  Args:
+    x: A `Tensor` or `SparseTensor`. Must be one of the following types: `half`,
+      `float32`, `float64`.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor` or `SparseTensor`, respectively. Has the same type as `x`.
+
+  @compatibility(scipy)
+  Equivalent to scipy.special.i1
+  @end_compatibility
+  """
+  with ops.name_scope(name, 'bessel_i1', [x]):
+    return math_ops.exp(math_ops.abs(x)) * math_ops.bessel_i1e(x)
+
+
 @tf_export('einsum', 'linalg.einsum')
 def einsum(equation, *inputs, **kwargs):
   """A generalized contraction between tensors of arbitrary dimension.
diff --git a/tensorflow/python/ops/special_math_ops_test.py b/tensorflow/python/ops/special_math_ops_test.py
index 6118b54293..19a566166a 100644
--- a/tensorflow/python/ops/special_math_ops_test.py
+++ b/tensorflow/python/ops/special_math_ops_test.py
@@ -29,6 +29,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import special_math_ops
 from tensorflow.python.platform import test
+from tensorflow.python.platform import tf_logging
 
 
 class LBetaTest(test.TestCase):
@@ -150,6 +151,33 @@ class LBetaTest(test.TestCase):
         self.assertEqual(expected_result.get_shape(), lbeta_x.get_shape())
 
 
+class BesselTest(test.TestCase):
+
+  def test_bessel_i0(self):
+    x_single = np.arange(-3, 3).reshape(1, 3, 2).astype(np.float32)
+    x_double = np.arange(-3, 3).reshape(1, 3, 2).astype(np.float64)
+    try:
+      from scipy import special  # pylint: disable=g-import-not-at-top
+      self.assertAllClose(special.i0(x_single),
+                          self.evaluate(special_math_ops.bessel_i0(x_single)))
+      self.assertAllClose(special.i0(x_double),
+                          self.evaluate(special_math_ops.bessel_i0(x_double)))
+    except ImportError as e:
+      tf_logging.warn('Cannot test special functions: %s' % str(e))
+
+  def test_bessel_i1(self):
+    x_single = np.arange(-3, 3).reshape(1, 3, 2).astype(np.float32)
+    x_double = np.arange(-3, 3).reshape(1, 3, 2).astype(np.float64)
+    try:
+      from scipy import special  # pylint: disable=g-import-not-at-top
+      self.assertAllClose(special.i1(x_single),
+                          self.evaluate(special_math_ops.bessel_i1(x_single)))
+      self.assertAllClose(special.i1(x_double),
+                          self.evaluate(special_math_ops.bessel_i1(x_double)))
+    except ImportError as e:
+      tf_logging.warn('Cannot test special functions: %s' % str(e))
+
+
 class EinsumTest(test.TestCase):
 
   simple_cases = [
diff --git a/tensorflow/tools/api/golden/tensorflow.math.pbtxt b/tensorflow/tools/api/golden/tensorflow.math.pbtxt
index 897718c05e..03fbf6266d 100644
--- a/tensorflow/tools/api/golden/tensorflow.math.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.math.pbtxt
@@ -1,5 +1,21 @@
 path: "tensorflow.math"
 tf_module {
+  member_method {
+    name: "bessel_i0"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'bessel_i0\'], "
+  }
+  member_method {
+    name: "bessel_i0e"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "bessel_i1"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'bessel_i1\'], "
+  }
+  member_method {
+    name: "bessel_i1e"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "polyval"
     argspec: "args=[\'coeffs\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-- 
GitLab


From 4102ccf85ba197a5c9b9de641969d41a9fd0f839 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 12 Jun 2018 03:04:10 -0700
Subject: [PATCH 587/610] Remove unused variable from
 HloComputation::MakeInstructionPostOrder

PiperOrigin-RevId: 200189642
---
 tensorflow/compiler/xla/service/hlo_computation.cc | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tensorflow/compiler/xla/service/hlo_computation.cc b/tensorflow/compiler/xla/service/hlo_computation.cc
index 763d9d2269..b158f44923 100644
--- a/tensorflow/compiler/xla/service/hlo_computation.cc
+++ b/tensorflow/compiler/xla/service/hlo_computation.cc
@@ -357,7 +357,6 @@ std::list<HloInstruction*> HloComputation::MakeInstructionPostOrder() const {
   std::list<HloInstruction*> post_order;
   std::list<HloInstruction*> trace_instructions;
   tensorflow::gtl::FlatSet<HloInstruction*> added_instructions;
-  std::vector<HloInstruction> dfs_stack;
   for (auto& instruction : instructions_) {
     if (instruction->opcode() == HloOpcode::kTrace) {
       // Trace instructions aren't handled by the DFS visitor. Add trace
-- 
GitLab


From 52911a4fb12671abf6cdbe27d6c07753380ea25a Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 12 Jun 2018 03:20:10 -0700
Subject: [PATCH 588/610] Update ops-related pbtxt files.

PiperOrigin-RevId: 200191144
---
 .../core/ops/compat/ops_history.v1.pbtxt      | 46 +++++++++++++++++++
 tensorflow/core/ops/ops.pbtxt                 | 46 +++++++++++++++++++
 2 files changed, 92 insertions(+)

diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index b48686d9a3..726bfd63b7 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -10085,6 +10085,52 @@ op {
     }
   }
 }
+op {
+  name: "BesselI0e"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "BesselI1e"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
 op {
   name: "Betainc"
   input_arg {
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index dd3a6cd22c..c609703bcb 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -3860,6 +3860,52 @@ op {
     }
   }
 }
+op {
+  name: "BesselI0e"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
+op {
+  name: "BesselI1e"
+  input_arg {
+    name: "x"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "y"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+}
 op {
   name: "Betainc"
   input_arg {
-- 
GitLab


From c07a963a16668168e2b478a33877e85888ab6262 Mon Sep 17 00:00:00 2001
From: Adrian Kuegel <akuegel@google.com>
Date: Tue, 12 Jun 2018 03:23:56 -0700
Subject: [PATCH 589/610] Fix one unused C++ BUILD dependency found in
 tensorflow/compiler/xla/service/BUILD.

PiperOrigin-RevId: 200191374
---
 tensorflow/compiler/xla/service/BUILD | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index 6801012cc9..1154eef80e 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -2379,7 +2379,6 @@ cc_library(
         ":hlo_graph_dumper",
         ":hlo_pass",
         "//tensorflow/compiler/xla:types",
-        "//tensorflow/compiler/xla:util",
         "//tensorflow/core:lib",
     ],
 )
-- 
GitLab


From 1f1e88a681d5d6dea966033acf9b7e235913a35f Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 12 Jun 2018 03:46:05 -0700
Subject: [PATCH 590/610] Go: Update generated wrapper functions for TensorFlow
 ops. PiperOrigin-RevId: 200192844

---
 tensorflow/go/op/wrappers.go | 1016 +++++++++++++++++-----------------
 1 file changed, 508 insertions(+), 508 deletions(-)

diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go
index 76db602902..5602775b62 100644
--- a/tensorflow/go/op/wrappers.go
+++ b/tensorflow/go/op/wrappers.go
@@ -4210,69 +4210,6 @@ func Digamma(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
-// Shuffle dimensions of x according to a permutation.
-//
-// The output `y` has the same rank as `x`. The shapes of `x` and `y` satisfy:
-//   `y.shape[i] == x.shape[perm[i]] for i in [0, 1, ..., rank(x) - 1]`
-func Transpose(scope *Scope, x tf.Output, perm tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Transpose",
-		Input: []tf.Input{
-			x, perm,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// MinAttr is an optional argument to Min.
-type MinAttr func(optionalAttr)
-
-// MinKeepDims sets the optional keep_dims attribute to value.
-//
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func MinKeepDims(value bool) MinAttr {
-	return func(m optionalAttr) {
-		m["keep_dims"] = value
-	}
-}
-
-// Computes the minimum of elements across dimensions of a tensor.
-//
-// Reduces `input` along the dimensions given in `axis`. Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `axis`. If `keep_dims` is true, the reduced dimensions are
-// retained with length 1.
-//
-// Arguments:
-//	input: The tensor to reduce.
-//	axis: The dimensions to reduce. Must be in the range
-// `[-rank(input), rank(input))`.
-//
-// Returns The reduced tensor.
-func Min(scope *Scope, input tf.Output, axis tf.Output, optional ...MinAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "Min",
-		Input: []tf.Input{
-			input, axis,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Conv2DBackpropFilterAttr is an optional argument to Conv2DBackpropFilter.
 type Conv2DBackpropFilterAttr func(optionalAttr)
 
@@ -6181,6 +6118,77 @@ func Mod(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	return op.Output(0)
 }
 
+// Computes offsets of concat inputs within its output.
+//
+// For example:
+//
+// ```
+// # 'x' is [2, 2, 7]
+// # 'y' is [2, 3, 7]
+// # 'z' is [2, 5, 7]
+// concat_offset(2, [x, y, z]) => [0, 0, 0], [0, 2, 0], [0, 5, 0]
+// ```
+//
+// This is typically used by gradient computations for a concat operation.
+//
+// Arguments:
+//	concat_dim: The dimension along which to concatenate.
+//	shape: The `N` int32 vectors representing shape of tensors being concatenated.
+//
+// Returns The `N` int32 vectors representing the starting offset
+// of input tensors within the concatenated output.
+func ConcatOffset(scope *Scope, concat_dim tf.Output, shape []tf.Output) (offset []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ConcatOffset",
+		Input: []tf.Input{
+			concat_dim, tf.OutputList(shape),
+		},
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if offset, idx, err = makeOutputList(op, idx, "offset"); err != nil {
+		scope.UpdateErr("ConcatOffset", err)
+		return
+	}
+	return offset
+}
+
+// Compute the lower regularized incomplete Gamma function `P(a, x)`.
+//
+// The lower regularized incomplete Gamma function is defined as:
+//
+//
+// \\(P(a, x) = gamma(a, x) / Gamma(a) = 1 - Q(a, x)\\)
+//
+// where
+//
+// \\(gamma(a, x) = int_{0}^{x} t^{a-1} exp(-t) dt\\)
+//
+// is the lower incomplete Gamma function.
+//
+// Note, above `Q(a, x)` (`Igammac`) is the upper regularized incomplete
+// Gamma function.
+func Igamma(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Igamma",
+		Input: []tf.Input{
+			a, x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // DepthToSpaceAttr is an optional argument to DepthToSpace.
 type DepthToSpaceAttr func(optionalAttr)
 
@@ -7000,6 +7008,69 @@ func BiasAddV1(scope *Scope, value tf.Output, bias tf.Output) (output tf.Output)
 	return op.Output(0)
 }
 
+// Shuffle dimensions of x according to a permutation.
+//
+// The output `y` has the same rank as `x`. The shapes of `x` and `y` satisfy:
+//   `y.shape[i] == x.shape[perm[i]] for i in [0, 1, ..., rank(x) - 1]`
+func Transpose(scope *Scope, x tf.Output, perm tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Transpose",
+		Input: []tf.Input{
+			x, perm,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// MinAttr is an optional argument to Min.
+type MinAttr func(optionalAttr)
+
+// MinKeepDims sets the optional keep_dims attribute to value.
+//
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func MinKeepDims(value bool) MinAttr {
+	return func(m optionalAttr) {
+		m["keep_dims"] = value
+	}
+}
+
+// Computes the minimum of elements across dimensions of a tensor.
+//
+// Reduces `input` along the dimensions given in `axis`. Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `axis`. If `keep_dims` is true, the reduced dimensions are
+// retained with length 1.
+//
+// Arguments:
+//	input: The tensor to reduce.
+//	axis: The dimensions to reduce. Must be in the range
+// `[-rank(input), rank(input))`.
+//
+// Returns The reduced tensor.
+func Min(scope *Scope, input tf.Output, axis tf.Output, optional ...MinAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Min",
+		Input: []tf.Input{
+			input, axis,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Transforms a Tensor into a serialized TensorProto proto.
 //
 // Arguments:
@@ -11592,60 +11663,6 @@ func SparseDenseCwiseMul(scope *Scope, sp_indices tf.Output, sp_values tf.Output
 	return op.Output(0)
 }
 
-// ResizeAreaAttr is an optional argument to ResizeArea.
-type ResizeAreaAttr func(optionalAttr)
-
-// ResizeAreaAlignCorners sets the optional align_corners attribute to value.
-//
-// value: If true, the centers of the 4 corner pixels of the input and output tensors are
-// aligned, preserving the values at the corner pixels. Defaults to false.
-// If not specified, defaults to false
-func ResizeAreaAlignCorners(value bool) ResizeAreaAttr {
-	return func(m optionalAttr) {
-		m["align_corners"] = value
-	}
-}
-
-// Resize `images` to `size` using area interpolation.
-//
-// Input images can be of different types but output images are always float.
-//
-// The range of pixel values for the output image might be slightly different
-// from the range for the input image because of limited numerical precision.
-// To guarantee an output range, for example `[0.0, 1.0]`, apply
-// `tf.clip_by_value` to the output.
-//
-// Each output pixel is computed by first transforming the pixel's footprint into
-// the input tensor and then averaging the pixels that intersect the footprint. An
-// input pixel's contribution to the average is weighted by the fraction of its
-// area that intersects the footprint.  This is the same as OpenCV's INTER_AREA.
-//
-// Arguments:
-//	images: 4-D with shape `[batch, height, width, channels]`.
-//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
-// new size for the images.
-//
-// Returns 4-D with shape
-// `[batch, new_height, new_width, channels]`.
-func ResizeArea(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeAreaAttr) (resized_images tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResizeArea",
-		Input: []tf.Input{
-			images, size,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // 2D real-valued fast Fourier transform.
 //
 // Computes the 2-dimensional discrete Fourier transform of a real-valued signal
@@ -13635,170 +13652,6 @@ func TopK(scope *Scope, input tf.Output, k int64, optional ...TopKAttr) (values
 	return op.Output(0), op.Output(1)
 }
 
-// ComplexAttr is an optional argument to Complex.
-type ComplexAttr func(optionalAttr)
-
-// ComplexTout sets the optional Tout attribute to value.
-// If not specified, defaults to DT_COMPLEX64
-func ComplexTout(value tf.DataType) ComplexAttr {
-	return func(m optionalAttr) {
-		m["Tout"] = value
-	}
-}
-
-// Converts two real numbers to a complex number.
-//
-// Given a tensor `real` representing the real part of a complex number, and a
-// tensor `imag` representing the imaginary part of a complex number, this
-// operation returns complex numbers elementwise of the form \\(a + bj\\), where
-// *a* represents the `real` part and *b* represents the `imag` part.
-//
-// The input tensors `real` and `imag` must have the same shape.
-//
-// For example:
-//
-// ```
-// # tensor 'real' is [2.25, 3.25]
-// # tensor `imag` is [4.75, 5.75]
-// tf.complex(real, imag) ==> [[2.25 + 4.75j], [3.25 + 5.75j]]
-// ```
-func Complex(scope *Scope, real tf.Output, imag tf.Output, optional ...ComplexAttr) (out tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "Complex",
-		Input: []tf.Input{
-			real, imag,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// ImagAttr is an optional argument to Imag.
-type ImagAttr func(optionalAttr)
-
-// ImagTout sets the optional Tout attribute to value.
-// If not specified, defaults to DT_FLOAT
-func ImagTout(value tf.DataType) ImagAttr {
-	return func(m optionalAttr) {
-		m["Tout"] = value
-	}
-}
-
-// Returns the imaginary part of a complex number.
-//
-// Given a tensor `input` of complex numbers, this operation returns a tensor of
-// type `float` that is the imaginary part of each element in `input`. All
-// elements in `input` must be complex numbers of the form \\(a + bj\\), where *a*
-// is the real part and *b* is the imaginary part returned by this operation.
-//
-// For example:
-//
-// ```
-// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
-// tf.imag(input) ==> [4.75, 5.75]
-// ```
-func Imag(scope *Scope, input tf.Output, optional ...ImagAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "Imag",
-		Input: []tf.Input{
-			input,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes the maximum along segments of a tensor.
-//
-// Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
-// segments.
-//
-// Computes a tensor such that
-// \\(output_i = \max_j(data_j)\\) where `max` is over `j` such
-// that `segment_ids[j] == i`.
-//
-// If the max is empty for a given segment ID `i`, `output[i] = 0`.
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMax.png" alt>
-// </div>
-//
-// Arguments:
-//
-//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
-// first dimension.  Values should be sorted and can be repeated.
-//
-// Returns Has same shape as data, except for dimension 0 which
-// has size `k`, the number of segments.
-func SegmentMax(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SegmentMax",
-		Input: []tf.Input{
-			data, segment_ids,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes hyperbolic tangent of `x` element-wise.
-func Tanh(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Tanh",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Creates a dataset that skips `count` elements from the `input_dataset`.
-//
-// Arguments:
-//
-//	count: A scalar representing the number of elements from the `input_dataset`
-// that should be skipped.  If count is -1, skips everything.
-//
-//
-func SkipDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "SkipDataset",
-		Input: []tf.Input{
-			input_dataset, count,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Compute the Hurwitz zeta function \\(\zeta(x, q)\\).
 //
 // The Hurwitz zeta function is defined as:
@@ -14064,49 +13917,6 @@ func Minimum(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	return op.Output(0)
 }
 
-// RealAttr is an optional argument to Real.
-type RealAttr func(optionalAttr)
-
-// RealTout sets the optional Tout attribute to value.
-// If not specified, defaults to DT_FLOAT
-func RealTout(value tf.DataType) RealAttr {
-	return func(m optionalAttr) {
-		m["Tout"] = value
-	}
-}
-
-// Returns the real part of a complex number.
-//
-// Given a tensor `input` of complex numbers, this operation returns a tensor of
-// type `float` that is the real part of each element in `input`. All elements in
-// `input` must be complex numbers of the form \\(a + bj\\), where *a* is the real
-//  part returned by this operation and *b* is the imaginary part.
-//
-// For example:
-//
-// ```
-// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
-// tf.real(input) ==> [-2.25, 3.25]
-// ```
-func Real(scope *Scope, input tf.Output, optional ...RealAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "Real",
-		Input: []tf.Input{
-			input,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // AudioSummaryAttr is an optional argument to AudioSummary.
 type AudioSummaryAttr func(optionalAttr)
 
@@ -19518,66 +19328,348 @@ func UnsortedSegmentProd(scope *Scope, data tf.Output, segment_ids tf.Output, nu
 	opspec := tf.OpSpec{
 		Type: "UnsortedSegmentProd",
 		Input: []tf.Input{
-			data, segment_ids, num_segments,
+			data, segment_ids, num_segments,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// RandomUniformIntAttr is an optional argument to RandomUniformInt.
+type RandomUniformIntAttr func(optionalAttr)
+
+// RandomUniformIntSeed sets the optional seed attribute to value.
+//
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func RandomUniformIntSeed(value int64) RandomUniformIntAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// RandomUniformIntSeed2 sets the optional seed2 attribute to value.
+//
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomUniformIntSeed2(value int64) RandomUniformIntAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Outputs random integers from a uniform distribution.
+//
+// The generated values are uniform integers in the range `[minval, maxval)`.
+// The lower bound `minval` is included in the range, while the upper bound
+// `maxval` is excluded.
+//
+// The random integers are slightly biased unless `maxval - minval` is an exact
+// power of two.  The bias is small for values of `maxval - minval` significantly
+// smaller than the range of the output (either `2^32` or `2^64`).
+//
+// Arguments:
+//	shape: The shape of the output tensor.
+//	minval: 0-D.  Inclusive lower bound on the generated integers.
+//	maxval: 0-D.  Exclusive upper bound on the generated integers.
+//
+// Returns A tensor of the specified shape filled with uniform random integers.
+func RandomUniformInt(scope *Scope, shape tf.Output, minval tf.Output, maxval tf.Output, optional ...RandomUniformIntAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "RandomUniformInt",
+		Input: []tf.Input{
+			shape, minval, maxval,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// RandomShuffleAttr is an optional argument to RandomShuffle.
+type RandomShuffleAttr func(optionalAttr)
+
+// RandomShuffleSeed sets the optional seed attribute to value.
+//
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func RandomShuffleSeed(value int64) RandomShuffleAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// RandomShuffleSeed2 sets the optional seed2 attribute to value.
+//
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomShuffleSeed2(value int64) RandomShuffleAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Randomly shuffles a tensor along its first dimension.
+//
+//   The tensor is shuffled along dimension 0, such that each `value[j]` is mapped
+//   to one and only one `output[i]`. For example, a mapping that might occur for a
+//   3x2 tensor is:
+//
+// ```
+// [[1, 2],       [[5, 6],
+//  [3, 4],  ==>   [1, 2],
+//  [5, 6]]        [3, 4]]
+// ```
+//
+// Arguments:
+//	value: The tensor to be shuffled.
+//
+// Returns A tensor of same shape and type as `value`, shuffled along its first
+// dimension.
+func RandomShuffle(scope *Scope, value tf.Output, optional ...RandomShuffleAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "RandomShuffle",
+		Input: []tf.Input{
+			value,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// OrderedMapIncompleteSizeAttr is an optional argument to OrderedMapIncompleteSize.
+type OrderedMapIncompleteSizeAttr func(optionalAttr)
+
+// OrderedMapIncompleteSizeCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func OrderedMapIncompleteSizeCapacity(value int64) OrderedMapIncompleteSizeAttr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
+	}
+}
+
+// OrderedMapIncompleteSizeMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func OrderedMapIncompleteSizeMemoryLimit(value int64) OrderedMapIncompleteSizeAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// OrderedMapIncompleteSizeContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func OrderedMapIncompleteSizeContainer(value string) OrderedMapIncompleteSizeAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// OrderedMapIncompleteSizeSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func OrderedMapIncompleteSizeSharedName(value string) OrderedMapIncompleteSizeAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op returns the number of incomplete elements in the underlying container.
+func OrderedMapIncompleteSize(scope *Scope, dtypes []tf.DataType, optional ...OrderedMapIncompleteSizeAttr) (size tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtypes": dtypes}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "OrderedMapIncompleteSize",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ComplexAttr is an optional argument to Complex.
+type ComplexAttr func(optionalAttr)
+
+// ComplexTout sets the optional Tout attribute to value.
+// If not specified, defaults to DT_COMPLEX64
+func ComplexTout(value tf.DataType) ComplexAttr {
+	return func(m optionalAttr) {
+		m["Tout"] = value
+	}
+}
+
+// Converts two real numbers to a complex number.
+//
+// Given a tensor `real` representing the real part of a complex number, and a
+// tensor `imag` representing the imaginary part of a complex number, this
+// operation returns complex numbers elementwise of the form \\(a + bj\\), where
+// *a* represents the `real` part and *b* represents the `imag` part.
+//
+// The input tensors `real` and `imag` must have the same shape.
+//
+// For example:
+//
+// ```
+// # tensor 'real' is [2.25, 3.25]
+// # tensor `imag` is [4.75, 5.75]
+// tf.complex(real, imag) ==> [[2.25 + 4.75j], [3.25 + 5.75j]]
+// ```
+func Complex(scope *Scope, real tf.Output, imag tf.Output, optional ...ComplexAttr) (out tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Complex",
+		Input: []tf.Input{
+			real, imag,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ImagAttr is an optional argument to Imag.
+type ImagAttr func(optionalAttr)
+
+// ImagTout sets the optional Tout attribute to value.
+// If not specified, defaults to DT_FLOAT
+func ImagTout(value tf.DataType) ImagAttr {
+	return func(m optionalAttr) {
+		m["Tout"] = value
+	}
+}
+
+// Returns the imaginary part of a complex number.
+//
+// Given a tensor `input` of complex numbers, this operation returns a tensor of
+// type `float` that is the imaginary part of each element in `input`. All
+// elements in `input` must be complex numbers of the form \\(a + bj\\), where *a*
+// is the real part and *b* is the imaginary part returned by this operation.
+//
+// For example:
+//
+// ```
+// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
+// tf.imag(input) ==> [4.75, 5.75]
+// ```
+func Imag(scope *Scope, input tf.Output, optional ...ImagAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Imag",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Computes the maximum along segments of a tensor.
+//
+// Read @{$math_ops#Segmentation$the section on segmentation} for an explanation of
+// segments.
+//
+// Computes a tensor such that
+// \\(output_i = \max_j(data_j)\\) where `max` is over `j` such
+// that `segment_ids[j] == i`.
+//
+// If the max is empty for a given segment ID `i`, `output[i] = 0`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src="https://www.tensorflow.org/images/SegmentMax.png" alt>
+// </div>
+//
+// Arguments:
+//
+//	segment_ids: A 1-D tensor whose rank is equal to the rank of `data`'s
+// first dimension.  Values should be sorted and can be repeated.
+//
+// Returns Has same shape as data, except for dimension 0 which
+// has size `k`, the number of segments.
+func SegmentMax(scope *Scope, data tf.Output, segment_ids tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SegmentMax",
+		Input: []tf.Input{
+			data, segment_ids,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// RandomUniformIntAttr is an optional argument to RandomUniformInt.
-type RandomUniformIntAttr func(optionalAttr)
-
-// RandomUniformIntSeed sets the optional seed attribute to value.
-//
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func RandomUniformIntSeed(value int64) RandomUniformIntAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
+// Computes hyperbolic tangent of `x` element-wise.
+func Tanh(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// RandomUniformIntSeed2 sets the optional seed2 attribute to value.
-//
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomUniformIntSeed2(value int64) RandomUniformIntAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
+	opspec := tf.OpSpec{
+		Type: "Tanh",
+		Input: []tf.Input{
+			x,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Outputs random integers from a uniform distribution.
+// Creates a dataset that skips `count` elements from the `input_dataset`.
 //
-// The generated values are uniform integers in the range `[minval, maxval)`.
-// The lower bound `minval` is included in the range, while the upper bound
-// `maxval` is excluded.
+// Arguments:
 //
-// The random integers are slightly biased unless `maxval - minval` is an exact
-// power of two.  The bias is small for values of `maxval - minval` significantly
-// smaller than the range of the output (either `2^32` or `2^64`).
+//	count: A scalar representing the number of elements from the `input_dataset`
+// that should be skipped.  If count is -1, skips everything.
 //
-// Arguments:
-//	shape: The shape of the output tensor.
-//	minval: 0-D.  Inclusive lower bound on the generated integers.
-//	maxval: 0-D.  Exclusive upper bound on the generated integers.
 //
-// Returns A tensor of the specified shape filled with uniform random integers.
-func RandomUniformInt(scope *Scope, shape tf.Output, minval tf.Output, maxval tf.Output, optional ...RandomUniformIntAttr) (output tf.Output) {
+func SkipDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "RandomUniformInt",
+		Type: "SkipDataset",
 		Input: []tf.Input{
-			shape, minval, maxval,
+			input_dataset, count,
 		},
 		Attrs: attrs,
 	}
@@ -19585,49 +19677,31 @@ func RandomUniformInt(scope *Scope, shape tf.Output, minval tf.Output, maxval tf
 	return op.Output(0)
 }
 
-// RandomShuffleAttr is an optional argument to RandomShuffle.
-type RandomShuffleAttr func(optionalAttr)
+// RealAttr is an optional argument to Real.
+type RealAttr func(optionalAttr)
 
-// RandomShuffleSeed sets the optional seed attribute to value.
-//
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func RandomShuffleSeed(value int64) RandomShuffleAttr {
+// RealTout sets the optional Tout attribute to value.
+// If not specified, defaults to DT_FLOAT
+func RealTout(value tf.DataType) RealAttr {
 	return func(m optionalAttr) {
-		m["seed"] = value
+		m["Tout"] = value
 	}
 }
 
-// RandomShuffleSeed2 sets the optional seed2 attribute to value.
+// Returns the real part of a complex number.
 //
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomShuffleSeed2(value int64) RandomShuffleAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// Randomly shuffles a tensor along its first dimension.
+// Given a tensor `input` of complex numbers, this operation returns a tensor of
+// type `float` that is the real part of each element in `input`. All elements in
+// `input` must be complex numbers of the form \\(a + bj\\), where *a* is the real
+//  part returned by this operation and *b* is the imaginary part.
 //
-//   The tensor is shuffled along dimension 0, such that each `value[j]` is mapped
-//   to one and only one `output[i]`. For example, a mapping that might occur for a
-//   3x2 tensor is:
+// For example:
 //
 // ```
-// [[1, 2],       [[5, 6],
-//  [3, 4],  ==>   [1, 2],
-//  [5, 6]]        [3, 4]]
+// # tensor 'input' is [-2.25 + 4.75j, 3.25 + 5.75j]
+// tf.real(input) ==> [-2.25, 3.25]
 // ```
-//
-// Arguments:
-//	value: The tensor to be shuffled.
-//
-// Returns A tensor of same shape and type as `value`, shuffled along its first
-// dimension.
-func RandomShuffle(scope *Scope, value tf.Output, optional ...RandomShuffleAttr) (output tf.Output) {
+func Real(scope *Scope, input tf.Output, optional ...RealAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -19636,9 +19710,9 @@ func RandomShuffle(scope *Scope, value tf.Output, optional ...RandomShuffleAttr)
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "RandomShuffle",
+		Type: "Real",
 		Input: []tf.Input{
-			value,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -19646,57 +19720,54 @@ func RandomShuffle(scope *Scope, value tf.Output, optional ...RandomShuffleAttr)
 	return op.Output(0)
 }
 
-// OrderedMapIncompleteSizeAttr is an optional argument to OrderedMapIncompleteSize.
-type OrderedMapIncompleteSizeAttr func(optionalAttr)
+// ResizeAreaAttr is an optional argument to ResizeArea.
+type ResizeAreaAttr func(optionalAttr)
 
-// OrderedMapIncompleteSizeCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
+// ResizeAreaAlignCorners sets the optional align_corners attribute to value.
 //
-// REQUIRES: value >= 0
-func OrderedMapIncompleteSizeCapacity(value int64) OrderedMapIncompleteSizeAttr {
+// value: If true, the centers of the 4 corner pixels of the input and output tensors are
+// aligned, preserving the values at the corner pixels. Defaults to false.
+// If not specified, defaults to false
+func ResizeAreaAlignCorners(value bool) ResizeAreaAttr {
 	return func(m optionalAttr) {
-		m["capacity"] = value
+		m["align_corners"] = value
 	}
 }
 
-// OrderedMapIncompleteSizeMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
+// Resize `images` to `size` using area interpolation.
 //
-// REQUIRES: value >= 0
-func OrderedMapIncompleteSizeMemoryLimit(value int64) OrderedMapIncompleteSizeAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// OrderedMapIncompleteSizeContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func OrderedMapIncompleteSizeContainer(value string) OrderedMapIncompleteSizeAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// OrderedMapIncompleteSizeSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func OrderedMapIncompleteSizeSharedName(value string) OrderedMapIncompleteSizeAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Op returns the number of incomplete elements in the underlying container.
-func OrderedMapIncompleteSize(scope *Scope, dtypes []tf.DataType, optional ...OrderedMapIncompleteSizeAttr) (size tf.Output) {
+// Input images can be of different types but output images are always float.
+//
+// The range of pixel values for the output image might be slightly different
+// from the range for the input image because of limited numerical precision.
+// To guarantee an output range, for example `[0.0, 1.0]`, apply
+// `tf.clip_by_value` to the output.
+//
+// Each output pixel is computed by first transforming the pixel's footprint into
+// the input tensor and then averaging the pixels that intersect the footprint. An
+// input pixel's contribution to the average is weighted by the fraction of its
+// area that intersects the footprint.  This is the same as OpenCV's INTER_AREA.
+//
+// Arguments:
+//	images: 4-D with shape `[batch, height, width, channels]`.
+//	size: = A 1-D int32 Tensor of 2 elements: `new_height, new_width`.  The
+// new size for the images.
+//
+// Returns 4-D with shape
+// `[batch, new_height, new_width, channels]`.
+func ResizeArea(scope *Scope, images tf.Output, size tf.Output, optional ...ResizeAreaAttr) (resized_images tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "OrderedMapIncompleteSize",
-
+		Type: "ResizeArea",
+		Input: []tf.Input{
+			images, size,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
@@ -30639,74 +30710,3 @@ func UnravelIndex(scope *Scope, indices tf.Output, dims tf.Output) (output tf.Ou
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
-
-// Compute the lower regularized incomplete Gamma function `Q(a, x)`.
-//
-// The lower regularized incomplete Gamma function is defined as:
-//
-//
-// \\(P(a, x) = gamma(a, x) / Gamma(a) = 1 - Q(a, x)\\)
-//
-// where
-//
-// \\(gamma(a, x) = int_{0}^{x} t^{a-1} exp(-t) dt\\)
-//
-// is the lower incomplete Gamma function.
-//
-// Note, above `Q(a, x)` (`Igammac`) is the upper regularized complete
-// Gamma function.
-func Igamma(scope *Scope, a tf.Output, x tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Igamma",
-		Input: []tf.Input{
-			a, x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes offsets of concat inputs within its output.
-//
-// For example:
-//
-// ```
-// # 'x' is [2, 2, 7]
-// # 'y' is [2, 3, 7]
-// # 'z' is [2, 5, 7]
-// concat_offset(2, [x, y, z]) => [0, 0, 0], [0, 2, 0], [0, 5, 0]
-// ```
-//
-// This is typically used by gradient computations for a concat operation.
-//
-// Arguments:
-//	concat_dim: The dimension along which to concatenate.
-//	shape: The `N` int32 vectors representing shape of tensors being concatenated.
-//
-// Returns The `N` int32 vectors representing the starting offset
-// of input tensors within the concatenated output.
-func ConcatOffset(scope *Scope, concat_dim tf.Output, shape []tf.Output) (offset []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ConcatOffset",
-		Input: []tf.Input{
-			concat_dim, tf.OutputList(shape),
-		},
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if offset, idx, err = makeOutputList(op, idx, "offset"); err != nil {
-		scope.UpdateErr("ConcatOffset", err)
-		return
-	}
-	return offset
-}
-- 
GitLab


From 7076ae10ed39d7e1870595347e11f3a99b9410d0 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 12 Jun 2018 05:15:55 -0700
Subject: [PATCH 591/610] Unify cuDNN descriptor wrapper names. No functional
 changes.

PiperOrigin-RevId: 200199956
---
 tensorflow/stream_executor/cuda/cuda_dnn.cc | 255 ++++++++++----------
 1 file changed, 124 insertions(+), 131 deletions(-)

diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc
index 48afc06e32..d4f2fd2625 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.cc
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc
@@ -495,10 +495,10 @@ PersistentRnnPlan CreatePersistentRnnPlan(cudnnRNNDescriptor_t rnn_desc,
 
 // Turns a BatchDescriptor structure into a cudnn tensor handle within a
 // scope.
-class ScopedTensorDescriptor {
+class CudnnTensorDescriptor {
  public:
-  ScopedTensorDescriptor(const dnn::BatchDescriptor& batch_descriptor,
-                         cudnnDataType_t elem_type)
+  CudnnTensorDescriptor(const dnn::BatchDescriptor& batch_descriptor,
+                        cudnnDataType_t elem_type)
       : handle_(CreateTensorDescriptor()) {
     switch (batch_descriptor.layout()) {
       case dnn::DataLayout::kBatchYXDepth:
@@ -540,15 +540,15 @@ class ScopedTensorDescriptor {
  private:
   TensorDescriptor handle_;
 
-  SE_DISALLOW_COPY_AND_ASSIGN(ScopedTensorDescriptor);
+  SE_DISALLOW_COPY_AND_ASSIGN(CudnnTensorDescriptor);
 };
 
 // Turns a FilterDescriptor structure into a cudnn filter handle within a
 // scope.
-class ScopedFilterDescriptor {
+class CudnnFilterDescriptor {
  public:
-  ScopedFilterDescriptor(const dnn::FilterDescriptor& filter_descriptor,
-                         cudnnDataType_t elem_type)
+  CudnnFilterDescriptor(const dnn::FilterDescriptor& filter_descriptor,
+                        cudnnDataType_t elem_type)
       : handle_(CreateFilterDescriptor()) {
     // TODO(b/23032134): Even if the filter layout is not supported,
     // cudnnSetFilter4DDescriptor_v4 will return CUDNN_STATUS_SUCCESS because
@@ -586,7 +586,7 @@ class ScopedFilterDescriptor {
  private:
   FilterDescriptor handle_;  // Owned.
 
-  SE_DISALLOW_COPY_AND_ASSIGN(ScopedFilterDescriptor);
+  SE_DISALLOW_COPY_AND_ASSIGN(CudnnFilterDescriptor);
 };
 
 // A helper function to decide whether to enable the TENSOR_OP_MATH math type
@@ -636,9 +636,9 @@ bool BatchnormSpatialPersistentEnabled() {
 
 // Turns a ConvolutionDescriptor structure into a cudnn convolution handle
 // within a scope.
-class ScopedConvolutionDescriptor {
+class CudnnConvolutionDescriptor {
  public:
-  ScopedConvolutionDescriptor(
+  CudnnConvolutionDescriptor(
       const dnn::ConvolutionDescriptor& convolution_descriptor,
       cudnnDataType_t data_type)
       : handle_(CreateConvolutionDescriptor()) {
@@ -700,14 +700,14 @@ class ScopedConvolutionDescriptor {
  private:
   ConvolutionDescriptor handle_;  // Owned.
 
-  SE_DISALLOW_COPY_AND_ASSIGN(ScopedConvolutionDescriptor);
+  SE_DISALLOW_COPY_AND_ASSIGN(CudnnConvolutionDescriptor);
 };
 
 // Turns a PoolingDescriptor structure into a cudnn pooling descriptor handle
 // within a scope.
-class ScopedPoolingDescriptor {
+class CudnnPoolingDescriptor {
  public:
-  explicit ScopedPoolingDescriptor(
+  explicit CudnnPoolingDescriptor(
       const dnn::PoolingDescriptor& pooling_descriptor)
       : handle_(CreatePoolingDescriptor()) {
     const std::vector<int64> strides64 = pooling_descriptor.strides();
@@ -739,13 +739,13 @@ class ScopedPoolingDescriptor {
  private:
   PoolingDescriptor handle_;  // Owned.
 
-  SE_DISALLOW_COPY_AND_ASSIGN(ScopedPoolingDescriptor);
+  SE_DISALLOW_COPY_AND_ASSIGN(CudnnPoolingDescriptor);
 };
 
 // Turns a NormalizeDescriptor structure into a cudnn LRN descriptor handle.
-class ScopedNormalizeDescriptor {
+class CudnnNormalizeDescriptor {
  public:
-  explicit ScopedNormalizeDescriptor(
+  explicit CudnnNormalizeDescriptor(
       const dnn::NormalizeDescriptor& normalize_descriptor)
       : handle_(CreateLrnDescriptor()) {
     // The range specifies that the indices in the closed range
@@ -777,16 +777,16 @@ class ScopedNormalizeDescriptor {
  private:
   LrnDescriptor handle_;  // Owned.
 
-  SE_DISALLOW_COPY_AND_ASSIGN(ScopedNormalizeDescriptor);
+  SE_DISALLOW_COPY_AND_ASSIGN(CudnnNormalizeDescriptor);
 };
 
 // Turns a ActivationDescriptor structure into a cudnn activation
 // descriptor handle within a scope.
-class ScopedActivationDescriptor {
+class CudnnActivationDescriptor {
  public:
-  ScopedActivationDescriptor(dnn::ActivationMode activation_mode,
-                             cudnnNanPropagation_t nan_propagation,
-                             double value_max)
+  CudnnActivationDescriptor(dnn::ActivationMode activation_mode,
+                            cudnnNanPropagation_t nan_propagation,
+                            double value_max)
       : handle_(CreateActivationDescriptor()) {
     double relu_ceiling = 0.0;
     cudnnActivationMode_t mode;
@@ -822,7 +822,7 @@ class ScopedActivationDescriptor {
  private:
   ActivationDescriptor handle_;  // Owned.
 
-  SE_DISALLOW_COPY_AND_ASSIGN(ScopedActivationDescriptor);
+  SE_DISALLOW_COPY_AND_ASSIGN(CudnnActivationDescriptor);
 };
 
 cudnnDataType_t ToCudnnDataType(
@@ -888,21 +888,21 @@ int CudnnDataTypeToByteSize(cudnnDataType_t data_type) {
   }
 }
 
-class ScopedDropoutDescriptor {
-  explicit ScopedDropoutDescriptor(DropoutDescriptor handle)
+class CudnnDropoutDescriptor {
+  explicit CudnnDropoutDescriptor(DropoutDescriptor handle)
       : handle_(std::move(handle)) {}
 
  public:
-  ScopedDropoutDescriptor(ScopedDropoutDescriptor&&) = default;
+  CudnnDropoutDescriptor(CudnnDropoutDescriptor&&) = default;
 
-  static port::StatusOr<ScopedDropoutDescriptor> Create(
+  static port::StatusOr<CudnnDropoutDescriptor> Create(
       const CudnnHandle& cudnn, float dropout, uint64 seed,
       ScratchAllocator* state_allocator) {
     DropoutDescriptor handle = CreateDropoutDescriptor();
 
     if (dropout == 0.0f) {
       // Return 'empty' dropout descriptor.
-      return ScopedDropoutDescriptor(std::move(handle));
+      return CudnnDropoutDescriptor(std::move(handle));
     }
 
     DeviceMemory<uint8> state_memory;
@@ -917,14 +917,14 @@ class ScopedDropoutDescriptor {
         handle.get(), cudnn.handle(), dropout, state_memory.opaque(),
         state_memory.size(), seed));
 
-    return ScopedDropoutDescriptor(std::move(handle));
+    return CudnnDropoutDescriptor(std::move(handle));
   }
 
   cudnnDropoutDescriptor_t handle() const { return handle_.get(); }
 
  private:
   DropoutDescriptor handle_;  // Owned.
-  SE_DISALLOW_COPY_AND_ASSIGN(ScopedDropoutDescriptor);
+  SE_DISALLOW_COPY_AND_ASSIGN(CudnnDropoutDescriptor);
 };
 
 class CudnnRnnParamsDescriptor {
@@ -973,7 +973,7 @@ class CudnnRnnDescriptor : public dnn::RnnDescriptor {
                      cudnnRNNMode_t rnn_mode, cudnnDataType_t data_type,
                      cudnnDataType_t compute_type,
                      const dnn::AlgorithmConfig& algorithm_config,
-                     ScopedDropoutDescriptor dropout_desc,
+                     CudnnDropoutDescriptor dropout_desc,
                      CudnnRnnParamsDescriptor params_desc)
       : rnn_desc_(std::move(rnn_desc)),
         rnn_plan_(std::move(rnn_plan)),
@@ -1002,8 +1002,8 @@ class CudnnRnnDescriptor : public dnn::RnnDescriptor {
       const dnn::AlgorithmConfig& algorithm_config, float dropout, uint64 seed,
       ScratchAllocator* state_allocator) {
     SE_ASSIGN_OR_RETURN(
-        ScopedDropoutDescriptor dropout_desc,
-        ScopedDropoutDescriptor::Create(cudnn, dropout, seed, state_allocator));
+        CudnnDropoutDescriptor dropout_desc,
+        CudnnDropoutDescriptor::Create(cudnn, dropout, seed, state_allocator));
 
     cuda::RnnDescriptor rnn_desc = CreateRnnDescriptor();
     cudnnRNNAlgo_t rnn_algo = ToCudnnRNNAlgo(algorithm_config.algorithm());
@@ -1097,7 +1097,7 @@ class CudnnRnnDescriptor : public dnn::RnnDescriptor {
   cudnnDataType_t data_type_;
   cudnnDataType_t compute_type_;
   dnn::AlgorithmConfig algorithm_config_;
-  ScopedDropoutDescriptor dropout_desc_;
+  CudnnDropoutDescriptor dropout_desc_;
   CudnnRnnParamsDescriptor params_desc_;
   SE_DISALLOW_COPY_AND_ASSIGN(CudnnRnnDescriptor);
 };
@@ -1926,10 +1926,9 @@ namespace {
 // and backward filter.
 
 port::StatusOr<cudnnConvolutionFwdAlgo_t> GetCudnnConvolutionForwardAlgo(
-    const CudnnHandle& cudnn, const ScopedTensorDescriptor& input_nd,
-    const ScopedFilterDescriptor& filter,
-    const ScopedConvolutionDescriptor& conv,
-    const ScopedTensorDescriptor& output_nd, bool specify_workspace_limit,
+    const CudnnHandle& cudnn, const CudnnTensorDescriptor& input_nd,
+    const CudnnFilterDescriptor& filter, const CudnnConvolutionDescriptor& conv,
+    const CudnnTensorDescriptor& output_nd, bool specify_workspace_limit,
     size_t memory_limit_bytes) {
   cudnnConvolutionFwdPreference_t preference =
       specify_workspace_limit ? CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT
@@ -1943,10 +1942,10 @@ port::StatusOr<cudnnConvolutionFwdAlgo_t> GetCudnnConvolutionForwardAlgo(
 
 port::StatusOr<cudnnConvolutionBwdDataAlgo_t>
 GetCudnnConvolutionBackwardDataAlgo(const CudnnHandle& cudnn,
-                                    const ScopedTensorDescriptor& input_nd,
-                                    const ScopedFilterDescriptor& filter,
-                                    const ScopedConvolutionDescriptor& conv,
-                                    const ScopedTensorDescriptor& output_nd,
+                                    const CudnnTensorDescriptor& input_nd,
+                                    const CudnnFilterDescriptor& filter,
+                                    const CudnnConvolutionDescriptor& conv,
+                                    const CudnnTensorDescriptor& output_nd,
                                     bool specify_workspace_limit,
                                     size_t memory_limit_bytes) {
   cudnnConvolutionBwdDataPreference_t preference =
@@ -1962,10 +1961,10 @@ GetCudnnConvolutionBackwardDataAlgo(const CudnnHandle& cudnn,
 
 port::StatusOr<cudnnConvolutionBwdFilterAlgo_t>
 GetCudnnConvolutionBackwardFilterAlgo(const CudnnHandle& cudnn,
-                                      const ScopedTensorDescriptor& input_nd,
-                                      const ScopedFilterDescriptor& filter,
-                                      const ScopedConvolutionDescriptor& conv,
-                                      const ScopedTensorDescriptor& output_nd,
+                                      const CudnnTensorDescriptor& input_nd,
+                                      const CudnnFilterDescriptor& filter,
+                                      const CudnnConvolutionDescriptor& conv,
+                                      const CudnnTensorDescriptor& output_nd,
                                       bool specify_workspace_limit,
                                       size_t memory_limit_bytes) {
   cudnnConvolutionBwdFilterPreference_t preference =
@@ -1982,10 +1981,9 @@ GetCudnnConvolutionBackwardFilterAlgo(const CudnnHandle& cudnn,
 port::StatusOr<DeviceMemory<uint8>> AllocateCudnnConvolutionForwardWorkspace(
     Stream* stream, const CudnnHandle& cudnn,
     const dnn::AlgorithmDesc& algorithm_desc,
-    const ScopedTensorDescriptor& input_nd,
-    const ScopedFilterDescriptor& filter,
-    const ScopedConvolutionDescriptor& conv,
-    const ScopedTensorDescriptor& output_nd,
+    const CudnnTensorDescriptor& input_nd, const CudnnFilterDescriptor& filter,
+    const CudnnConvolutionDescriptor& conv,
+    const CudnnTensorDescriptor& output_nd,
     ScratchAllocator* scratch_allocator) {
   // TODO(csigg): This has side effects on the convolution descriptor. It is
   // functionally correct because the convolution is run with the algorithm of
@@ -2025,10 +2023,9 @@ port::StatusOr<DeviceMemory<uint8>>
 AllocateCudnnConvolutionBackwardDataWorkspace(
     Stream* stream, const CudnnHandle& cudnn,
     const dnn::AlgorithmDesc& algorithm_desc,
-    const ScopedTensorDescriptor& input_nd,
-    const ScopedFilterDescriptor& filter,
-    const ScopedConvolutionDescriptor& conv,
-    const ScopedTensorDescriptor& output_nd,
+    const CudnnTensorDescriptor& input_nd, const CudnnFilterDescriptor& filter,
+    const CudnnConvolutionDescriptor& conv,
+    const CudnnTensorDescriptor& output_nd,
     ScratchAllocator* scratch_allocator) {
   // TODO(csigg): This has side effects on the convolution descriptor. It is
   // functionally correct because the convolution is run with the algorithm of
@@ -2070,10 +2067,9 @@ port::StatusOr<DeviceMemory<uint8>>
 AllocateCudnnConvolutionBackwardFilterWorkspace(
     Stream* stream, const CudnnHandle& cudnn,
     const dnn::AlgorithmDesc& algorithm_desc,
-    const ScopedTensorDescriptor& input_nd,
-    const ScopedFilterDescriptor& filter,
-    const ScopedConvolutionDescriptor& conv,
-    const ScopedTensorDescriptor& output_nd,
+    const CudnnTensorDescriptor& input_nd, const CudnnFilterDescriptor& filter,
+    const CudnnConvolutionDescriptor& conv,
+    const CudnnTensorDescriptor& output_nd,
     ScratchAllocator* scratch_allocator) {
   // TODO(csigg): This has side effects on the convolution descriptor. It is
   // functionally correct because the convolution is run with the algorithm of
@@ -2114,11 +2110,10 @@ AllocateCudnnConvolutionBackwardFilterWorkspace(
 port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionForwardAlgorithm(
     Stream* stream, const CudnnHandle& cudnn,
     const dnn::AlgorithmConfig& algorithm_config,
-    const ScopedTensorDescriptor& input_nd,
-    const ScopedFilterDescriptor& filter,
-    const ScopedConvolutionDescriptor& conv,
-    const ScopedTensorDescriptor& output_nd,
-    ScratchAllocator* scratch_allocator, DeviceMemory<uint8>* scratch) {
+    const CudnnTensorDescriptor& input_nd, const CudnnFilterDescriptor& filter,
+    const CudnnConvolutionDescriptor& conv,
+    const CudnnTensorDescriptor& output_nd, ScratchAllocator* scratch_allocator,
+    DeviceMemory<uint8>* scratch) {
   dnn::AlgorithmDesc algo_desc = algorithm_config.algorithm();
   if (algorithm_config.algorithm().is_default()) {
     // Pick fastest algorithm within memory limit according to cuDNN's
@@ -2164,11 +2159,10 @@ port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionForwardAlgorithm(
 port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionBackwardDataAlgorithm(
     Stream* stream, const CudnnHandle& cudnn,
     const dnn::AlgorithmConfig& algorithm_config,
-    const ScopedTensorDescriptor& input_nd,
-    const ScopedFilterDescriptor& filter,
-    const ScopedConvolutionDescriptor& conv,
-    const ScopedTensorDescriptor& output_nd,
-    ScratchAllocator* scratch_allocator, DeviceMemory<uint8>* scratch) {
+    const CudnnTensorDescriptor& input_nd, const CudnnFilterDescriptor& filter,
+    const CudnnConvolutionDescriptor& conv,
+    const CudnnTensorDescriptor& output_nd, ScratchAllocator* scratch_allocator,
+    DeviceMemory<uint8>* scratch) {
   dnn::AlgorithmDesc algo_desc = algorithm_config.algorithm();
   if (algorithm_config.algorithm().is_default()) {
     // Pick fastest algorithm within memory limit according to cuDNN's
@@ -2214,11 +2208,10 @@ port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionBackwardDataAlgorithm(
 port::StatusOr<dnn::AlgorithmDesc> GetCudnnConvolutionBackwardFilterAlgorithm(
     Stream* stream, const CudnnHandle& cudnn,
     const dnn::AlgorithmConfig& algorithm_config,
-    const ScopedTensorDescriptor& input_nd,
-    const ScopedFilterDescriptor& filter,
-    const ScopedConvolutionDescriptor& conv,
-    const ScopedTensorDescriptor& output_nd,
-    ScratchAllocator* scratch_allocator, DeviceMemory<uint8>* scratch) {
+    const CudnnTensorDescriptor& input_nd, const CudnnFilterDescriptor& filter,
+    const CudnnConvolutionDescriptor& conv,
+    const CudnnTensorDescriptor& output_nd, ScratchAllocator* scratch_allocator,
+    DeviceMemory<uint8>* scratch) {
   dnn::AlgorithmDesc algo_desc = algorithm_config.algorithm();
   if (algorithm_config.algorithm().is_default()) {
     // Pick fastest algorithm within memory limit according to cuDNN's
@@ -2387,11 +2380,11 @@ port::Status CudnnSupport::DoConvolveImpl(
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
   cudnnDataType_t cudnn_type = GetCudnnDataType<T>();
-  ScopedTensorDescriptor input_nd(input_descriptor, cudnn_type);
-  ScopedTensorDescriptor output_nd(output_descriptor, cudnn_type);
-  ScopedFilterDescriptor filter(filter_descriptor, cudnn_type);
-  ScopedConvolutionDescriptor conv(convolution_descriptor,
-                                   GetConvComputeType<T>());
+  CudnnTensorDescriptor input_nd(input_descriptor, cudnn_type);
+  CudnnTensorDescriptor output_nd(output_descriptor, cudnn_type);
+  CudnnFilterDescriptor filter(filter_descriptor, cudnn_type);
+  CudnnConvolutionDescriptor conv(convolution_descriptor,
+                                  GetConvComputeType<T>());
 
   auto cudnn = cudnn_->GetHandle(parent_, stream);
   // Alpha is the scaling factor for input.
@@ -2493,14 +2486,14 @@ port::Status CudnnSupport::DoFusedConvolveImpl(
                         "Relu activation.");
   }
 
-  ScopedTensorDescriptor conv_input_nd(
+  CudnnTensorDescriptor conv_input_nd(
       conv_input_descriptor, static_cast<cudnnDataType_t>(cudnn_data_type));
-  ScopedTensorDescriptor output_nd(
+  CudnnTensorDescriptor output_nd(
       output_descriptor, static_cast<cudnnDataType_t>(cudnn_data_type));
-  ScopedFilterDescriptor filter(filter_descriptor,
-                                static_cast<cudnnDataType_t>(cudnn_data_type));
-  ScopedTensorDescriptor bias_nd(bias_descriptor, CUDNN_DATA_FLOAT);
-  ScopedConvolutionDescriptor conv(
+  CudnnFilterDescriptor filter(filter_descriptor,
+                               static_cast<cudnnDataType_t>(cudnn_data_type));
+  CudnnTensorDescriptor bias_nd(bias_descriptor, CUDNN_DATA_FLOAT);
+  CudnnConvolutionDescriptor conv(
       convolution_descriptor, static_cast<cudnnDataType_t>(cudnn_compute_type));
 
   auto cudnn = cudnn_->GetHandle(parent_, stream);
@@ -2528,7 +2521,7 @@ port::Status CudnnSupport::DoFusedConvolveImpl(
   // activation descriptor. Note that this will change the nan propagation
   // behavior from separate conv, bias, and relu (which by default is
   // CUDNN_PROPAGATE_NAN.
-  ScopedActivationDescriptor activation_desc(
+  CudnnActivationDescriptor activation_desc(
       activation_mode, CUDNN_NOT_PROPAGATE_NAN, output_descriptor.value_max());
   auto side_input_data_ptr = (side_input_scale == 0) ? output_data->opaque()
                                                      : side_input_data.opaque();
@@ -2740,8 +2733,8 @@ port::Status CudnnSupport::DoBatchNormalizationForwardImpl(
     DeviceMemory<U>* saved_mean, DeviceMemory<U>* saved_inv_var,
     bool is_training, std::function<const DeviceMemory<U>&()> var_to_inv_var,
     std::function<void()> inv_var_to_var) {
-  ScopedTensorDescriptor x_descriptor(x_desc, ToCudnnDataType(input_data_type));
-  ScopedTensorDescriptor scale_offset_descriptor(
+  CudnnTensorDescriptor x_descriptor(x_desc, ToCudnnDataType(input_data_type));
+  CudnnTensorDescriptor scale_offset_descriptor(
       scale_offset_desc, ToCudnnDataType(scale_data_type));
   cudnnBatchNormMode_t mode = CUDNN_BATCHNORM_SPATIAL;
 #if CUDNN_VERSION >= 7000
@@ -2825,9 +2818,9 @@ port::Status CudnnSupport::DoBatchNormalizationBackwardImpl(
     const dnn::BatchDescriptor& scale_offset_desc, const double epsilon,
     DeviceMemory<T>* x_backprop, DeviceMemory<U>* scale_backprop,
     DeviceMemory<U>* offset_backprop) {
-  ScopedTensorDescriptor x_descriptor(
+  CudnnTensorDescriptor x_descriptor(
       x_desc, static_cast<cudnnDataType_t>(cudnn_input_type));
-  ScopedTensorDescriptor scale_offset_descriptor(
+  CudnnTensorDescriptor scale_offset_descriptor(
       scale_offset_desc, static_cast<cudnnDataType_t>(cudnn_scale_type));
   cudnnBatchNormMode_t mode = CUDNN_BATCHNORM_SPATIAL;
 #if CUDNN_VERSION >= 7000
@@ -3017,9 +3010,9 @@ bool CudnnSupport::DoTransformTensor(Stream* stream,
                                      dnn::DataType output_type, float scale,
                                      DeviceMemoryBase* output_data) {
   float beta = 0.0f;
-  ScopedTensorDescriptor input_tensor_desc(
+  CudnnTensorDescriptor input_tensor_desc(
       input_desc, ToCudnnDataType(input_type, input_desc.layout()));
-  ScopedTensorDescriptor output_tensor_desc(
+  CudnnTensorDescriptor output_tensor_desc(
       output_desc, ToCudnnDataType(output_type, output_desc.layout()));
   auto cudnn = cudnn_->GetHandle(parent_, stream);
   auto status = [&] {
@@ -3056,11 +3049,11 @@ port::Status CudnnSupport::DoConvolveBackwardDataImpl(
 
   auto cudnn = cudnn_->GetHandle(parent_, stream);
 
-  ScopedTensorDescriptor out_back_nd(output_descriptor, cudnn_type);
-  ScopedTensorDescriptor in_back_nd(input_descriptor, cudnn_type);
-  ScopedFilterDescriptor filter(filter_descriptor, cudnn_type);
-  ScopedConvolutionDescriptor conv(convolution_descriptor,
-                                   GetConvComputeType<T>());
+  CudnnTensorDescriptor out_back_nd(output_descriptor, cudnn_type);
+  CudnnTensorDescriptor in_back_nd(input_descriptor, cudnn_type);
+  CudnnFilterDescriptor filter(filter_descriptor, cudnn_type);
+  CudnnConvolutionDescriptor conv(convolution_descriptor,
+                                  GetConvComputeType<T>());
 
   const bool is_profiling = output_profile_result != nullptr;
 
@@ -3192,11 +3185,11 @@ port::Status CudnnSupport::DoConvolveBackwardFilterImpl(
 
   auto cudnn = cudnn_->GetHandle(parent_, stream);
 
-  ScopedTensorDescriptor out_back_nd(output_descriptor, cudnn_type);
-  ScopedTensorDescriptor input_nd(input_descriptor, cudnn_type);
-  ScopedFilterDescriptor filter(filter_descriptor, cudnn_type);
-  ScopedConvolutionDescriptor conv(convolution_descriptor,
-                                   GetConvComputeType<T>());
+  CudnnTensorDescriptor out_back_nd(output_descriptor, cudnn_type);
+  CudnnTensorDescriptor input_nd(input_descriptor, cudnn_type);
+  CudnnFilterDescriptor filter(filter_descriptor, cudnn_type);
+  CudnnConvolutionDescriptor conv(convolution_descriptor,
+                                  GetConvComputeType<T>());
 
   const bool is_profiling = output_profile_result != nullptr;
 
@@ -3338,8 +3331,8 @@ port::Status CudnnSupport::DoConvolveBackwardBiasImpl(
     const dnn::BatchDescriptor& bias_descriptor,
     DeviceMemory<T>* backward_bias_data) {
   cudnnDataType_t cudnn_type = GetCudnnDataType<T>();
-  ScopedTensorDescriptor input_nd(input_descriptor, cudnn_type);
-  ScopedTensorDescriptor bias_nd(bias_descriptor, cudnn_type);
+  CudnnTensorDescriptor input_nd(input_descriptor, cudnn_type);
+  CudnnTensorDescriptor bias_nd(bias_descriptor, cudnn_type);
 
   // Alpha is the scaling factor for input.
   float alpha = 1.0;
@@ -3526,7 +3519,7 @@ bool CudnnSupport::DoBiasAdd(Stream* stream,
                              const DeviceMemory<float>& biases,
                              const dnn::BatchDescriptor& dimensions,
                              DeviceMemory<float>* output_data) {
-  ScopedTensorDescriptor input_descriptor(dimensions, CUDNN_DATA_FLOAT);
+  CudnnTensorDescriptor input_descriptor(dimensions, CUDNN_DATA_FLOAT);
 
   dnn::BatchDescriptor bias_dimensions;
   bias_dimensions.set_count(1)
@@ -3534,7 +3527,7 @@ bool CudnnSupport::DoBiasAdd(Stream* stream,
       .set_height(1)
       .set_width(1)
       .set_layout(dnn::DataLayout::kBatchYXDepth);
-  ScopedTensorDescriptor bias_descriptor(bias_dimensions, CUDNN_DATA_FLOAT);
+  CudnnTensorDescriptor bias_descriptor(bias_dimensions, CUDNN_DATA_FLOAT);
 
   // cudnnAddTensor after R3 is in-place, so we need to copy input_data to
   // output_data before doing the addition, unless the input and
@@ -3570,10 +3563,10 @@ bool CudnnSupport::DoActivate(Stream* stream,
                               const DeviceMemory<float>& input_data,
                               DeviceMemory<float>* output_data,
                               uint64 options) {
-  ScopedActivationDescriptor activation_desc(
+  CudnnActivationDescriptor activation_desc(
       activation_mode, CUDNN_PROPAGATE_NAN, dimensions.value_max());
 
-  ScopedTensorDescriptor input_nd(dimensions, CUDNN_DATA_FLOAT);
+  CudnnTensorDescriptor input_nd(dimensions, CUDNN_DATA_FLOAT);
   // Alpha is the input scaling factor.
   float alpha = 1.0;
   // Beta is the output scaling factor.
@@ -3600,9 +3593,9 @@ bool CudnnSupport::DoPoolForward(
   // Beta is the scaling factor for output.
   double beta = 0.0;
 
-  ScopedTensorDescriptor src_desc(input_dimensions, CUDNN_DATA_DOUBLE);
-  ScopedTensorDescriptor dest_desc(output_dimensions, CUDNN_DATA_DOUBLE);
-  ScopedPoolingDescriptor pooling_desc(pooling_dimensions);
+  CudnnTensorDescriptor src_desc(input_dimensions, CUDNN_DATA_DOUBLE);
+  CudnnTensorDescriptor dest_desc(output_dimensions, CUDNN_DATA_DOUBLE);
+  CudnnPoolingDescriptor pooling_desc(pooling_dimensions);
 
   auto cudnn = cudnn_->GetHandle(parent_, stream);
   auto status = [&] {
@@ -3625,9 +3618,9 @@ bool CudnnSupport::DoPoolForward(
   // Beta is the scaling factor for output.
   float beta = 0.0;
 
-  ScopedTensorDescriptor src_desc(input_dimensions, CUDNN_DATA_FLOAT);
-  ScopedTensorDescriptor dest_desc(output_dimensions, CUDNN_DATA_FLOAT);
-  ScopedPoolingDescriptor pooling_desc(pooling_dimensions);
+  CudnnTensorDescriptor src_desc(input_dimensions, CUDNN_DATA_FLOAT);
+  CudnnTensorDescriptor dest_desc(output_dimensions, CUDNN_DATA_FLOAT);
+  CudnnPoolingDescriptor pooling_desc(pooling_dimensions);
 
   auto cudnn = cudnn_->GetHandle(parent_, stream);
   auto status = [&] {
@@ -3650,9 +3643,9 @@ bool CudnnSupport::DoPoolForward(
   // Beta is the scaling factor for output.
   float beta = 0.0;
 
-  ScopedTensorDescriptor src_desc(input_dimensions, CUDNN_DATA_HALF);
-  ScopedTensorDescriptor dest_desc(output_dimensions, CUDNN_DATA_HALF);
-  ScopedPoolingDescriptor pooling_desc(pooling_dimensions);
+  CudnnTensorDescriptor src_desc(input_dimensions, CUDNN_DATA_HALF);
+  CudnnTensorDescriptor dest_desc(output_dimensions, CUDNN_DATA_HALF);
+  CudnnPoolingDescriptor pooling_desc(pooling_dimensions);
   auto cudnn = cudnn_->GetHandle(parent_, stream);
   auto status = [&] {
     RETURN_IF_CUDNN_ERROR(cudnnPoolingForward(
@@ -3676,9 +3669,9 @@ bool CudnnSupport::DoPoolBackward(
   // Beta is the scaling factor for output.
   double beta = 0.0;
 
-  ScopedTensorDescriptor src_desc(input_dimensions, CUDNN_DATA_DOUBLE);
-  ScopedTensorDescriptor dest_desc(output_dimensions, CUDNN_DATA_DOUBLE);
-  ScopedPoolingDescriptor pooling_desc(pooling_dimensions);
+  CudnnTensorDescriptor src_desc(input_dimensions, CUDNN_DATA_DOUBLE);
+  CudnnTensorDescriptor dest_desc(output_dimensions, CUDNN_DATA_DOUBLE);
+  CudnnPoolingDescriptor pooling_desc(pooling_dimensions);
 
   auto cudnn = cudnn_->GetHandle(parent_, stream);
   auto status = [&] {
@@ -3705,9 +3698,9 @@ bool CudnnSupport::DoPoolBackward(
   // Beta is the scaling factor for output.
   float beta = 0.0;
 
-  ScopedTensorDescriptor src_desc(input_dimensions, CUDNN_DATA_FLOAT);
-  ScopedTensorDescriptor dest_desc(output_dimensions, CUDNN_DATA_FLOAT);
-  ScopedPoolingDescriptor pooling_desc(pooling_dimensions);
+  CudnnTensorDescriptor src_desc(input_dimensions, CUDNN_DATA_FLOAT);
+  CudnnTensorDescriptor dest_desc(output_dimensions, CUDNN_DATA_FLOAT);
+  CudnnPoolingDescriptor pooling_desc(pooling_dimensions);
 
   auto cudnn = cudnn_->GetHandle(parent_, stream);
   auto status = [&] {
@@ -3734,9 +3727,9 @@ bool CudnnSupport::DoPoolBackward(
   // Beta is the scaling factor for output.
   float beta = 0.0;
 
-  ScopedTensorDescriptor src_desc(input_dimensions, CUDNN_DATA_HALF);
-  ScopedTensorDescriptor dest_desc(output_dimensions, CUDNN_DATA_HALF);
-  ScopedPoolingDescriptor pooling_desc(pooling_dimensions);
+  CudnnTensorDescriptor src_desc(input_dimensions, CUDNN_DATA_HALF);
+  CudnnTensorDescriptor dest_desc(output_dimensions, CUDNN_DATA_HALF);
+  CudnnPoolingDescriptor pooling_desc(pooling_dimensions);
 
   auto cudnn = cudnn_->GetHandle(parent_, stream);
   auto status = [&] {
@@ -3771,8 +3764,8 @@ bool CudnnSupport::DoNormalizeWithDimensions(
     return false;
   }
 
-  ScopedTensorDescriptor dims(dimensions, CUDNN_DATA_FLOAT);
-  ScopedNormalizeDescriptor normalize(normalize_descriptor);
+  CudnnTensorDescriptor dims(dimensions, CUDNN_DATA_FLOAT);
+  CudnnNormalizeDescriptor normalize(normalize_descriptor);
 
   // Alpha is the scaling factor for input.
   float alpha = 1.0f;
@@ -3808,8 +3801,8 @@ bool CudnnSupport::DoNormalizeBackwardWithDimensions(
     return false;
   }
 
-  ScopedTensorDescriptor dims(dimensions, CUDNN_DATA_FLOAT);
-  ScopedNormalizeDescriptor normalize(normalize_descriptor);
+  CudnnTensorDescriptor dims(dimensions, CUDNN_DATA_FLOAT);
+  CudnnNormalizeDescriptor normalize(normalize_descriptor);
 
   float alpha = 1.0f;
   float beta = 0.0f;
@@ -3932,9 +3925,9 @@ bool CudnnSupport::DeriveOutputBatchDescriptor(
     const dnn::FilterDescriptor& filter_descriptor,
     const dnn::ConvolutionDescriptor& convolution_descriptor,
     dnn::BatchDescriptor* output_batch_descriptor) {
-  ScopedTensorDescriptor input_nd(batch_descriptor, CUDNN_DATA_FLOAT);
-  ScopedFilterDescriptor filter(filter_descriptor, CUDNN_DATA_FLOAT);
-  ScopedConvolutionDescriptor conv(convolution_descriptor, CUDNN_DATA_FLOAT);
+  CudnnTensorDescriptor input_nd(batch_descriptor, CUDNN_DATA_FLOAT);
+  CudnnFilterDescriptor filter(filter_descriptor, CUDNN_DATA_FLOAT);
+  CudnnConvolutionDescriptor conv(convolution_descriptor, CUDNN_DATA_FLOAT);
 
   int dn = batch_descriptor.ndims() + 2;
   std::vector<int> dims(dn);  // in BDYX
-- 
GitLab


From cba0c951587bbf93144e4821013dbf5ae6cb5efe Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 12 Jun 2018 05:20:27 -0700
Subject: [PATCH 592/610] Remove OS X code from CUDA stream executor because
 that platform is no longer supported.

PiperOrigin-RevId: 200200356
---
 .../stream_executor/cuda/cuda_diagnostics.cc  | 90 +------------------
 .../stream_executor/cuda/cuda_gpu_executor.cc | 16 +---
 2 files changed, 5 insertions(+), 101 deletions(-)

diff --git a/tensorflow/stream_executor/cuda/cuda_diagnostics.cc b/tensorflow/stream_executor/cuda/cuda_diagnostics.cc
index 46e5deed84..10f6d21d54 100644
--- a/tensorflow/stream_executor/cuda/cuda_diagnostics.cc
+++ b/tensorflow/stream_executor/cuda/cuda_diagnostics.cc
@@ -24,17 +24,12 @@ limitations under the License.
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#ifdef __APPLE__
-#include <IOKit/kext/KextManager.h>
-#include <mach-o/dyld.h>
-#else
 #if !defined(PLATFORM_WINDOWS)
 #include <link.h>
 #include <sys/sysmacros.h>
 #include <unistd.h>
 #endif
 #include <sys/stat.h>
-#endif
 #include <algorithm>
 #include <memory>
 #include <vector>
@@ -54,9 +49,7 @@ limitations under the License.
 namespace stream_executor {
 namespace cuda {
 
-#ifdef __APPLE__
-static const CFStringRef kDriverKextIdentifier = CFSTR("com.nvidia.CUDA");
-#elif !defined(PLATFORM_WINDOWS)
+#if !defined(PLATFORM_WINDOWS)
 static const char *kDriverVersionPath = "/proc/driver/nvidia/version";
 #endif
 
@@ -121,26 +114,7 @@ string Diagnostician::GetDevNodePath(int dev_node_ordinal) {
 }
 
 void Diagnostician::LogDiagnosticInformation() {
-#ifdef __APPLE__
-  CFStringRef kext_ids[1];
-  kext_ids[0] = kDriverKextIdentifier;
-  CFArrayRef kext_id_query = CFArrayCreate(nullptr, (const void**)kext_ids, 1, &kCFTypeArrayCallBacks);
-  CFDictionaryRef kext_infos = KextManagerCopyLoadedKextInfo(kext_id_query, nullptr);
-  CFRelease(kext_id_query);
-
-  CFDictionaryRef cuda_driver_info = nullptr;
-  if (CFDictionaryGetValueIfPresent(kext_infos, kDriverKextIdentifier, (const void**)&cuda_driver_info)) {
-    bool started = CFBooleanGetValue((CFBooleanRef)CFDictionaryGetValue(cuda_driver_info, CFSTR("OSBundleStarted")));
-    if (!started) {
-      LOG(INFO) << "kernel driver is installed, but does not appear to be running on this host "
-                << "(" << port::Hostname() << ")";
-    }
-  } else {
-    LOG(INFO) << "kernel driver does not appear to be installed on this host "
-              << "(" << port::Hostname() << ")";
-  }
-  CFRelease(kext_infos);
-#elif !defined(PLATFORM_WINDOWS)
+#if !defined(PLATFORM_WINDOWS)
   if (access(kDriverVersionPath, F_OK) != 0) {
     LOG(INFO) << "kernel driver does not appear to be running on this host "
               << "(" << port::Hostname() << "): "
@@ -194,8 +168,7 @@ void Diagnostician::LogDiagnosticInformation() {
 	  << DriverVersionStatusToString(kernel_version);
 #endif
 
-  // OS X kernel driver does not report version accurately
-#if !defined(__APPLE__) && !defined(PLATFORM_WINDOWS)
+#if !defined(PLATFORM_WINDOWS)
   if (kernel_version.ok() && dso_version.ok()) {
     WarnOnDsoKernelMismatch(dso_version, kernel_version);
   }
@@ -209,29 +182,6 @@ port::StatusOr<DriverVersion> Diagnostician::FindDsoVersion() {
       port::error::NOT_FOUND,
       "was unable to find libcuda.so DSO loaded into this program"));
 
-#if defined(__APPLE__)
-    // OSX CUDA libraries have names like: libcuda_310.41.15_mercury.dylib
-    const string prefix("libcuda_");
-    const string suffix("_mercury.dylib");
-    for (uint32_t image_index = 0; image_index < _dyld_image_count(); ++image_index) {
-      const string path(_dyld_get_image_name(image_index));
-      const size_t suffix_pos = path.rfind(suffix);
-      const size_t prefix_pos = path.rfind(prefix, suffix_pos);
-      if (prefix_pos == string::npos ||
-          suffix_pos == string::npos) {
-        // no match
-        continue;
-      }
-      const size_t start = prefix_pos + prefix.size();
-      if (start >= suffix_pos) {
-        // version not included
-        continue;
-      }
-      const size_t length = suffix_pos - start;
-      const string version = path.substr(start, length);
-      result = StringToDriverVersion(version);
-    }
-#else
 #if !defined(PLATFORM_WINDOWS) && !defined(ANDROID_TEGRA)
   // Callback used when iterating through DSOs. Looks for the driver-interfacing
   // DSO and yields its version number into the callback data, when found.
@@ -264,7 +214,6 @@ port::StatusOr<DriverVersion> Diagnostician::FindDsoVersion() {
   };
 
   dl_iterate_phdr(iterate_phdr, &result);
-#endif
 #endif
 
   return result;
@@ -310,38 +259,7 @@ void Diagnostician::WarnOnDsoKernelMismatch(
 
 
 port::StatusOr<DriverVersion> Diagnostician::FindKernelDriverVersion() {
-#if defined(__APPLE__)
-  CFStringRef kext_ids[1];
-  kext_ids[0] = kDriverKextIdentifier;
-  CFArrayRef kext_id_query = CFArrayCreate(nullptr, (const void**)kext_ids, 1, &kCFTypeArrayCallBacks);
-  CFDictionaryRef kext_infos = KextManagerCopyLoadedKextInfo(kext_id_query, nullptr);
-  CFRelease(kext_id_query);
-
-  CFDictionaryRef cuda_driver_info = nullptr;
-  if (CFDictionaryGetValueIfPresent(kext_infos, kDriverKextIdentifier, (const void**)&cuda_driver_info)) {
-    // NOTE: OSX CUDA driver does not currently store the same driver version
-    // in kCFBundleVersionKey as is returned by cuDriverGetVersion
-    CFRelease(kext_infos);
-    const CFStringRef str = (CFStringRef)CFDictionaryGetValue(
-        cuda_driver_info, kCFBundleVersionKey);
-    const char *version = CFStringGetCStringPtr(str, kCFStringEncodingUTF8);
-
-    // version can be NULL in which case treat it as empty string
-    // see
-    // https://developer.apple.com/library/mac/documentation/CoreFoundation/Conceptual/CFStrings/Articles/AccessingContents.html#//apple_ref/doc/uid/20001184-100980-TPXREF112
-    if (version == NULL) {
-      return StringToDriverVersion("");
-    }
-    return StringToDriverVersion(version);
-  }
-  CFRelease(kext_infos);
-  auto status = port::Status(
-      port::error::INTERNAL,
-      port::StrCat(
-          "failed to read driver bundle version: ",
-          CFStringGetCStringPtr(kDriverKextIdentifier, kCFStringEncodingUTF8)));
-  return status;
-#elif defined(PLATFORM_WINDOWS)
+#if defined(PLATFORM_WINDOWS)
   auto status =
       port::Status(port::error::UNIMPLEMENTED,
                    "kernel reported driver version not implemented on Windows");
diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
index f2be68bc42..edf217875f 100644
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
@@ -15,9 +15,6 @@ limitations under the License.
 
 #include "tensorflow/stream_executor/cuda/cuda_gpu_executor.h"
 
-#if defined(__APPLE__)
-#include <mach-o/dyld.h>
-#endif
 #if defined(PLATFORM_WINDOWS)
 #include <windows.h>
 #define PATH_MAX MAX_PATH
@@ -179,19 +176,11 @@ bool CUDAExecutor::FindOnDiskForComputeCapability(
 //                 would return /usr/bin.
 static string GetBinaryDir(bool strip_exe) {
   char exe_path[PATH_MAX] = {0};
-#if defined(__APPLE__)
-    uint32_t buffer_size = 0U;
-    _NSGetExecutablePath(nullptr, &buffer_size);
-    char unresolved_path[buffer_size];
-    _NSGetExecutablePath(unresolved_path, &buffer_size);
-    CHECK_ERR(realpath(unresolved_path, exe_path) ? 1 : -1);
-#else
 #if defined(PLATFORM_WINDOWS)
   HMODULE hModule = GetModuleHandle(NULL);
   GetModuleFileName(hModule, exe_path, MAX_PATH);
 #else
   CHECK_ERR(readlink("/proc/self/exe", exe_path, sizeof(exe_path) - 1));
-#endif
 #endif
   // Make sure it's null-terminated:
   exe_path[sizeof(exe_path) - 1] = 0;
@@ -854,10 +843,7 @@ CudaContext* CUDAExecutor::cuda_context() { return context_; }
 // For anything more complicated/prod-focused than this, you'll likely want to
 // turn to gsys' topology modeling.
 static int TryToReadNumaNode(const string &pci_bus_id, int device_ordinal) {
-#if defined(__APPLE__)
-  LOG(INFO) << "OS X does not support NUMA - returning NUMA node zero";
-  return 0;
-#elif defined(PLATFORM_WINDOWS)
+#if defined(PLATFORM_WINDOWS)
   // Windows support for NUMA is not currently implemented. Return node 0.
   return 0;
 #elif defined(__aarch64__)
-- 
GitLab


From 507c48d876d716cec8e112f5062d2842a964206c Mon Sep 17 00:00:00 2001
From: Yun Peng <pcloudy@google.com>
Date: Tue, 12 Jun 2018 14:58:56 +0200
Subject: [PATCH 593/610] Disable tensorflow/python/estimator:keras_test on
 Windows (#19902)

* Disable tensorflow/python/estimator:keras_test on Windows
---
 tensorflow/python/estimator/BUILD | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tensorflow/python/estimator/BUILD b/tensorflow/python/estimator/BUILD
index c0d63b79a6..9e716e81f4 100644
--- a/tensorflow/python/estimator/BUILD
+++ b/tensorflow/python/estimator/BUILD
@@ -975,7 +975,10 @@ py_test(
     size = "large",
     srcs = ["keras_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["notsan"],
+    tags = [
+        "no_windows",
+        "notsan",
+    ],
     deps = [
         ":keras",
         "//tensorflow/core:protos_all_py",
-- 
GitLab


From c241e9bc57d1f3855d55d440ebbe4189fae6ea8b Mon Sep 17 00:00:00 2001
From: hsm207 <hsm207@users.noreply.github.com>
Date: Tue, 12 Jun 2018 21:05:08 +0800
Subject: [PATCH 594/610] Fix typo (#19923)

---
 .../eager/python/examples/notebooks/4_high_level.ipynb        | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/contrib/eager/python/examples/notebooks/4_high_level.ipynb b/tensorflow/contrib/eager/python/examples/notebooks/4_high_level.ipynb
index 4fe3a0e3f3..5749f22ac5 100644
--- a/tensorflow/contrib/eager/python/examples/notebooks/4_high_level.ipynb
+++ b/tensorflow/contrib/eager/python/examples/notebooks/4_high_level.ipynb
@@ -68,7 +68,7 @@
         "# simply construct the object. Most layers take as a first argument the number\n",
         "# of output dimensions / channels.\n",
         "layer = tf.keras.layers.Dense(100)\n",
-        "# The number of input dimensionss is often unnecessary, as it can be inferred\n",
+        "# The number of input dimensions is often unnecessary, as it can be inferred\n",
         "# the first time the layer is used, but it can be provided if you want to \n",
         "# specify it manually, which is useful in some complex models.\n",
         "layer = tf.keras.layers.Dense(10, input_shape=(None, 5))"
@@ -267,7 +267,7 @@
         "  * `build`, where you know the shapes of the input tensors and can do the rest of the initialization\n",
         "  * `call`, where you do the forward computation\n",
         "\n",
-        "Note that you don't have to wait until `build` is called to create your variables, you can also create them in `__init__`. However, the advantage of creating them in `build` is that it enables late variable creation based on the shape of the inputs the layer will operate on. On the other hand, creating variables in `__init__` would mean that shapes requires to create the variables will need to be explicitly specified."
+        "Note that you don't have to wait until `build` is called to create your variables, you can also create them in `__init__`. However, the advantage of creating them in `build` is that it enables late variable creation based on the shape of the inputs the layer will operate on. On the other hand, creating variables in `__init__` would mean that shapes required to create the variables will need to be explicitly specified."
       ]
     },
     {
-- 
GitLab


From 8e7ae1c8c78cebc7cc98cb99b3f8a3e8a415b5ff Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 12 Jun 2018 07:02:51 -0700
Subject: [PATCH 595/610] Automated g4 rollback of changelist 197218170

PiperOrigin-RevId: 200209039
---
 tensorflow/contrib/distribute/python/BUILD    |  20 +
 .../distribute/python/metrics_v1_test.py      | 438 ++++++++++++++++++
 .../distribute/python/mirrored_strategy.py    |   8 +
 .../distribute/python/one_device_strategy.py  |   4 +
 tensorflow/python/BUILD                       |   1 +
 tensorflow/python/framework/test_util.py      |   8 +-
 tensorflow/python/ops/metrics_impl.py         | 296 ++++++++----
 tensorflow/python/training/distribute.py      |  26 +-
 8 files changed, 697 insertions(+), 104 deletions(-)
 create mode 100644 tensorflow/contrib/distribute/python/metrics_v1_test.py

diff --git a/tensorflow/contrib/distribute/python/BUILD b/tensorflow/contrib/distribute/python/BUILD
index b572512bbb..9dfb8552f1 100644
--- a/tensorflow/contrib/distribute/python/BUILD
+++ b/tensorflow/contrib/distribute/python/BUILD
@@ -77,6 +77,7 @@ py_library(
         "//tensorflow/python:device_util",
         "//tensorflow/python:distribute",
         "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
         "//tensorflow/python:pywrap_tensorflow",
         "//tensorflow/python:training",
         "//tensorflow/python:variable_scope",
@@ -590,3 +591,22 @@ cuda_py_test(
         "notsan",
     ],
 )
+
+cuda_py_test(
+    name = "metrics_v1_test",
+    srcs = ["metrics_v1_test.py"],
+    additional_deps = [
+        ":combinations",
+        "@absl_py//absl/testing:parameterized",
+        "//tensorflow/contrib/data/python/ops:batching",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:metrics",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/eager:test",
+    ],
+    tags = [
+        "multi_and_single_gpu",
+        "no_pip",
+    ],
+)
diff --git a/tensorflow/contrib/distribute/python/metrics_v1_test.py b/tensorflow/contrib/distribute/python/metrics_v1_test.py
new file mode 100644
index 0000000000..6c6bf14309
--- /dev/null
+++ b/tensorflow/contrib/distribute/python/metrics_v1_test.py
@@ -0,0 +1,438 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for V1 metrics."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+
+from tensorflow.contrib.data.python.ops import batching
+from tensorflow.contrib.distribute.python import combinations
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.eager import test
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import metrics
+from tensorflow.python.ops import variables
+
+
+def _labeled_dataset_fn():
+  # First four batches of x: labels, predictions -> (labels == predictions)
+  #  0: 0, 0 -> True;   1: 1, 1 -> True;   2: 2, 2 -> True;   3: 3, 0 -> False
+  #  4: 4, 1 -> False;  5: 0, 2 -> False;  6: 1, 0 -> False;  7: 2, 1 -> False
+  #  8: 3, 2 -> False;  9: 4, 0 -> False; 10: 0, 1 -> False; 11: 1, 2 -> False
+  # 12: 2, 0 -> False; 13: 3, 1 -> False; 14: 4, 2 -> False; 15: 0, 0 -> True
+  return dataset_ops.Dataset.range(1000).map(
+      lambda x: {"labels": x % 5, "predictions": x % 3}).batch(4)
+
+
+def _boolean_dataset_fn():
+  # First four batches of labels, predictions: {TP, FP, TN, FN}
+  # (predictions are already boolean, so no threshold is involved):
+  #   T, T -> TP;  F, T -> FP;   T, F -> FN
+  #   F, F -> TN;  T, T -> TP;   F, T -> FP
+  #   T, F -> FN;  F, F -> TN;   T, T -> TP
+  #   F, T -> FP;  T, F -> FN;   F, F -> TN
+  return dataset_ops.Dataset.from_tensor_slices({
+      "labels": [True, False, True, False],
+      "predictions": [True, True, False, False]}).repeat().batch(3)
+
+
+def _threshold_dataset_fn():
+  # First four batches of labels, predictions: {TP, FP, TN, FN}
+  # with a threshold of 0.5:
+  #   True, 1.0 -> TP;  False, .75 -> FP;   True, .25 -> FN
+  #  False, 0.0 -> TN;   True, 1.0 -> TP;  False, .75 -> FP
+  #   True, .25 -> FN;  False, 0.0 -> TN;   True, 1.0 -> TP
+  #  False, .75 -> FP;   True, .25 -> FN;  False, 0.0 -> TN
+  return dataset_ops.Dataset.from_tensor_slices({
+      "labels": [True, False, True, False],
+      "predictions": [1.0, 0.75, 0.25, 0.]}).repeat().batch(3)
+
+
+def _regression_dataset_fn():
+  return dataset_ops.Dataset.from_tensor_slices({
+      "labels": [1., .5, 1., 0.],
+      "predictions": [1., .75, .25, 0.]}).repeat()
+
+
+def all_combinations():
+  return combinations.combine(
+      distribution=[combinations.default_strategy,
+                    combinations.one_device_strategy,
+                    combinations.mirrored_strategy_with_gpu_and_cpu,
+                    combinations.mirrored_strategy_with_two_gpus],
+      mode=["graph"])
+
+
+# TODO(josh11b): Test metrics.recall_at_top_k, metrics.average_precision_at_k,
+# metrics.precision_at_k
+class MetricsV1Test(test.TestCase, parameterized.TestCase):
+
+  def _test_metric(self, distribution, dataset_fn, metric_fn, expected_fn):
+    with ops.Graph().as_default(), distribution.scope():
+      iterator = distribution.distribute_dataset(
+          dataset_fn).make_one_shot_iterator()
+      value, update = distribution.call_for_each_tower(
+          metric_fn, iterator.get_next())
+      update = distribution.group(update)
+      self.evaluate(variables.local_variables_initializer())
+      # TODO(josh11b): Once we switch to using a global batch size for input,
+      # replace "distribution.num_towers" with "1".
+      batches_per_update = distribution.num_towers
+
+      # Update variables using the first `num_towers` batches.
+      self.evaluate(update)
+      self.assertAllClose(expected_fn(batches_per_update), self.evaluate(value),
+                          0.001, msg="After first update")
+
+      # Update variables using the second `num_towers` batches.
+      self.evaluate(update)
+      self.assertAllClose(expected_fn(2 * batches_per_update),
+                          self.evaluate(value),
+                          0.001,
+                          msg="After second update")
+
+      if batches_per_update == 1:  # Consume 4 input batches
+        self.evaluate(update)
+        self.assertAllClose(expected_fn(3 * batches_per_update),
+                            self.evaluate(value),
+                            0.001,
+                            msg="After third update")
+        self.evaluate(update)
+        self.assertAllClose(expected_fn(4 * batches_per_update),
+                            self.evaluate(value),
+                            0.001,
+                            msg="After fourth update")
+
+  @combinations.generate(all_combinations())
+  def testMean(self, distribution):
+    def _dataset_fn():
+      return dataset_ops.Dataset.range(1000).map(math_ops.to_float).batch(4)
+
+    def _expected_fn(num_batches):
+      # Mean(0..3) = 1.5, Mean(0..7) = 3.5, Mean(0..11) = 5.5, etc.
+      return num_batches * 2 - 0.5
+
+    self._test_metric(distribution, _dataset_fn, metrics.mean, _expected_fn)
+
+  @combinations.generate(all_combinations())
+  def testAccuracy(self, distribution):
+    def _metric_fn(x):
+      labels = x["labels"]
+      predictions = x["predictions"]
+      return metrics.accuracy(labels, predictions)
+
+    def _expected_fn(num_batches):
+      return [3./4, 3./8, 3./12, 4./16][num_batches - 1]
+
+    self._test_metric(
+        distribution, _labeled_dataset_fn, _metric_fn, _expected_fn)
+
+  @combinations.generate(all_combinations())
+  def testMeanPerClassAccuracy(self, distribution):
+    def _metric_fn(x):
+      labels = x["labels"]
+      predictions = x["predictions"]
+      return metrics.mean_per_class_accuracy(
+          labels, predictions, num_classes=5)
+
+    def _expected_fn(num_batches):
+      mean = lambda x: sum(x) / len(x)
+      return [mean([1., 1., 1., 0., 0.]),
+              mean([0.5, 0.5, 0.5, 0., 0.]),
+              mean([1./3, 1./3, 0.5, 0., 0.]),
+              mean([0.5, 1./3, 1./3, 0., 0.])][num_batches - 1]
+
+    self._test_metric(
+        distribution, _labeled_dataset_fn, _metric_fn, _expected_fn)
+
+  @combinations.generate(all_combinations())
+  def testMeanIOU(self, distribution):
+    def _metric_fn(x):
+      labels = x["labels"]
+      predictions = x["predictions"]
+      return metrics.mean_iou(
+          labels, predictions, num_classes=5)
+
+    def _expected_fn(num_batches):
+      mean = lambda x: sum(x) / len(x)
+      return [mean([1./2, 1./1, 1./1, 0.]),  # no class 4 in first batch
+              mean([1./4, 1./4, 1./3, 0., 0.]),
+              mean([1./6, 1./6, 1./5, 0., 0.]),
+              mean([2./8, 1./7, 1./7, 0., 0.])][num_batches - 1]
+
+    self._test_metric(
+        distribution, _labeled_dataset_fn, _metric_fn, _expected_fn)
+
+  @combinations.generate(all_combinations())
+  def testMeanTensor(self, distribution):
+    def _dataset_fn():
+      dataset = dataset_ops.Dataset.range(1000).map(math_ops.to_float)
+      # Want to produce a fixed, known shape, so drop remainder when batching.
+      dataset = dataset.apply(batching.batch_and_drop_remainder(4))
+      return dataset
+
+    def _expected_fn(num_batches):
+      # Mean(0, 4, ..., 4 * num_batches - 4) == 2 * num_batches - 2
+      # Mean(1, 5, ..., 4 * num_batches - 3) == 2 * num_batches - 1
+      # Mean(2, 6, ..., 4 * num_batches - 2) == 2 * num_batches
+      # Mean(3, 7, ..., 4 * num_batches - 1) == 2 * num_batches + 1
+      first = 2. * num_batches - 2.
+      return [first, first + 1., first + 2., first + 3.]
+
+    self._test_metric(
+        distribution, _dataset_fn, metrics.mean_tensor, _expected_fn)
+
+  @combinations.generate(all_combinations())
+  def testAUCROC(self, distribution):
+    def _metric_fn(x):
+      labels = x["labels"]
+      predictions = x["predictions"]
+      return metrics.auc(labels, predictions, num_thresholds=8, curve="ROC",
+                         summation_method="careful_interpolation")
+
+    def _expected_fn(num_batches):
+      return [0.5, 7./9, 0.8, 0.75][num_batches - 1]
+
+    self._test_metric(
+        distribution, _threshold_dataset_fn, _metric_fn, _expected_fn)
+
+  @combinations.generate(all_combinations())
+  def testAUCPR(self, distribution):
+    def _metric_fn(x):
+      labels = x["labels"]
+      predictions = x["predictions"]
+      return metrics.auc(labels, predictions, num_thresholds=8, curve="PR",
+                         summation_method="careful_interpolation")
+
+    def _expected_fn(num_batches):
+      return [0.797267, 0.851238, 0.865411, 0.797267][num_batches - 1]
+
+    self._test_metric(
+        distribution, _threshold_dataset_fn, _metric_fn, _expected_fn)
+
+  @combinations.generate(all_combinations())
+  def testFalseNegatives(self, distribution):
+    def _metric_fn(x):
+      labels = x["labels"]
+      predictions = x["predictions"]
+      return metrics.false_negatives(labels, predictions)
+
+    def _expected_fn(num_batches):
+      return [1., 1., 2., 3.][num_batches - 1]
+
+    self._test_metric(
+        distribution, _boolean_dataset_fn, _metric_fn, _expected_fn)
+
+  @combinations.generate(all_combinations())
+  def testFalseNegativesAtThresholds(self, distribution):
+    def _metric_fn(x):
+      labels = x["labels"]
+      predictions = x["predictions"]
+      return metrics.false_negatives_at_thresholds(labels, predictions, [.5])
+
+    def _expected_fn(num_batches):
+      return [[1.], [1.], [2.], [3.]][num_batches - 1]
+
+    self._test_metric(
+        distribution, _threshold_dataset_fn, _metric_fn, _expected_fn)
+
+  @combinations.generate(all_combinations())
+  def testTrueNegatives(self, distribution):
+    def _metric_fn(x):
+      labels = x["labels"]
+      predictions = x["predictions"]
+      return metrics.true_negatives(labels, predictions)
+
+    def _expected_fn(num_batches):
+      return [0., 1., 2., 3.][num_batches - 1]
+
+    self._test_metric(
+        distribution, _boolean_dataset_fn, _metric_fn, _expected_fn)
+
+  @combinations.generate(all_combinations())
+  def testTrueNegativesAtThresholds(self, distribution):
+    def _metric_fn(x):
+      labels = x["labels"]
+      predictions = x["predictions"]
+      return metrics.true_negatives_at_thresholds(labels, predictions, [.5])
+
+    def _expected_fn(num_batches):
+      return [[0.], [1.], [2.], [3.]][num_batches - 1]
+
+    self._test_metric(
+        distribution, _threshold_dataset_fn, _metric_fn, _expected_fn)
+
+  @combinations.generate(all_combinations())
+  def testFalsePositives(self, distribution):
+    def _metric_fn(x):
+      labels = x["labels"]
+      predictions = x["predictions"]
+      return metrics.false_positives(labels, predictions)
+
+    def _expected_fn(num_batches):
+      return [1., 2., 2., 3.][num_batches - 1]
+
+    self._test_metric(
+        distribution, _boolean_dataset_fn, _metric_fn, _expected_fn)
+
+  @combinations.generate(all_combinations())
+  def testFalsePositivesAtThresholds(self, distribution):
+    def _metric_fn(x):
+      labels = x["labels"]
+      predictions = x["predictions"]
+      return metrics.false_positives_at_thresholds(labels, predictions, [.5])
+
+    def _expected_fn(num_batches):
+      return [[1.], [2.], [2.], [3.]][num_batches - 1]
+
+    self._test_metric(
+        distribution, _threshold_dataset_fn, _metric_fn, _expected_fn)
+
+  @combinations.generate(all_combinations())
+  def testTruePositives(self, distribution):
+    def _metric_fn(x):
+      labels = x["labels"]
+      predictions = x["predictions"]
+      return metrics.true_positives(labels, predictions)
+
+    def _expected_fn(num_batches):
+      return [1., 2., 3., 3.][num_batches - 1]
+
+    self._test_metric(
+        distribution, _boolean_dataset_fn, _metric_fn, _expected_fn)
+
+  @combinations.generate(all_combinations())
+  def testTruePositivesAtThresholds(self, distribution):
+    def _metric_fn(x):
+      labels = x["labels"]
+      predictions = x["predictions"]
+      return metrics.true_positives_at_thresholds(labels, predictions, [.5])
+
+    def _expected_fn(num_batches):
+      return [[1.], [2.], [3.], [3.]][num_batches - 1]
+
+    self._test_metric(
+        distribution, _threshold_dataset_fn, _metric_fn, _expected_fn)
+
+  @combinations.generate(all_combinations())
+  def testPrecision(self, distribution):
+    def _metric_fn(x):
+      labels = x["labels"]
+      predictions = x["predictions"]
+      return metrics.precision(labels, predictions)
+
+    def _expected_fn(num_batches):
+      return [0.5, 0.5, 0.6, 0.5][num_batches - 1]
+
+    self._test_metric(
+        distribution, _boolean_dataset_fn, _metric_fn, _expected_fn)
+
+  @combinations.generate(all_combinations())
+  def testPrecisionAtThreshold(self, distribution):
+    def _metric_fn(x):
+      labels = x["labels"]
+      predictions = x["predictions"]
+      return metrics.precision_at_thresholds(labels, predictions, [0.5])
+
+    def _expected_fn(num_batches):
+      return [[0.5], [0.5], [0.6], [0.5]][num_batches - 1]
+
+    self._test_metric(
+        distribution, _threshold_dataset_fn, _metric_fn, _expected_fn)
+
+  @combinations.generate(all_combinations())
+  def testRecall(self, distribution):
+    def _metric_fn(x):
+      labels = x["labels"]
+      predictions = x["predictions"]
+      return metrics.recall(labels, predictions)
+
+    def _expected_fn(num_batches):
+      return [0.5, 2./3, 0.6, 0.5][num_batches - 1]
+
+    self._test_metric(
+        distribution, _boolean_dataset_fn, _metric_fn, _expected_fn)
+
+  @combinations.generate(all_combinations())
+  def testRecallAtThreshold(self, distribution):
+    def _metric_fn(x):
+      labels = x["labels"]
+      predictions = x["predictions"]
+      return metrics.recall_at_thresholds(labels, predictions, [0.5])
+
+    def _expected_fn(num_batches):
+      return [[0.5], [2./3], [0.6], [0.5]][num_batches - 1]
+
+    self._test_metric(
+        distribution, _threshold_dataset_fn, _metric_fn, _expected_fn)
+
+  @combinations.generate(all_combinations())
+  def testMeanSquaredError(self, distribution):
+    def _metric_fn(x):
+      labels = x["labels"]
+      predictions = x["predictions"]
+      return metrics.mean_squared_error(labels, predictions)
+
+    def _expected_fn(num_batches):
+      return [0., 1./32, 0.208333, 0.15625][num_batches - 1]
+
+    self._test_metric(
+        distribution, _regression_dataset_fn, _metric_fn, _expected_fn)
+
+  @combinations.generate(all_combinations())
+  def testRootMeanSquaredError(self, distribution):
+    def _metric_fn(x):
+      labels = x["labels"]
+      predictions = x["predictions"]
+      return metrics.root_mean_squared_error(labels, predictions)
+
+    def _expected_fn(num_batches):
+      return [0., 0.176777, 0.456435, 0.395285][num_batches - 1]
+
+    self._test_metric(
+        distribution, _regression_dataset_fn, _metric_fn, _expected_fn)
+
+  @combinations.generate(all_combinations())
+  def testSensitivityAtSpecificity(self, distribution):
+    def _metric_fn(x):
+      labels = x["labels"]
+      predictions = x["predictions"]
+      return metrics.sensitivity_at_specificity(labels, predictions, 0.8)
+
+    def _expected_fn(num_batches):
+      return [0.5, 2./3, 0.6, 0.5][num_batches - 1]
+
+    self._test_metric(
+        distribution, _threshold_dataset_fn, _metric_fn, _expected_fn)
+
+  @combinations.generate(all_combinations())
+  def testSpecificityAtSensitivity(self, distribution):
+    def _metric_fn(x):
+      labels = x["labels"]
+      predictions = x["predictions"]
+      return metrics.specificity_at_sensitivity(labels, predictions, 0.95)
+
+    def _expected_fn(num_batches):
+      return [0., 1./3, 0.5, 0.5][num_batches - 1]
+
+    self._test_metric(
+        distribution, _threshold_dataset_fn, _metric_fn, _expected_fn)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/distribute/python/mirrored_strategy.py b/tensorflow/contrib/distribute/python/mirrored_strategy.py
index cef0a2907b..403e47d94f 100644
--- a/tensorflow/contrib/distribute/python/mirrored_strategy.py
+++ b/tensorflow/contrib/distribute/python/mirrored_strategy.py
@@ -31,6 +31,7 @@ from tensorflow.python.eager import tape
 from tensorflow.python.framework import device as tf_device
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.training import coordinator
 from tensorflow.python.training import device_util
@@ -343,6 +344,13 @@ class MirroredStrategy(distribute_lib.DistributionStrategy):
                         **values.select_device_mirrored(d, kwargs))
     return values.regroup(updates, values.Mirrored)
 
+  def read_var(self, tower_local_var):
+    """Read the aggregate value of a tower-local variable."""
+    if isinstance(tower_local_var, values.TowerLocalVariable):
+      return math_ops.add_n(self.unwrap(tower_local_var))
+    assert isinstance(tower_local_var, values.Mirrored)
+    return tower_local_var.get()
+
   def _fetch(self, val, destination, fn):
     """Return a copy of `val` or `fn(val)` on `destination`."""
     if isinstance(val, values.TowerLocalVariable):
diff --git a/tensorflow/contrib/distribute/python/one_device_strategy.py b/tensorflow/contrib/distribute/python/one_device_strategy.py
index 09b6d4a515..6378af32bd 100644
--- a/tensorflow/contrib/distribute/python/one_device_strategy.py
+++ b/tensorflow/contrib/distribute/python/one_device_strategy.py
@@ -102,6 +102,10 @@ class OneDeviceStrategy(distribute_lib.DistributionStrategy):
     with ops.device(self._device), distribute_lib.UpdateContext(self._device):
       return fn(*args, **kwargs)
 
+  def read_var(self, tower_local_var):
+    """Read the aggregate value of a tower-local variable."""
+    return tower_local_var
+
   def _fetch(self, val, destination, fn):
     """Return a copy of `val` or `fn(val)` on `destination`."""
     with ops.device(self._device):
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index 86721cb856..a06b536f5b 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -2530,6 +2530,7 @@ py_library(
         ":check_ops",
         ":confusion_matrix",
         ":control_flow_ops",
+        ":distribute",
         ":framework",
         ":framework_for_generated_wrappers",
         ":math_ops",
diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py
index 4a6146e0a6..5582b14249 100644
--- a/tensorflow/python/framework/test_util.py
+++ b/tensorflow/python/framework/test_util.py
@@ -1242,11 +1242,11 @@ class TensorFlowTestCase(googletest.TestCase):
             b,
             rtol=rtol,
             atol=atol,
-            msg="Mismatched value: a%s is different from b%s." % (path_str,
-                                                                  path_str))
+            msg=("Mismatched value: a%s is different from b%s. %s" %
+                 (path_str, path_str, msg)))
       except TypeError as e:
-        msg = "Error: a%s has %s, but b%s has %s" % (path_str, type(a),
-                                                     path_str, type(b))
+        msg = ("Error: a%s has %s, but b%s has %s. %s" %
+               (path_str, type(a), path_str, type(b), msg))
         e.args = ((e.args[0] + " : " + msg,) + e.args[1:])
         raise
 
diff --git a/tensorflow/python/ops/metrics_impl.py b/tensorflow/python/ops/metrics_impl.py
index 47eea6ef6b..5eab12c41d 100644
--- a/tensorflow/python/ops/metrics_impl.py
+++ b/tensorflow/python/ops/metrics_impl.py
@@ -34,21 +34,54 @@ from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import weights_broadcast_ops
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training import distribute as distribute_lib
 from tensorflow.python.util.deprecation import deprecated
 from tensorflow.python.util.tf_export import tf_export
 
 
 def metric_variable(shape, dtype, validate_shape=True, name=None):
-  """Create variable in `GraphKeys.(LOCAL|METRIC_VARIABLES`) collections."""
-
-  return variable_scope.variable(
-      lambda: array_ops.zeros(shape, dtype),
-      trainable=False,
-      collections=[
-          ops.GraphKeys.LOCAL_VARIABLES, ops.GraphKeys.METRIC_VARIABLES
-      ],
-      validate_shape=validate_shape,
-      name=name)
+  """Create variable in `GraphKeys.(LOCAL|METRIC_VARIABLES)` collections.
+
+  If running in a `DistributionStrategy` context, the variable will be
+  "tower local". This means:
+
+  *   The returned object will be a container with separate variables
+      per replica/tower of the model.
+
+  *   When writing to the variable, e.g. using `assign_add` in a metric
+      update, the update will be applied to the variable local to the
+      replica/tower.
+
+  *   To get a metric's result value, we need to sum the variable values
+      across the replicas/towers before computing the final answer.
+      Furthermore, the final answer should be computed once instead of
+      in every replica/tower. Both of these are accomplished by
+      running the computation of the final result value inside
+      `tf.contrib.distribute.get_tower_context().merge_call(fn)`.
+      Inside the `merge_call()`, ops are only added to the graph once
+      and access to a tower-local variable in a computation returns
+      the sum across all replicas/towers.
+
+  Args:
+    shape: Shape of the created variable.
+    dtype: Type of the created variable.
+    validate_shape: (Optional) Whether shape validation is enabled for
+      the created variable.
+    name: (Optional) String name of the created variable.
+
+  Returns:
+    A (non-trainable) variable initialized to zero, or if inside a
+    `DistributionStrategy` scope a tower-local variable container.
+  """
+  with distribute_lib.get_tower_context().tower_local_var_scope('sum'):
+    # Note that "tower local" implies trainable=False.
+    return variable_scope.variable(
+        lambda: array_ops.zeros(shape, dtype),
+        collections=[
+            ops.GraphKeys.LOCAL_VARIABLES, ops.GraphKeys.METRIC_VARIABLES
+        ],
+        validate_shape=validate_shape,
+        name=name)
 
 
 def _remove_squeezable_dimensions(predictions, labels, weights):
@@ -333,11 +366,15 @@ def mean(values,
     with ops.control_dependencies([values]):
       update_count_op = state_ops.assign_add(count, num_values)
 
-    mean_t = _safe_div(total, count, 'value')
-    update_op = _safe_div(update_total_op, update_count_op, 'update_op')
+    def aggregate_across_towers(_, t, c):
+      mean_t = _safe_div(t, c, 'value')
+      if metrics_collections:
+        ops.add_to_collections(metrics_collections, mean_t)
+      return mean_t
 
-    if metrics_collections:
-      ops.add_to_collections(metrics_collections, mean_t)
+    mean_t = distribute_lib.get_tower_context().merge_call(
+        aggregate_across_towers, total, count)
+    update_op = _safe_div(update_total_op, update_count_op, 'update_op')
 
     if updates_collections:
       ops.add_to_collections(updates_collections, update_op)
@@ -572,6 +609,17 @@ def _confusion_matrix_at_thresholds(labels,
   return values, update_ops
 
 
+def _aggregate_variable(v, collections):
+
+  def f(distribution, value):
+    value = distribution.read_var(value)
+    if collections:
+      ops.add_to_collections(collections, value)
+    return value
+
+  return distribute_lib.get_tower_context().merge_call(f, v)
+
+
 @tf_export('metrics.auc')
 def auc(labels,
         predictions,
@@ -757,14 +805,18 @@ def auc(labels,
         raise ValueError('Invalid summation_method: %s' % summation_method)
 
     # sum up the areas of all the trapeziums
-    auc_value = compute_auc(values['tp'], values['fn'], values['tn'],
-                            values['fp'], 'value')
+    def aggregate_auc(_, values):
+      auc_value = compute_auc(values['tp'], values['fn'], values['tn'],
+                              values['fp'], 'value')
+      if metrics_collections:
+        ops.add_to_collections(metrics_collections, auc_value)
+      return auc_value
+
+    auc_value = distribute_lib.get_tower_context().merge_call(
+        aggregate_auc, values)
     update_op = compute_auc(update_ops['tp'], update_ops['fn'],
                             update_ops['tn'], update_ops['fp'], 'update_op')
 
-    if metrics_collections:
-      ops.add_to_collections(metrics_collections, auc_value)
-
     if updates_collections:
       ops.add_to_collections(updates_collections, update_op)
 
@@ -992,15 +1044,18 @@ def mean_per_class_accuracy(labels,
     update_total_op = state_ops.scatter_add(total, labels, ones)
     update_count_op = state_ops.scatter_add(count, labels, is_correct)
 
-    per_class_accuracy = _safe_div(count, total, None)
+    def aggregate_mean_accuracy(_, count, total):
+      per_class_accuracy = _safe_div(count, total, None)
+      mean_accuracy_v = math_ops.reduce_mean(
+          per_class_accuracy, name='mean_accuracy')
+      if metrics_collections:
+        ops.add_to_collections(metrics_collections, mean_accuracy_v)
+      return mean_accuracy_v
 
-    mean_accuracy_v = math_ops.reduce_mean(
-        per_class_accuracy, name='mean_accuracy')
-    update_op = _safe_div(update_count_op, update_total_op, name='update_op')
-
-    if metrics_collections:
-      ops.add_to_collections(metrics_collections, mean_accuracy_v)
+    mean_accuracy_v = distribute_lib.get_tower_context().merge_call(
+        aggregate_mean_accuracy, count, total)
 
+    update_op = _safe_div(update_count_op, update_total_op, name='update_op')
     if updates_collections:
       ops.add_to_collections(updates_collections, update_op)
 
@@ -1071,7 +1126,7 @@ def mean_iou(labels,
     total_cm, update_op = _streaming_confusion_matrix(labels, predictions,
                                                       num_classes, weights)
 
-    def compute_mean_iou(name):
+    def compute_mean_iou(total_cm, name):
       """Compute the mean intersection-over-union via the confusion matrix."""
       sum_over_row = math_ops.to_float(math_ops.reduce_sum(total_cm, 0))
       sum_over_col = math_ops.to_float(math_ops.reduce_sum(total_cm, 1))
@@ -1098,10 +1153,14 @@ def mean_iou(labels,
           math_ops.reduce_sum(iou, name=name) / num_valid_entries, 0)
       return result
 
-    mean_iou_v = compute_mean_iou('mean_iou')
+    def mean_iou_across_towers(_, v):
+      mean_iou_v = compute_mean_iou(v, 'mean_iou')
+      if metrics_collections:
+        ops.add_to_collections(metrics_collections, mean_iou_v)
+      return mean_iou_v
 
-    if metrics_collections:
-      ops.add_to_collections(metrics_collections, mean_iou_v)
+    mean_iou_v = distribute_lib.get_tower_context().merge_call(
+        mean_iou_across_towers, total_cm)
 
     if updates_collections:
       ops.add_to_collections(updates_collections, update_op)
@@ -1310,12 +1369,16 @@ def mean_tensor(values,
     with ops.control_dependencies([values]):
       update_count_op = state_ops.assign_add(count, num_values)
 
-    mean_t = _safe_div(total, count, 'value')
-    update_op = _safe_div(update_total_op, update_count_op, 'update_op')
+    def aggregate_across_towers(_, t, c):
+      mean_t = _safe_div(t, c, 'value')
+      if metrics_collections:
+        ops.add_to_collections(metrics_collections, mean_t)
+      return mean_t
 
-    if metrics_collections:
-      ops.add_to_collections(metrics_collections, mean_t)
+    mean_t = distribute_lib.get_tower_context().merge_call(
+        aggregate_across_towers, total, count)
 
+    update_op = _safe_div(update_total_op, update_count_op, 'update_op')
     if updates_collections:
       ops.add_to_collections(updates_collections, update_op)
 
@@ -1413,12 +1476,9 @@ def _count_condition(values,
       weights = math_ops.to_float(weights)
       values = math_ops.multiply(values, weights)
 
-  value_tensor = array_ops.identity(count)
-  update_op = state_ops.assign_add(count, math_ops.reduce_sum(values))
-
-  if metrics_collections:
-    ops.add_to_collections(metrics_collections, value_tensor)
+  value_tensor = _aggregate_variable(count, metrics_collections)
 
+  update_op = state_ops.assign_add(count, math_ops.reduce_sum(values))
   if updates_collections:
     ops.add_to_collections(updates_collections, update_op)
 
@@ -1525,13 +1585,12 @@ def false_negatives_at_thresholds(labels,
     values, update_ops = _confusion_matrix_at_thresholds(
         labels, predictions, thresholds, weights=weights, includes=('fn',))
 
-    if metrics_collections:
-      ops.add_to_collections(metrics_collections, values['fn'])
+    fn_value = _aggregate_variable(values['fn'], metrics_collections)
 
     if updates_collections:
       ops.add_to_collections(updates_collections, update_ops['fn'])
 
-    return values['fn'], update_ops['fn']
+    return fn_value, update_ops['fn']
 
 
 @tf_export('metrics.false_positives')
@@ -1635,13 +1694,12 @@ def false_positives_at_thresholds(labels,
     values, update_ops = _confusion_matrix_at_thresholds(
         labels, predictions, thresholds, weights=weights, includes=('fp',))
 
-    if metrics_collections:
-      ops.add_to_collections(metrics_collections, values['fp'])
+    fp_value = _aggregate_variable(values['fp'], metrics_collections)
 
     if updates_collections:
       ops.add_to_collections(updates_collections, update_ops['fp'])
 
-    return values['fp'], update_ops['fp']
+    return fp_value, update_ops['fp']
 
 
 @tf_export('metrics.true_negatives')
@@ -1745,13 +1803,12 @@ def true_negatives_at_thresholds(labels,
     values, update_ops = _confusion_matrix_at_thresholds(
         labels, predictions, thresholds, weights=weights, includes=('tn',))
 
-    if metrics_collections:
-      ops.add_to_collections(metrics_collections, values['tn'])
+    tn_value = _aggregate_variable(values['tn'], metrics_collections)
 
     if updates_collections:
       ops.add_to_collections(updates_collections, update_ops['tn'])
 
-    return values['tn'], update_ops['tn']
+    return tn_value, update_ops['tn']
 
 
 @tf_export('metrics.true_positives')
@@ -1855,13 +1912,12 @@ def true_positives_at_thresholds(labels,
     values, update_ops = _confusion_matrix_at_thresholds(
         labels, predictions, thresholds, weights=weights, includes=('tp',))
 
-    if metrics_collections:
-      ops.add_to_collections(metrics_collections, values['tp'])
+    tp_value = _aggregate_variable(values['tp'], metrics_collections)
 
     if updates_collections:
       ops.add_to_collections(updates_collections, update_ops['tp'])
 
-    return values['tp'], update_ops['tp']
+    return tp_value, update_ops['tp']
 
 
 @tf_export('metrics.precision')
@@ -1945,13 +2001,17 @@ def precision(labels,
       return array_ops.where(
           math_ops.greater(tp + fp, 0), math_ops.div(tp, tp + fp), 0, name)
 
-    p = compute_precision(true_p, false_p, 'value')
-    update_op = compute_precision(true_positives_update_op,
-                                  false_positives_update_op, 'update_op')
+    def once_across_towers(_, true_p, false_p):
+      p = compute_precision(true_p, false_p, 'value')
+      if metrics_collections:
+        ops.add_to_collections(metrics_collections, p)
+      return p
 
-    if metrics_collections:
-      ops.add_to_collections(metrics_collections, p)
+    p = distribute_lib.get_tower_context().merge_call(
+        once_across_towers, true_p, false_p)
 
+    update_op = compute_precision(true_positives_update_op,
+                                  false_positives_update_op, 'update_op')
     if updates_collections:
       ops.add_to_collections(updates_collections, update_op)
 
@@ -2025,13 +2085,17 @@ def precision_at_thresholds(labels,
     def compute_precision(tp, fp, name):
       return math_ops.div(tp, epsilon + tp + fp, name='precision_' + name)
 
-    prec = compute_precision(values['tp'], values['fp'], 'value')
-    update_op = compute_precision(update_ops['tp'], update_ops['fp'],
-                                  'update_op')
+    def precision_across_towers(_, values):
+      prec = compute_precision(values['tp'], values['fp'], 'value')
+      if metrics_collections:
+        ops.add_to_collections(metrics_collections, prec)
+      return prec
 
-    if metrics_collections:
-      ops.add_to_collections(metrics_collections, prec)
+    prec = distribute_lib.get_tower_context().merge_call(
+        precision_across_towers, values)
 
+    update_op = compute_precision(update_ops['tp'], update_ops['fp'],
+                                  'update_op')
     if updates_collections:
       ops.add_to_collections(updates_collections, update_op)
 
@@ -2050,7 +2114,7 @@ def recall(labels,
   The `recall` function creates two local variables, `true_positives`
   and `false_negatives`, that are used to compute the recall. This value is
   ultimately returned as `recall`, an idempotent operation that simply divides
-  `true_positives` by the sum of `true_positives`  and `false_negatives`.
+  `true_positives` by the sum of `true_positives` and `false_negatives`.
 
   For estimation of the metric over a stream of data, the function creates an
   `update_op` that updates these variables and returns the `recall`. `update_op`
@@ -2117,13 +2181,17 @@ def recall(labels,
           math_ops.greater(true_p + false_n, 0),
           math_ops.div(true_p, true_p + false_n), 0, name)
 
-    rec = compute_recall(true_p, false_n, 'value')
-    update_op = compute_recall(true_positives_update_op,
-                               false_negatives_update_op, 'update_op')
+    def once_across_towers(_, true_p, false_n):
+      rec = compute_recall(true_p, false_n, 'value')
+      if metrics_collections:
+        ops.add_to_collections(metrics_collections, rec)
+      return rec
 
-    if metrics_collections:
-      ops.add_to_collections(metrics_collections, rec)
+    rec = distribute_lib.get_tower_context().merge_call(
+        once_across_towers, true_p, false_n)
 
+    update_op = compute_recall(true_positives_update_op,
+                               false_negatives_update_op, 'update_op')
     if updates_collections:
       ops.add_to_collections(updates_collections, update_op)
 
@@ -2552,11 +2620,17 @@ def recall_at_top_k(labels,
         class_id=class_id,
         weights=weights)
 
-    metric = math_ops.div(tp, math_ops.add(tp, fn), name=scope)
+    def aggregate_across_towers(_, tp, fn):
+      metric = math_ops.div(tp, math_ops.add(tp, fn), name=scope)
+      if metrics_collections:
+        ops.add_to_collections(metrics_collections, metric)
+      return metric
+
+    metric = distribute_lib.get_tower_context().merge_call(
+        aggregate_across_towers, tp, fn)
+
     update = math_ops.div(
         tp_update, math_ops.add(tp_update, fn_update), name='update')
-    if metrics_collections:
-      ops.add_to_collections(metrics_collections, metric)
     if updates_collections:
       ops.add_to_collections(updates_collections, update)
     return metric, update
@@ -2627,12 +2701,16 @@ def recall_at_thresholds(labels,
     def compute_recall(tp, fn, name):
       return math_ops.div(tp, epsilon + tp + fn, name='recall_' + name)
 
-    rec = compute_recall(values['tp'], values['fn'], 'value')
-    update_op = compute_recall(update_ops['tp'], update_ops['fn'], 'update_op')
+    def recall_across_towers(_, values):
+      rec = compute_recall(values['tp'], values['fn'], 'value')
+      if metrics_collections:
+        ops.add_to_collections(metrics_collections, rec)
+      return rec
 
-    if metrics_collections:
-      ops.add_to_collections(metrics_collections, rec)
+    rec = distribute_lib.get_tower_context().merge_call(
+        recall_across_towers, values)
 
+    update_op = compute_recall(update_ops['tp'], update_ops['fn'], 'update_op')
     if updates_collections:
       ops.add_to_collections(updates_collections, update_op)
 
@@ -2698,13 +2776,16 @@ def root_mean_squared_error(labels,
   mse, update_mse_op = mean_squared_error(labels, predictions, weights, None,
                                           None, name or
                                           'root_mean_squared_error')
+  def once_across_towers(_, mse):
+    rmse = math_ops.sqrt(mse)
+    if metrics_collections:
+      ops.add_to_collections(metrics_collections, rmse)
+    return rmse
 
-  rmse = math_ops.sqrt(mse)
-  update_rmse_op = math_ops.sqrt(update_mse_op)
-
-  if metrics_collections:
-    ops.add_to_collections(metrics_collections, rmse)
+  rmse = distribute_lib.get_tower_context().merge_call(
+      once_across_towers, mse)
 
+  update_rmse_op = math_ops.sqrt(update_mse_op)
   if updates_collections:
     ops.add_to_collections(updates_collections, update_rmse_op)
 
@@ -2797,15 +2878,19 @@ def sensitivity_at_specificity(labels,
       return math_ops.div(tp[tf_index], tp[tf_index] + fn[tf_index] + kepsilon,
                           name)
 
-    sensitivity = compute_sensitivity_at_specificity(
-        values['tp'], values['tn'], values['fp'], values['fn'], 'value')
+    def aggregate_across_towers(_, values):
+      sensitivity = compute_sensitivity_at_specificity(
+          values['tp'], values['tn'], values['fp'], values['fn'], 'value')
+      if metrics_collections:
+        ops.add_to_collections(metrics_collections, sensitivity)
+      return sensitivity
+
+    sensitivity = distribute_lib.get_tower_context().merge_call(
+        aggregate_across_towers, values)
+
     update_op = compute_sensitivity_at_specificity(
         update_ops['tp'], update_ops['tn'], update_ops['fp'], update_ops['fn'],
         'update_op')
-
-    if metrics_collections:
-      ops.add_to_collections(metrics_collections, sensitivity)
-
     if updates_collections:
       ops.add_to_collections(updates_collections, update_op)
 
@@ -3070,11 +3155,16 @@ def _streaming_sparse_average_precision_at_top_k(labels,
       total_update = state_ops.assign_add(total_var, batch_total, name='update')
 
     # Divide total by max to get mean, for both vars and the update ops.
-    mean_average_precision = _safe_scalar_div(total_var, max_var, name='mean')
-    update = _safe_scalar_div(total_update, max_update, name=scope)
+    def aggregate_across_towers(_, total_var, max_var):
+      mean_average_precision = _safe_scalar_div(total_var, max_var, name='mean')
+      if metrics_collections:
+        ops.add_to_collections(metrics_collections, mean_average_precision)
+      return mean_average_precision
 
-    if metrics_collections:
-      ops.add_to_collections(metrics_collections, mean_average_precision)
+    mean_average_precision = distribute_lib.get_tower_context().merge_call(
+        aggregate_across_towers, total_var, max_var)
+
+    update = _safe_scalar_div(total_update, max_update, name=scope)
     if updates_collections:
       ops.add_to_collections(updates_collections, update)
 
@@ -3351,11 +3441,17 @@ def precision_at_top_k(labels,
         class_id=class_id,
         weights=weights)
 
-    metric = math_ops.div(tp, math_ops.add(tp, fp), name=scope)
+    def aggregate_across_towers(_, tp, fp):
+      metric = math_ops.div(tp, math_ops.add(tp, fp), name=scope)
+      if metrics_collections:
+        ops.add_to_collections(metrics_collections, metric)
+      return metric
+
+    metric = distribute_lib.get_tower_context().merge_call(
+        aggregate_across_towers, tp, fp)
+
     update = math_ops.div(
         tp_update, math_ops.add(tp_update, fp_update), name='update')
-    if metrics_collections:
-      ops.add_to_collections(metrics_collections, metric)
     if updates_collections:
       ops.add_to_collections(updates_collections, update)
     return metric, update
@@ -3583,15 +3679,19 @@ def specificity_at_sensitivity(labels,
       return math_ops.div(tn[tf_index], tn[tf_index] + fp[tf_index] + kepsilon,
                           name)
 
-    specificity = compute_specificity_at_sensitivity(
-        values['tp'], values['tn'], values['fp'], values['fn'], 'value')
+    def aggregate_across_towers(_, values):
+      specificity = compute_specificity_at_sensitivity(
+          values['tp'], values['tn'], values['fp'], values['fn'], 'value')
+      if metrics_collections:
+        ops.add_to_collections(metrics_collections, specificity)
+      return specificity
+
+    specificity = distribute_lib.get_tower_context().merge_call(
+        aggregate_across_towers, values)
+
     update_op = compute_specificity_at_sensitivity(
         update_ops['tp'], update_ops['tn'], update_ops['fp'], update_ops['fn'],
         'update_op')
-
-    if metrics_collections:
-      ops.add_to_collections(metrics_collections, specificity)
-
     if updates_collections:
       ops.add_to_collections(updates_collections, update_op)
 
diff --git a/tensorflow/python/training/distribute.py b/tensorflow/python/training/distribute.py
index 7cd175f25b..29198e48fa 100644
--- a/tensorflow/python/training/distribute.py
+++ b/tensorflow/python/training/distribute.py
@@ -528,6 +528,8 @@ class DistributionStrategy(object):
   * `d.update_non_slot(d.non_slot_devices(), fn)`: in cross-tower
     context, like `d.update()` except with locality N.
   * `d.fetch(t)`: Copy `t` with any locality to the client's CPU device.
+    TODO(josh11b): Deprecate `fetch`, switch to `read_var` for
+    reading tower-local variables.
 
   The standard pattern for updating variables is to:
 
@@ -614,8 +616,8 @@ class DistributionStrategy(object):
 
     There will still be one component variable per tower, but there is
     no requirement that they stay in sync. Instead, when saving them
-    or calling `fetch()`, we use the value that results when calling
-    `reduce()` on all the towers' variables.
+    or calling `fetch()/read_var()`, we use the value that
+    results when calling `reduce()` on all the towers' variables.
 
     Note: tower-local implies not trainable. Instead, it is expected
     that each tower will directly update (using `assign_add()` or
@@ -646,6 +648,21 @@ class DistributionStrategy(object):
     _require_distribution_strategy_scope(self)
     return variable_scope.variable_creator_scope(create_tower_local_variable)
 
+  def read_var(self, v):
+    """Reads the value of a variable.
+
+    Returns the aggregate value of a tower-local variable, or the
+    (possibly read-only) value of any other variable.
+
+    Args:
+      v: A variable allocated within the scope of this `DistributionStrategy`.
+
+    Returns:
+      A tensor representing the value of `v`, aggregated across towers if
+      necessary.
+    """
+    raise NotImplementedError("must be implemented in descendants")
+
   def colocate_vars_with(self, colocate_with_variable):
     """Scope that controls which devices variables will be created on.
 
@@ -904,6 +921,8 @@ class DistributionStrategy(object):
     will attempt to avoid a copy by checking if the value is already
     on the destination device.
 
+    TODO(josh11b): Switch to `read_var`.
+
     Args:
       val: Value (which may be mirrored) to copy.
       destination: A device string to copy the value to.
@@ -1197,6 +1216,9 @@ class _DefaultDistributionStrategy(DistributionStrategy):
     with ops.colocate_with(colocate_with), UpdateContext(colocate_with):
       return fn(*args, **kwargs)
 
+  def read_var(self, tower_local_var):
+    return tower_local_var
+
   def _fetch(self, var, destination, fn):
     with ops.colocate_with(var):
       var = fn(var)
-- 
GitLab


From 5fa7b03a255d3c0d05aa48e7604a94185ef6b9e2 Mon Sep 17 00:00:00 2001
From: Karl Lessard <karllessard@users.noreply.github.com>
Date: Tue, 12 Jun 2018 10:29:09 -0400
Subject: [PATCH 596/610] Replace @Generated annotation by notice (#19941)

---
 tensorflow/java/src/gen/cc/op_generator.cc | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/tensorflow/java/src/gen/cc/op_generator.cc b/tensorflow/java/src/gen/cc/op_generator.cc
index debd95fc62..9b171f66ec 100644
--- a/tensorflow/java/src/gen/cc/op_generator.cc
+++ b/tensorflow/java/src/gen/cc/op_generator.cc
@@ -376,9 +376,6 @@ void GenerateOp(const OpSpec& op, const EndpointSpec& endpoint,
     }
   }
   // op annotations
-  op_class.add_annotation(
-      Annotation::Create("Generated", "javax.annotation")
-          .attributes("value = \"TensorFlow Java Op Generator\""));
   if (endpoint.deprecated()) {
     op_class.add_annotation(Annotation::Create("Deprecated"));
     string explanation;
@@ -415,8 +412,12 @@ void GenerateOp(const OpSpec& op, const EndpointSpec& endpoint,
   SourceFileWriter writer(op_file.get());
   std::list<Type> dependencies;
   CollectOpDependencies(op, mode, &dependencies);
-  writer.Write(kLicense).EndLine().BeginType(op_class, PUBLIC | FINAL,
-                                             &dependencies, &op_javadoc);
+  writer.Write(kLicense)
+      .EndLine()
+      .Write("// This class has been generated, DO NOT EDIT!")
+      .EndLine()
+      .EndLine()
+      .BeginType(op_class, PUBLIC | FINAL, &dependencies, &op_javadoc);
   if (!op.optional_attributes().empty()) {
     RenderOptionsClass(op, op_class, &writer);
   }
-- 
GitLab


From 15ee5980a5873fd4c975d835e813b9377cb79f7d Mon Sep 17 00:00:00 2001
From: Asim Shankar <ashankar@google.com>
Date: Tue, 12 Jun 2018 07:42:40 -0700
Subject: [PATCH 597/610] [Documentation]: Fix #19657

PiperOrigin-RevId: 200213440
---
 tensorflow/python/data/ops/dataset_ops.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/tensorflow/python/data/ops/dataset_ops.py b/tensorflow/python/data/ops/dataset_ops.py
index 597f92048e..7c1e9dd754 100644
--- a/tensorflow/python/data/ops/dataset_ops.py
+++ b/tensorflow/python/data/ops/dataset_ops.py
@@ -223,6 +223,13 @@ class Dataset(object):
   def from_tensors(tensors):
     """Creates a `Dataset` with a single element, comprising the given tensors.
 
+    Note that if `tensors` contains a NumPy array, and eager execution is not
+    enabled, the values will be embedded in the graph as one or more
+    @{tf.constant} operations. For large datasets (> 1 GB), this can waste
+    memory and run into byte limits of graph serialization.  If tensors contains
+    one or more large NumPy arrays, consider the alternative described in
+    @{$programmers_guide/datasets#consuming_numpy_arrays$this guide}.
+
     Args:
       tensors: A nested structure of tensors.
 
@@ -235,6 +242,13 @@ class Dataset(object):
   def from_tensor_slices(tensors):
     """Creates a `Dataset` whose elements are slices of the given tensors.
 
+    Note that if `tensors` contains a NumPy array, and eager execution is not
+    enabled, the values will be embedded in the graph as one or more
+    @{tf.constant} operations. For large datasets (> 1 GB), this can waste
+    memory and run into byte limits of graph serialization.  If tensors contains
+    one or more large NumPy arrays, consider the alternative described in
+    @{$programmers_guide/datasets#consuming_numpy_arrays$this guide}.
+
     Args:
       tensors: A nested structure of tensors, each having the same size in the
         0th dimension.
-- 
GitLab


From e1dba885dd8640012ddb3d04bead1c20bcff62b3 Mon Sep 17 00:00:00 2001
From: Shashi Shekhar <shashishekhar@google.com>
Date: Tue, 12 Jun 2018 08:30:35 -0700
Subject: [PATCH 598/610] Fix copts for stats_calculator.

PiperOrigin-RevId: 200219133
---
 tensorflow/core/BUILD | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index f17f39099a..6065ac53a0 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -876,6 +876,7 @@ cc_library(
     hdrs = [
         "util/stats_calculator.h",
     ],
+    copts = tf_copts(),
 )
 
 cc_library(
-- 
GitLab


From df1f2a0964faf66677c30cf56526b568d355597f Mon Sep 17 00:00:00 2001
From: Derek Murray <mrry@google.com>
Date: Tue, 12 Jun 2018 08:30:45 -0700
Subject: [PATCH 599/610] [tf.data] Remove obsolete StatsAggregator code from
 IteratorResource.

PiperOrigin-RevId: 200219155
---
 tensorflow/core/kernels/data/iterator_ops.cc | 13 -------------
 1 file changed, 13 deletions(-)

diff --git a/tensorflow/core/kernels/data/iterator_ops.cc b/tensorflow/core/kernels/data/iterator_ops.cc
index d71cac4ebc..f33e9cec29 100644
--- a/tensorflow/core/kernels/data/iterator_ops.cc
+++ b/tensorflow/core/kernels/data/iterator_ops.cc
@@ -207,12 +207,6 @@ class IteratorResource : public ResourceBase {
     return Status::OK();
   }
 
-
-  std::shared_ptr<StatsAggregator> stats_aggregator() {
-    tf_shared_lock l(mu_);
-    return stats_aggregator_;
-  }
-
   string DebugString() override { return "Iterator resource"; }
 
   const DataTypeVector& output_dtypes() const { return output_dtypes_; }
@@ -231,7 +225,6 @@ class IteratorResource : public ResourceBase {
   FunctionLibraryRuntime* lib_ = nullptr;  // not owned.
   std::shared_ptr<IteratorBase> iterator_;
   mutex mu_;
-  std::shared_ptr<StatsAggregator> stats_aggregator_ GUARDED_BY(mu_);
   std::shared_ptr<const FunctionLibraryDefinition> lib_def_ GUARDED_BY(mu_);
   const DataTypeVector output_dtypes_;
   const std::vector<PartialTensorShape> output_shapes_;
@@ -944,9 +937,6 @@ class IteratorGetNextOp : public AsyncOpKernel {
 
           IteratorContext::Params params;
           params.env = ctx->env();
-          params.stats_aggregator_getter = [iterator]() {
-            return iterator->stats_aggregator();
-          };
           params.runner = *(ctx->runner());
           params.function_library = iterator->function_library();
           DeviceBase* device = ctx->function_library()->device();
@@ -995,9 +985,6 @@ class IteratorGetNextSyncOp : public OpKernel {
 
     IteratorContext::Params params;
     params.env = ctx->env();
-    params.stats_aggregator_getter = [iterator]() {
-      return iterator->stats_aggregator();
-    };
     params.runner = *(ctx->runner());
     params.function_library = iterator->function_library();
     DeviceBase* device = ctx->function_library()->device();
-- 
GitLab


From deb845fc79bcfe4d534a7050cc8e342f86db9dd0 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 12 Jun 2018 08:42:53 -0700
Subject: [PATCH 600/610] Added optional argument to specify time step to
 contrib.integrate.odeint_fixed.

PiperOrigin-RevId: 200220800
---
 .../contrib/integrate/python/ops/odes.py      | 126 +++++++++++++++---
 .../contrib/integrate/python/ops/odes_test.py |  51 +++++--
 2 files changed, 147 insertions(+), 30 deletions(-)

diff --git a/tensorflow/contrib/integrate/python/ops/odes.py b/tensorflow/contrib/integrate/python/ops/odes.py
index b4a99867ed..61f78febfc 100644
--- a/tensorflow/contrib/integrate/python/ops/odes.py
+++ b/tensorflow/contrib/integrate/python/ops/odes.py
@@ -28,7 +28,6 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import tensor_array_ops
 
@@ -279,13 +278,27 @@ def _assert_increasing(t):
   return ops.control_dependencies([assert_increasing])
 
 
-def _check_input_types(t, y0):
+def _check_input_types(y0, t, dt=None):
   if not (y0.dtype.is_floating or y0.dtype.is_complex):
     raise TypeError('`y0` must have a floating point or complex floating '
                     'point dtype')
   if not t.dtype.is_floating:
     raise TypeError('`t` must have a floating point dtype')
 
+  if dt is not None and not dt.dtype.is_floating:
+    raise TypeError('`dt` must have a floating point dtype')
+
+
+def _check_input_sizes(t, dt):
+  if len(t.get_shape().as_list()) > 1:
+    raise ValueError('t must be a 1D tensor')
+  # `dt` gets the same static rank check as `t`; name it in its own error.
+  if len(dt.get_shape().as_list()) > 1:
+    raise ValueError('dt must be a 1D tensor')
+  # One step size per interval between consecutive time points.
+  if t.get_shape()[0] != dt.get_shape()[0] + 1:
+    raise ValueError('t and dt have incompatible lengths, must be N and N-1')
+
 
 def _dopri5(func,
             y0,
@@ -510,7 +523,7 @@ def odeint(func,
     # avoiding the need to pack/unpack in user functions.
     y0 = ops.convert_to_tensor(y0, name='y0')
     t = ops.convert_to_tensor(t, preferred_dtype=dtypes.float64, name='t')
-    _check_input_types(t, y0)
+    _check_input_types(y0, t)
 
     error_dtype = abs(y0).dtype
     rtol = ops.convert_to_tensor(rtol, dtype=error_dtype, name='rtol')
@@ -530,24 +543,74 @@ def odeint(func,
 class _FixedGridIntegrator(six.with_metaclass(abc.ABCMeta)):
   """Base class for fixed-grid ODE integrators."""
 
-  def integrate(self, evol_func, y0, time_grid):
-    time_delta_grid = time_grid[1:] - time_grid[:-1]
-
-    scan_func = self._make_scan_func(evol_func)
+  def integrate(self, evol_func, y0, time_grid, dt_grid, steps_on_intervals):
+    """Returns integrated values of differential equation on the `time grid`.
+
+    Numerically integrates differential equation defined via time derivative
+    evaluator `evol_func` using fixed time steps specified in dt_grid.
+
+    Args:
+      evol_func: Callable, evaluates time derivative of y at a given time.
+      y0: N-D Tensor holds initial values of the solution.
+      time_grid: 1-D Tensor holding the time points at which the solution
+        will be recorded, must have a floating dtype.
+      dt_grid: 1-D Tensor holds fixed time steps to be used on time_grid
+        intervals. Must be a floating dtype and have one less element than that
+        of the time_grid.
+      steps_on_intervals: 1-D Tensor of integer dtype, must have the same size
+        as dt_grid. Specifies number of steps needed for every interval. Assumes
+        steps_on_intervals * dt_grid == time intervals.
+
+    Returns:
+      (N+1)-D tensor, where the first dimension corresponds to different
+      time points. Contains the solved value of y for each desired time point in
+      `t`, with the initial value `y0` being the first element along the first
+      dimension.
+    """
 
-    y_grid = functional_ops.scan(scan_func, (time_grid[:-1], time_delta_grid),
-                                 y0)
-    return array_ops.concat([[y0], y_grid], axis=0)
+    iteration_func = self._make_iteration_func(evol_func, dt_grid)
+    integrate_interval = self._make_interval_integrator(iteration_func,
+                                                        steps_on_intervals)
 
-  def _make_scan_func(self, evol_func):
+    num_times = array_ops.size(time_grid)
+    current_time = time_grid[0]
+    solution_array = tensor_array_ops.TensorArray(y0.dtype, num_times)
+    solution_array = solution_array.write(0, y0)
 
-    def scan_func(y, t_and_dt):
-      t, dt = t_and_dt
+    solution_array, _, _, _ = control_flow_ops.while_loop(
+        lambda _, __, ___, i: i < num_times,
+        integrate_interval,
+        (solution_array, y0, current_time, 1)
+    )
+    solution_array = solution_array.stack()
+    solution_array.set_shape(time_grid.get_shape().concatenate(y0.get_shape()))
+    return solution_array
+
+  def _make_iteration_func(self, evol_func, dt_grid):
+    """Returns a function that builds operations of a single time step."""
+
+    def iteration_func(y, t, dt_step, interval_step):
+      """Performs a single time step advance."""
+      dt = dt_grid[interval_step - 1]
       dy = self._step_func(evol_func, t, dt, y)
       dy = math_ops.cast(dy, dtype=y.dtype)
-      return y + dy
+      return y + dy, t + dt, dt_step + 1, interval_step
+
+    return iteration_func
+
+  def _make_interval_integrator(self, iteration_func, interval_sizes):
+    """Returns a function that builds operations for interval integration."""
 
-    return scan_func
+    def integrate_interval(solution_array, y, t, interval_num):
+      """Integrates y with fixed time step on interval `interval_num`."""
+      y, t, _, _ = control_flow_ops.while_loop(
+          lambda _, __, j, interval_num: j < interval_sizes[interval_num - 1],
+          iteration_func,
+          (y, t, 0, interval_num)
+      )
+      return solution_array.write(interval_num, y), y, t, interval_num + 1
+
+    return integrate_interval
 
   @abc.abstractmethod
   def _step_func(self, evol_func, t, dt, y):
@@ -555,6 +618,7 @@ class _FixedGridIntegrator(six.with_metaclass(abc.ABCMeta)):
 
 
 class _MidpointFixedGridIntegrator(_FixedGridIntegrator):
+  """Fixed grid integrator implementing midpoint scheme."""
 
   def _step_func(self, evol_func, t, dt, y):
     dt_cast = math_ops.cast(dt, y.dtype)
@@ -563,6 +627,7 @@ class _MidpointFixedGridIntegrator(_FixedGridIntegrator):
 
 
 class _RK4FixedGridIntegrator(_FixedGridIntegrator):
+  """Fixed grid integrator implementing RK4 scheme."""
 
   def _step_func(self, evol_func, t, dt, y):
     k1 = evol_func(y, t)
@@ -575,7 +640,7 @@ class _RK4FixedGridIntegrator(_FixedGridIntegrator):
     return math_ops.add_n([k1, 2 * k2, 2 * k3, k4]) * (dt_cast / 6)
 
 
-def odeint_fixed(func, y0, t, method='rk4', name=None):
+def odeint_fixed(func, y0, t, dt=None, method='rk4', name=None):
   """ODE integration on a fixed grid (with no step size control).
 
   Useful in certain scenarios to avoid the overhead of adaptive step size
@@ -590,6 +655,14 @@ def odeint_fixed(func, y0, t, method='rk4', name=None):
       `y`. The initial time point should be the first element of this sequence,
       and each time must be larger than the previous time. May have any floating
       point dtype.
+    dt: 0-D or 1-D Tensor providing time step suggestion to be used on time
+      integration intervals in `t`. 1-D Tensor should provide values
+      for all intervals, must have 1 less element than that of `t`.
+      If given a 0-D Tensor, the value is interpreted as time step suggestion
+      same for all intervals. If passed None, then time step is set to be the
+      t[1:] - t[:-1]. Defaults to None. The actual step size is obtained by
+      ensuring an integer number of steps per interval, potentially reducing the
+      time step.
     method: One of 'midpoint' or 'rk4'.
     name: Optional name for the resulting operation.
 
@@ -602,16 +675,29 @@ def odeint_fixed(func, y0, t, method='rk4', name=None):
   Raises:
     ValueError: Upon caller errors.
   """
-  with ops.name_scope(name, 'odeint_fixed', [y0, t]):
+  with ops.name_scope(name, 'odeint_fixed', [y0, t, dt]):
     t = ops.convert_to_tensor(t, preferred_dtype=dtypes.float64, name='t')
     y0 = ops.convert_to_tensor(y0, name='y0')
-    _check_input_types(t, y0)
+
+    intervals = t[1:] - t[:-1]
+    if dt is None:
+      dt = intervals
+    dt = ops.convert_to_tensor(dt, preferred_dtype=dtypes.float64, name='dt')
+
+    steps_on_intervals = math_ops.ceil(intervals / dt)
+    dt = intervals / steps_on_intervals
+    steps_on_intervals = math_ops.cast(steps_on_intervals, dtype=dtypes.int32)
+
+    _check_input_types(y0, t, dt)
+    _check_input_sizes(t, dt)
 
     with _assert_increasing(t):
       with ops.name_scope(method):
         if method == 'midpoint':
-          return _MidpointFixedGridIntegrator().integrate(func, y0, t)
+          return _MidpointFixedGridIntegrator().integrate(func, y0, t, dt,
+                                                          steps_on_intervals)
         elif method == 'rk4':
-          return _RK4FixedGridIntegrator().integrate(func, y0, t)
+          return _RK4FixedGridIntegrator().integrate(func, y0, t, dt,
+                                                     steps_on_intervals)
         else:
           raise ValueError('method not supported: {!s}'.format(method))
diff --git a/tensorflow/contrib/integrate/python/ops/odes_test.py b/tensorflow/contrib/integrate/python/ops/odes_test.py
index 3ec01212d2..c7b4e2faa8 100644
--- a/tensorflow/contrib/integrate/python/ops/odes_test.py
+++ b/tensorflow/contrib/integrate/python/ops/odes_test.py
@@ -242,40 +242,56 @@ class InterpolationTest(test.TestCase):
 
 class OdeIntFixedTest(test.TestCase):
 
-  def _test_integrate_sine(self, method):
+  def _test_integrate_sine(self, method, t, dt=None):
 
     def evol_func(y, t):
       del t
       return array_ops.stack([y[1], -y[0]])
 
     y0 = [0., 1.]
-    time_grid = np.linspace(0., 10., 200)
-    y_grid = odes.odeint_fixed(evol_func, y0, time_grid, method=method)
+    y_grid = odes.odeint_fixed(evol_func, y0, t, dt, method=method)
 
     with self.test_session() as sess:
       y_grid_array = sess.run(y_grid)
 
     np.testing.assert_allclose(
-        y_grid_array[:, 0], np.sin(time_grid), rtol=1e-2, atol=1e-2)
+        y_grid_array[:, 0], np.sin(t), rtol=1e-2, atol=1e-2)
 
-  def _test_integrate_gaussian(self, method):
+  def _test_integrate_gaussian(self, method, t, dt=None):
 
     def evol_func(y, t):
       return -math_ops.cast(t, dtype=y.dtype) * y[0]
 
     y0 = [1.]
-    time_grid = np.linspace(0., 2., 100)
-    y_grid = odes.odeint_fixed(evol_func, y0, time_grid, method=method)
+    y_grid = odes.odeint_fixed(evol_func, y0, t, dt, method=method)
 
     with self.test_session() as sess:
       y_grid_array = sess.run(y_grid)
 
     np.testing.assert_allclose(
-        y_grid_array[:, 0], np.exp(-time_grid**2 / 2), rtol=1e-2, atol=1e-2)
+        y_grid_array[:, 0], np.exp(-t**2 / 2), rtol=1e-2, atol=1e-2)
+
+  def _test_integrate_sine_all(self, method):
+    uniform_time_grid = np.linspace(0., 10., 200)
+    non_uniform_time_grid = np.asarray([0.0, 0.4, 4.7, 5.2, 7.0])
+    uniform_dt = 0.02
+    non_uniform_dt = np.asarray([0.01, 0.001, 0.05, 0.03])
+    self._test_integrate_sine(method, uniform_time_grid)
+    self._test_integrate_sine(method, non_uniform_time_grid, uniform_dt)
+    self._test_integrate_sine(method, non_uniform_time_grid, non_uniform_dt)
+
+  def _test_integrate_gaussian_all(self, method):
+    uniform_time_grid = np.linspace(0., 2., 100)
+    non_uniform_time_grid = np.asarray([0.0, 0.1, 0.7, 1.2, 2.0])
+    uniform_dt = 0.01
+    non_uniform_dt = np.asarray([0.01, 0.001, 0.1, 0.03])
+    self._test_integrate_gaussian(method, uniform_time_grid)
+    self._test_integrate_gaussian(method, non_uniform_time_grid, uniform_dt)
+    self._test_integrate_gaussian(method, non_uniform_time_grid, non_uniform_dt)
 
   def _test_everything(self, method):
-    self._test_integrate_sine(method)
-    self._test_integrate_gaussian(method)
+    self._test_integrate_sine_all(method)
+    self._test_integrate_gaussian_all(method)
 
   def test_midpoint(self):
     self._test_everything('midpoint')
@@ -283,6 +299,21 @@ class OdeIntFixedTest(test.TestCase):
   def test_rk4(self):
     self._test_everything('rk4')
 
+  def test_dt_size_exceptions(self):
+    times = np.linspace(0., 2., 100)
+    dt = np.ones(99) * 0.01
+    dt_wrong_length = np.asarray([0.01, 0.001, 0.1, 0.03])
+    dt_wrong_dim = np.expand_dims(np.linspace(0., 2., 99), axis=0)
+    times_wrong_dim = np.expand_dims(np.linspace(0., 2., 100), axis=0)
+    with self.assertRaises(ValueError):
+      self._test_integrate_gaussian('midpoint', times, dt_wrong_length)
+
+    with self.assertRaises(ValueError):
+      self._test_integrate_gaussian('midpoint', times, dt_wrong_dim)
+
+    with self.assertRaises(ValueError):
+      self._test_integrate_gaussian('midpoint', times_wrong_dim, dt)
+
 
 if __name__ == '__main__':
   test.main()
-- 
GitLab


From 90f6bd2d962ade377a5b92c7d1c0e1faa78288e0 Mon Sep 17 00:00:00 2001
From: Nupur Garg <nupurgarg@google.com>
Date: Tue, 12 Jun 2018 09:38:40 -0700
Subject: [PATCH 601/610] Add strings type to TOCO Python API.

PiperOrigin-RevId: 200228895
---
 tensorflow/contrib/lite/python/convert.py        | 7 ++++---
 tensorflow/contrib/lite/python/lite.py           | 5 +++--
 tensorflow/contrib/lite/python/tflite_convert.py | 4 ++--
 tensorflow/contrib/lite/toco/tooling_util.cc     | 2 +-
 4 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/tensorflow/contrib/lite/python/convert.py b/tensorflow/contrib/lite/python/convert.py
index c038c88945..df39d7ff50 100644
--- a/tensorflow/contrib/lite/python/convert.py
+++ b/tensorflow/contrib/lite/python/convert.py
@@ -136,10 +136,10 @@ def build_toco_convert_protos(input_tensors,
       `foo.get_shape()` and `foo.dtype`.
     output_tensors: List of output tensors (only .name is used from this).
     inference_type: Target data type of arrays in the output file. Currently
-      must be `{FLOAT, QUANTIZED_UINT8}`.  (default FLOAT)
+      must be `{FLOAT, QUANTIZED_UINT8, STRING}`.  (default FLOAT)
     inference_input_type: Target data type of input arrays. Allows for a
       different type for input arrays in the case of quantization. Currently
-      must be `{FLOAT, QUANTIZED_UINT8}`. (default `inference_type`)
+      must be `{FLOAT, QUANTIZED_UINT8, STRING}`. (default `inference_type`)
     input_format: Type of data to read Currently must be
       `{TENSORFLOW_GRAPHDEF}`. (default TENSORFLOW_GRAPHDEF)
     output_format: Output file format. Currently must be `{TFLITE,
@@ -213,7 +213,8 @@ def build_toco_convert_protos(input_tensors,
       tflite_input_type = lite_constants.INT64
     elif input_tensor.dtype == _dtypes.uint8:
       tflite_input_type = lite_constants.QUANTIZED_UINT8
-    # TODO(aselle): Insert strings when they are available
+    elif input_tensor.dtype == _dtypes.string:
+      tflite_input_type = lite_constants.STRING
     else:
       raise ValueError("Tensors %s not known type %r" % (input_tensor.name,
                                                          input_tensor.dtype))
diff --git a/tensorflow/contrib/lite/python/lite.py b/tensorflow/contrib/lite/python/lite.py
index 6b63c0ccef..611e0f91d0 100644
--- a/tensorflow/contrib/lite/python/lite.py
+++ b/tensorflow/contrib/lite/python/lite.py
@@ -25,6 +25,7 @@ EXPERIMENTAL: APIs here are unstable and likely to change without notice.
 
 @@FLOAT
 @@QUANTIZED_UINT8
+@@STRING
 @@TFLITE
 @@GRAPHVIZ_DOT
 
@@ -64,10 +65,10 @@ class TocoConverter(object):
   Attributes:
 
     inference_type: Target data type of arrays in the output file. Currently
-      must be `{FLOAT, QUANTIZED_UINT8}`.  (default FLOAT)
+      must be `{FLOAT, QUANTIZED_UINT8, STRING}`.  (default FLOAT)
     inference_input_type: Target data type of input arrays. Allows for a
       different type for input arrays in the case of quantization. Currently
-      must be `{FLOAT, QUANTIZED_UINT8}`. (default `inference_type`)
+      must be `{FLOAT, QUANTIZED_UINT8, STRING}`. (default `inference_type`)
     output_format: Output file format. Currently must be `{TFLITE,
       GRAPHVIZ_DOT}`. (default TFLITE)
     quantized_input_stats: Dict of strings representing input tensor names
diff --git a/tensorflow/contrib/lite/python/tflite_convert.py b/tensorflow/contrib/lite/python/tflite_convert.py
index f497533bed..7bbfe2a601 100644
--- a/tensorflow/contrib/lite/python/tflite_convert.py
+++ b/tensorflow/contrib/lite/python/tflite_convert.py
@@ -234,12 +234,12 @@ def run_main(_):
   parser.add_argument(
       "--inference_type",
       type=str.upper,
-      choices=["FLOAT", "QUANTIZED_UINT8"],
+      choices=["FLOAT", "QUANTIZED_UINT8", "STRING"],
       help="Target data type of arrays in the output file.")
   parser.add_argument(
       "--inference_input_type",
       type=str.upper,
-      choices=["FLOAT", "QUANTIZED_UINT8"],
+      choices=["FLOAT", "QUANTIZED_UINT8", "STRING"],
       help=("Target data type of input arrays. Allows for a different type for "
             "input arrays in the case of quantization."))
 
diff --git a/tensorflow/contrib/lite/toco/tooling_util.cc b/tensorflow/contrib/lite/toco/tooling_util.cc
index 810718f610..13e9331919 100644
--- a/tensorflow/contrib/lite/toco/tooling_util.cc
+++ b/tensorflow/contrib/lite/toco/tooling_util.cc
@@ -920,7 +920,7 @@ void CheckEachArray(const Model& model) {
       CHECK(array->buffer->type == array->data_type);
       // The presence of a fixed buffer should imply the presence of a fixed
       // shape.
-      CHECK(array->has_shape());
+      CHECK(array->has_shape()) << "Invalid array: " << array_entry.first;
       // Constant buffer should has a valid shape.
       for (int d : array->shape().dims()) {
         CHECK_GE(d, 1);
-- 
GitLab


From 73a8f96660587747956432941be17ea2dfe6dd33 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 12 Jun 2018 09:43:30 -0700
Subject: [PATCH 602/610] Small utility to handle runtime shapes.

PiperOrigin-RevId: 200229761
---
 tensorflow/contrib/lite/kernels/internal/types.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tensorflow/contrib/lite/kernels/internal/types.h b/tensorflow/contrib/lite/kernels/internal/types.h
index 1086c5b092..3ecef15271 100644
--- a/tensorflow/contrib/lite/kernels/internal/types.h
+++ b/tensorflow/contrib/lite/kernels/internal/types.h
@@ -121,6 +121,10 @@ class RuntimeShape {
     }
   }
 
+  inline void BuildFrom(const std::initializer_list<int> init_list) {
+    BuildFrom<const std::initializer_list<int>>(init_list);
+  }
+
   // Returns the total count of elements, that is the size when flattened into a
   // vector.
   inline int FlatSize() const {
-- 
GitLab


From 3b4f4164663da4c65807c34e7188e43c9d7d7535 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 12 Jun 2018 09:50:22 -0700
Subject: [PATCH 603/610] Random jpeg encoding augmentation.

PiperOrigin-RevId: 200231310
---
 tensorflow/python/ops/image_ops_impl.py       | 69 +++++++++++++++++++
 .../tools/api/golden/tensorflow.image.pbtxt   |  8 +++
 2 files changed, 77 insertions(+)

diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py
index 16aa85ca10..c2179023cd 100644
--- a/tensorflow/python/ops/image_ops_impl.py
+++ b/tensorflow/python/ops/image_ops_impl.py
@@ -1451,6 +1451,75 @@ def adjust_hue(image, delta, name=None):
     return convert_image_dtype(rgb_altered, orig_dtype)
 
 
+# pylint: disable=invalid-name
+@tf_export('image.random_jpeg_quality')
+def random_jpeg_quality(image, min_jpeg_quality, max_jpeg_quality, seed=None):
+  """Randomly changes jpeg encoding quality for inducing jpeg noise.
+
+  `min_jpeg_quality` must be in the interval `[0, 100]` and less than
+  `max_jpeg_quality`.
+  `max_jpeg_quality` must be in the interval `[0, 100]`.
+
+  Args:
+    image: RGB image or images. Size of the last dimension must be 3.
+    min_jpeg_quality: Minimum jpeg encoding quality to use.
+    max_jpeg_quality: Upper bound (exclusive) on the jpeg encoding quality.
+    seed: Optional seed for the NumPy generator that draws the quality when
+      this function is called. The draw happens in Python, so the seed is not
+      combined with the graph-level seed; a given seed always selects the
+      same quality for the same bounds.
+
+  Returns:
+    Adjusted image(s), same shape and DType as `image`.
+
+  Raises:
+    ValueError: if `min_jpeg_quality` or `max_jpeg_quality` is invalid.
+  """
+  if (min_jpeg_quality < 0 or max_jpeg_quality < 0 or
+      min_jpeg_quality > 100 or max_jpeg_quality > 100):
+    raise ValueError('jpeg encoding range must be between 0 and 100.')
+
+  if min_jpeg_quality >= max_jpeg_quality:
+    raise ValueError('`min_jpeg_quality` must be less than `max_jpeg_quality`.')
+  # A private RandomState avoids reseeding NumPy's process-wide generator.
+  rng = np.random.RandomState(seed)
+  jpeg_quality = rng.randint(min_jpeg_quality, max_jpeg_quality)
+  return adjust_jpeg_quality(image, jpeg_quality)
+
+
+@tf_export('image.adjust_jpeg_quality')
+def adjust_jpeg_quality(image, jpeg_quality, name=None):
+  """Adjust jpeg encoding quality of an RGB image.
+
+  This is a convenience method that adjusts jpeg encoding quality of an
+  RGB image.
+
+  `image` is an RGB image.  The image's encoding quality is adjusted
+  to `jpeg_quality`.
+  `jpeg_quality` must be in the interval `[0, 100]`.
+
+  Args:
+    image: RGB image or images. Size of the last dimension must be 3.
+    jpeg_quality: int.  jpeg encoding quality.
+    name: A name for this operation (optional).
+
+  Returns:
+    Adjusted image(s), same shape and DType as `image`.
+  """
+  with ops.name_scope(name, 'adjust_jpeg_quality', [image]) as name:
+    image = ops.convert_to_tensor(image, name='image')
+    # Remember original dtype so we can convert back if needed
+    orig_dtype = image.dtype
+    # Convert to uint8
+    image = convert_image_dtype(image, dtypes.uint8)
+    # Encode image to jpeg with given jpeg quality
+    image = gen_image_ops.encode_jpeg(image, quality=jpeg_quality)
+    # Decode jpeg image
+    image = gen_image_ops.decode_jpeg(image)
+    # Convert back to original dtype and return
+    return convert_image_dtype(image, orig_dtype)
+
+
 @tf_export('image.random_saturation')
 def random_saturation(image, lower, upper, seed=None):
   """Adjust the saturation of an RGB image by a random factor.
diff --git a/tensorflow/tools/api/golden/tensorflow.image.pbtxt b/tensorflow/tools/api/golden/tensorflow.image.pbtxt
index 87543e374b..a5b82f4bf8 100644
--- a/tensorflow/tools/api/golden/tensorflow.image.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.image.pbtxt
@@ -20,6 +20,10 @@ tf_module {
     name: "adjust_hue"
     argspec: "args=[\'image\', \'delta\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "adjust_jpeg_quality"
+    argspec: "args=[\'image\', \'jpeg_quality\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "adjust_saturation"
     argspec: "args=[\'image\', \'saturation_factor\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -144,6 +148,10 @@ tf_module {
     name: "random_hue"
     argspec: "args=[\'image\', \'max_delta\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "random_jpeg_quality"
+    argspec: "args=[\'image\', \'min_jpeg_quality\', \'max_jpeg_quality\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "random_saturation"
     argspec: "args=[\'image\', \'lower\', \'upper\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\'], "
-- 
GitLab


From b0a15f21d2009ead9c8ed5e245a02b5c42355853 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 12 Jun 2018 09:51:04 -0700
Subject: [PATCH 604/610] Make the return value of `read_var` consistently a
 tensor instead of sometimes a variable.

PiperOrigin-RevId: 200231463
---
 tensorflow/contrib/distribute/python/mirrored_strategy.py   | 2 +-
 tensorflow/contrib/distribute/python/one_device_strategy.py | 2 +-
 tensorflow/python/training/distribute.py                    | 4 ++--
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/tensorflow/contrib/distribute/python/mirrored_strategy.py b/tensorflow/contrib/distribute/python/mirrored_strategy.py
index 403e47d94f..900aa10e93 100644
--- a/tensorflow/contrib/distribute/python/mirrored_strategy.py
+++ b/tensorflow/contrib/distribute/python/mirrored_strategy.py
@@ -349,7 +349,7 @@ class MirroredStrategy(distribute_lib.DistributionStrategy):
     if isinstance(tower_local_var, values.TowerLocalVariable):
       return math_ops.add_n(self.unwrap(tower_local_var))
     assert isinstance(tower_local_var, values.Mirrored)
-    return tower_local_var.get()
+    return array_ops.identity(tower_local_var.get())
 
   def _fetch(self, val, destination, fn):
     """Return a copy of `val` or `fn(val)` on `destination`."""
diff --git a/tensorflow/contrib/distribute/python/one_device_strategy.py b/tensorflow/contrib/distribute/python/one_device_strategy.py
index 6378af32bd..7f4bab9d93 100644
--- a/tensorflow/contrib/distribute/python/one_device_strategy.py
+++ b/tensorflow/contrib/distribute/python/one_device_strategy.py
@@ -104,7 +104,7 @@ class OneDeviceStrategy(distribute_lib.DistributionStrategy):
 
   def read_var(self, tower_local_var):
     """Read the aggregate value of a tower-local variable."""
-    return tower_local_var
+    return array_ops.identity(tower_local_var)
 
   def _fetch(self, val, destination, fn):
     """Return a copy of `val` or `fn(val)` on `destination`."""
diff --git a/tensorflow/python/training/distribute.py b/tensorflow/python/training/distribute.py
index 29198e48fa..caffd042a0 100644
--- a/tensorflow/python/training/distribute.py
+++ b/tensorflow/python/training/distribute.py
@@ -652,7 +652,7 @@ class DistributionStrategy(object):
     """Reads the value of a variable.
 
     Returns the aggregate value of a tower-local variable, or the
-    (possibly read-only) value of any other variable.
+    (read-only) value of any other variable.
 
     Args:
       v: A variable allocated within the scope of this `DistributionStrategy`.
@@ -1217,7 +1217,7 @@ class _DefaultDistributionStrategy(DistributionStrategy):
       return fn(*args, **kwargs)
 
   def read_var(self, tower_local_var):
-    return tower_local_var
+    return array_ops.identity(tower_local_var)
 
   def _fetch(self, var, destination, fn):
     with ops.colocate_with(var):
-- 
GitLab


From d820151d5719532155b8637ec7baa75ff4c7ebbd Mon Sep 17 00:00:00 2001
From: Shashi Shekhar <shashishekhar@google.com>
Date: Tue, 12 Jun 2018 10:47:09 -0700
Subject: [PATCH 605/610] Fix a few copts.

PiperOrigin-RevId: 200241859
---
 tensorflow/contrib/lite/profiling/BUILD       |  7 ++++++-
 tensorflow/contrib/lite/tools/benchmark/BUILD | 13 +++++--------
 2 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/tensorflow/contrib/lite/profiling/BUILD b/tensorflow/contrib/lite/profiling/BUILD
index c31189f2b1..a162b87b8f 100644
--- a/tensorflow/contrib/lite/profiling/BUILD
+++ b/tensorflow/contrib/lite/profiling/BUILD
@@ -2,9 +2,11 @@ package(default_visibility = ["//visibility:public"])
 
 licenses(["notice"])  # Apache 2.0
 
+load("//tensorflow/contrib/lite:build_def.bzl", "tflite_copts")
+
 common_copts = [
     "-Wall",
-]
+] + tflite_copts()
 
 cc_library(
     name = "profiler",
@@ -36,12 +38,14 @@ cc_library(
     name = "time",
     srcs = ["time.cc"],
     hdrs = ["time.h"],
+    copts = common_copts,
 )
 
 cc_library(
     name = "profile_summarizer",
     srcs = ["profile_summarizer.cc"],
     hdrs = ["profile_summarizer.h"],
+    copts = common_copts,
     deps = [
         ":profiler",
         "//tensorflow/contrib/lite:framework",
@@ -53,6 +57,7 @@ cc_library(
 cc_test(
     name = "profile_summarizer_test",
     srcs = ["profile_summarizer_test.cc"],
+    copts = common_copts,
     deps = [
         ":profile_summarizer",
         "//tensorflow/contrib/lite:framework",
diff --git a/tensorflow/contrib/lite/tools/benchmark/BUILD b/tensorflow/contrib/lite/tools/benchmark/BUILD
index f918010e2b..96c6b6872e 100644
--- a/tensorflow/contrib/lite/tools/benchmark/BUILD
+++ b/tensorflow/contrib/lite/tools/benchmark/BUILD
@@ -8,7 +8,7 @@ load("//tensorflow/contrib/lite:special_rules.bzl", "tflite_portable_test_suite"
 load("//tensorflow/contrib/lite:build_def.bzl", "tflite_linkopts")
 load("//tensorflow/contrib/lite:build_def.bzl", "tflite_copts")
 
-common_copts = ["-Wall"]
+common_copts = ["-Wall"] + tflite_copts()
 
 cc_binary(
     name = "benchmark_model",
@@ -16,14 +16,11 @@ cc_binary(
         "benchmark_main.cc",
         "logging.h",
     ],
-    copts = tflite_copts() + common_copts,
-    linkopts = select({
+    copts = common_copts,
+    linkopts = tflite_linkopts() + select({
         "//tensorflow:android": [
-            "-pie",
-            "-landroid",
-            "-lm",
-            "-z defs",
-            "-Wl,--exclude-libs,ALL",  # Exclude syms in all libs from auto export
+            "-pie",  # Android 5.0 and later supports only PIE
+            "-lm",  # some builtin ops, e.g., tanh, need -lm
         ],
         "//conditions:default": [],
     }),
-- 
GitLab


From ffe3d1b4dba7c39a291861e75060a871caab92c3 Mon Sep 17 00:00:00 2001
From: Frank Chen <frankchn@google.com>
Date: Tue, 12 Jun 2018 10:51:57 -0700
Subject: [PATCH 606/610] Add resize_images_preserve_aspect_ratio function.

PiperOrigin-RevId: 200242751
---
 tensorflow/python/ops/image_ops_impl.py       | 29 ++++++-
 tensorflow/python/ops/image_ops_test.py       | 80 +++++++++++++++++++
 .../tools/api/golden/tensorflow.image.pbtxt   |  2 +-
 3 files changed, 109 insertions(+), 2 deletions(-)

diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py
index c2179023cd..bdcf420980 100644
--- a/tensorflow/python/ops/image_ops_impl.py
+++ b/tensorflow/python/ops/image_ops_impl.py
@@ -921,7 +921,8 @@ class ResizeMethod(object):
 def resize_images(images,
                   size,
                   method=ResizeMethod.BILINEAR,
-                  align_corners=False):
+                  align_corners=False,
+                  preserve_aspect_ratio=False):
   """Resize `images` to `size` using the specified `method`.
 
   Resized images will be distorted if their original aspect ratio is not
@@ -953,6 +954,10 @@ def resize_images(images,
     align_corners: bool.  If True, the centers of the 4 corner pixels of the
         input and output tensors are aligned, preserving the values at the
         corner pixels. Defaults to `False`.
+    preserve_aspect_ratio: Whether to preserve the aspect ratio. If this is set,
+      then `images` will be resized to a size that fits in `size` while
+      preserving the aspect ratio of the original image. Scales up the image if
+      `size` is bigger than the current size of the `image`. Defaults to False.
 
   Raises:
     ValueError: if the shape of `images` is incompatible with the
@@ -991,6 +996,28 @@ def resize_images(images,
     new_height_const = size_const_as_shape[0].value
     new_width_const = size_const_as_shape[1].value
 
+    if preserve_aspect_ratio:
+      # Get the current shapes of the image, even if dynamic.
+      _, current_height, current_width, _ = _ImageDimensions(images, rank=4)
+
+      # do the computation to find the right scale and height/width.
+      scale_factor_height = (math_ops.to_float(new_height_const) /
+                             math_ops.to_float(current_height))
+      scale_factor_width = (math_ops.to_float(new_width_const) /
+                            math_ops.to_float(current_width))
+      scale_factor = math_ops.minimum(scale_factor_height, scale_factor_width)
+      scaled_height_const = math_ops.to_int32(scale_factor *
+                                              math_ops.to_float(current_height))
+      scaled_width_const = math_ops.to_int32(scale_factor *
+                                             math_ops.to_float(current_width))
+
+      # NOTE: Reset the size and other constants used later.
+      size = ops.convert_to_tensor([scaled_height_const, scaled_width_const],
+                                   dtypes.int32, name='size')
+      size_const_as_shape = tensor_util.constant_value_as_shape(size)
+      new_height_const = size_const_as_shape[0].value
+      new_width_const = size_const_as_shape[1].value
+
     # If we can determine that the height and width will be unmodified by this
     # transformation, we avoid performing the resize.
     if all(x is not None
diff --git a/tensorflow/python/ops/image_ops_test.py b/tensorflow/python/ops/image_ops_test.py
index 72c889a2e6..45499dcce0 100644
--- a/tensorflow/python/ops/image_ops_test.py
+++ b/tensorflow/python/ops/image_ops_test.py
@@ -2511,6 +2511,86 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
       y = image_ops.resize_images(single_image, [55, 66])
       self.assertTrue(y.op.name.startswith("resize_images"))
 
+  def _ResizeImageCall(self, x, max_h, max_w, preserve_aspect_ratio,
+                       use_tensor_inputs):
+    if use_tensor_inputs:
+      target_max = ops.convert_to_tensor([max_h, max_w])
+      x_tensor = array_ops.placeholder(x.dtype, shape=[None] * x.ndim)
+      feed_dict = {x_tensor: x}
+    else:
+      target_max = [max_h, max_w]
+      x_tensor = x
+      feed_dict = {}
+
+    y = image_ops.resize_images(x_tensor, target_max,
+                                preserve_aspect_ratio=preserve_aspect_ratio)
+
+    with self.test_session(use_gpu=True):
+      return y.eval(feed_dict=feed_dict)
+
+  def _assertResizeEqual(self, x, x_shape, y, y_shape,
+                         preserve_aspect_ratio=True,
+                         use_tensor_inputs_options=None):
+    use_tensor_inputs_options = use_tensor_inputs_options or [False, True]
+    target_height, target_width, _ = y_shape
+    x = np.array(x).reshape(x_shape)
+    y = np.array(y).reshape(y_shape)
+
+    for use_tensor_inputs in use_tensor_inputs_options:
+      y_tf = self._ResizeImageCall(x, target_height, target_width,
+                                   preserve_aspect_ratio, use_tensor_inputs)
+      self.assertAllClose(y, y_tf)
+
+  def _assertResizeCheckShape(self, x, x_shape, target_shape,
+                              y_shape, preserve_aspect_ratio=True,
+                              use_tensor_inputs_options=None):
+    use_tensor_inputs_options = use_tensor_inputs_options or [False, True]
+    target_height, target_width = target_shape
+    x = np.array(x).reshape(x_shape)
+    y = np.zeros(y_shape)
+
+    for use_tensor_inputs in use_tensor_inputs_options:
+      y_tf = self._ResizeImageCall(x, target_height, target_width,
+                                   preserve_aspect_ratio, use_tensor_inputs)
+      self.assertShapeEqual(y, ops.convert_to_tensor(y_tf))
+
+  def testPreserveAspectRatioMultipleImages(self):
+    x_shape = [10, 100, 100, 10]
+    x = np.random.uniform(size=x_shape)
+
+    self._assertResizeCheckShape(x, x_shape, [250, 250], [10, 250, 250, 10],
+                                 preserve_aspect_ratio=False)
+
+  def testPreserveAspectRatioNoOp(self):
+    x_shape = [10, 10, 10]
+    x = np.random.uniform(size=x_shape)
+
+    self._assertResizeEqual(x, x_shape, x, x_shape)
+
+  def testPreserveAspectRatioSmaller(self):
+    x_shape = [100, 100, 10]
+    x = np.random.uniform(size=x_shape)
+
+    self._assertResizeCheckShape(x, x_shape, [75, 50], [50, 50, 10])
+
+  def testPreserveAspectRatioSmallerMultipleImages(self):
+    x_shape = [10, 100, 100, 10]
+    x = np.random.uniform(size=x_shape)
+
+    self._assertResizeCheckShape(x, x_shape, [75, 50], [10, 50, 50, 10])
+
+  def testPreserveAspectRatioLarger(self):
+    x_shape = [100, 100, 10]
+    x = np.random.uniform(size=x_shape)
+
+    self._assertResizeCheckShape(x, x_shape, [150, 200], [150, 150, 10])
+
+  def testPreserveAspectRatioSameRatio(self):
+    x_shape = [1920, 1080, 3]
+    x = np.random.uniform(size=x_shape)
+
+    self._assertResizeCheckShape(x, x_shape, [3840, 2160], [3840, 2160, 3])
+
 
 class ResizeImageWithCropOrPadTest(test_util.TensorFlowTestCase):
 
diff --git a/tensorflow/tools/api/golden/tensorflow.image.pbtxt b/tensorflow/tools/api/golden/tensorflow.image.pbtxt
index a5b82f4bf8..5bb3b3c444 100644
--- a/tensorflow/tools/api/golden/tensorflow.image.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.image.pbtxt
@@ -174,7 +174,7 @@ tf_module {
   }
   member_method {
     name: "resize_images"
-    argspec: "args=[\'images\', \'size\', \'method\', \'align_corners\'], varargs=None, keywords=None, defaults=[\'0\', \'False\'], "
+    argspec: "args=[\'images\', \'size\', \'method\', \'align_corners\', \'preserve_aspect_ratio\'], varargs=None, keywords=None, defaults=[\'0\', \'False\', \'False\'], "
   }
   member_method {
     name: "resize_nearest_neighbor"
-- 
GitLab


From ba9422a8adba18fc97cc1923002b7db8ca63dcfe Mon Sep 17 00:00:00 2001
From: Brennan Saeta <saeta@google.com>
Date: Tue, 12 Jun 2018 11:12:53 -0700
Subject: [PATCH 607/610] Switch from grpc++_unsecure to grpc++

Fixes #13590

PiperOrigin-RevId: 200246854
---
 tensorflow/compiler/xla/rpc/BUILD             |  6 +--
 tensorflow/contrib/cmake/CMakeLists.txt       |  9 +++++
 tensorflow/contrib/cmake/external/grpc.cmake  | 17 ++++++---
 tensorflow/contrib/tpu/profiler/BUILD         |  2 +-
 tensorflow/contrib/verbs/BUILD                |  4 +-
 tensorflow/core/debug/BUILD                   |  6 +--
 tensorflow/core/distributed_runtime/BUILD     |  4 +-
 .../core/distributed_runtime/eager/BUILD      |  4 +-
 tensorflow/core/distributed_runtime/rpc/BUILD | 38 +++++++++----------
 .../core/distributed_runtime/rpc/eager/BUILD  |  6 +--
 tensorflow/workspace.bzl                      |  4 +-
 11 files changed, 57 insertions(+), 43 deletions(-)

diff --git a/tensorflow/compiler/xla/rpc/BUILD b/tensorflow/compiler/xla/rpc/BUILD
index 0d56a9a477..1775666652 100644
--- a/tensorflow/compiler/xla/rpc/BUILD
+++ b/tensorflow/compiler/xla/rpc/BUILD
@@ -42,7 +42,7 @@ tf_cc_binary(
         "//tensorflow/compiler/xla/service:cpu_plugin",
         "//tensorflow/core:framework_internal",
         "//tensorflow/core:lib",
-        "@grpc//:grpc++_unsecure",
+        "@grpc//:grpc++",
     ],
 )
 
@@ -61,7 +61,7 @@ tf_cc_test(
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
-        "@grpc//:grpc++_unsecure",
+        "@grpc//:grpc++",
     ],
 )
 
@@ -74,6 +74,6 @@ cc_library(
         "//tensorflow/compiler/xla/service",
         "//tensorflow/compiler/xla/service:platform_util",
         "//tensorflow/core/distributed_runtime/rpc:grpc_util",
-        "@grpc//:grpc++_unsecure",
+        "@grpc//:grpc++",
     ],
 )
diff --git a/tensorflow/contrib/cmake/CMakeLists.txt b/tensorflow/contrib/cmake/CMakeLists.txt
index 0708d6b7b9..e524e9e743 100644
--- a/tensorflow/contrib/cmake/CMakeLists.txt
+++ b/tensorflow/contrib/cmake/CMakeLists.txt
@@ -18,7 +18,16 @@ cmake_policy(SET CMP0022 NEW)
 
 # Options
 option(tensorflow_VERBOSE "Enable for verbose output" OFF)
+
+if(WIN32)
+# BoringSSL is disabled on Windows because it currently doesn't build with
+# MSBuild (it requires Ninja).
 option(tensorflow_ENABLE_SSL_SUPPORT "Enable boringssl support" OFF)
+else()
+# BoringSSL is enabled for gRPC.
+option(tensorflow_ENABLE_SSL_SUPPORT "Enable boringssl support" ON)
+endif()
+
 option(tensorflow_ENABLE_GRPC_SUPPORT "Enable gRPC support" ON)
 option(tensorflow_ENABLE_HDFS_SUPPORT "Enable HDFS support" OFF)
 option(tensorflow_ENABLE_JEMALLOC_SUPPORT "Enable jemalloc support" OFF)
diff --git a/tensorflow/contrib/cmake/external/grpc.cmake b/tensorflow/contrib/cmake/external/grpc.cmake
index 693dc7cd67..b1e64aa55c 100644
--- a/tensorflow/contrib/cmake/external/grpc.cmake
+++ b/tensorflow/contrib/cmake/external/grpc.cmake
@@ -20,6 +20,10 @@ set(GRPC_BUILD ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc)
 set(GRPC_TAG d184fa229d75d336aedea0041bd59cb93e7e267f)
 
 if(WIN32)
+  # Use the unsecure gRPC target because BoringSSL does not build on Windows.
+  set(grpc_TARGET grpc++_unsecure)
+  set(grpc_DEPENDS protobuf zlib)
+  set(grpc_SSL_PROVIDER NONE)
   if(${CMAKE_GENERATOR} MATCHES "Visual Studio.*")
     set(grpc_STATIC_LIBRARIES
         ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/Release/grpc++_unsecure.lib
@@ -32,9 +36,12 @@ if(WIN32)
         ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/gpr.lib)
   endif()
 else()
+  set(grpc_TARGET grpc++)
+  set(grpc_DEPENDS boringssl protobuf zlib)
+  set(grpc_SSL_PROVIDER module)
   set(grpc_STATIC_LIBRARIES
-      ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/libgrpc++_unsecure.a
-      ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/libgrpc_unsecure.a
+      ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/libgrpc++.a
+      ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/libgrpc.a
       ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/libaddress_sorting.a
       ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/third_party/cares/cares/lib/libcares.a
       ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/libgpr.a)
@@ -44,13 +51,13 @@ add_definitions(-DGRPC_ARES=0)
 
 ExternalProject_Add(grpc
     PREFIX grpc
-    DEPENDS protobuf zlib
+    DEPENDS ${grpc_DEPENDS}
     GIT_REPOSITORY ${GRPC_URL}
     GIT_TAG ${GRPC_TAG}
     DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
     BUILD_IN_SOURCE 1
     BUILD_BYPRODUCTS ${grpc_STATIC_LIBRARIES}
-    BUILD_COMMAND ${CMAKE_COMMAND} --build . --config Release --target grpc++_unsecure
+    BUILD_COMMAND ${CMAKE_COMMAND} --build . --config Release --target ${grpc_TARGET}
     COMMAND ${CMAKE_COMMAND} --build . --config Release --target grpc_cpp_plugin
     INSTALL_COMMAND ""
     CMAKE_CACHE_ARGS
@@ -59,7 +66,7 @@ ExternalProject_Add(grpc
         -DPROTOBUF_INCLUDE_DIRS:STRING=${PROTOBUF_INCLUDE_DIRS}
         -DPROTOBUF_LIBRARIES:STRING=${protobuf_STATIC_LIBRARIES}
         -DZLIB_ROOT:STRING=${ZLIB_INSTALL}
-	-DgRPC_SSL_PROVIDER:STRING=NONE
+	-DgRPC_SSL_PROVIDER:STRING=${grpc_SSL_PROVIDER}
 )
 
 # grpc/src/core/ext/census/tracing.c depends on the existence of openssl/rand.h.
diff --git a/tensorflow/contrib/tpu/profiler/BUILD b/tensorflow/contrib/tpu/profiler/BUILD
index dbf1ab6bbf..3b2d7adfff 100644
--- a/tensorflow/contrib/tpu/profiler/BUILD
+++ b/tensorflow/contrib/tpu/profiler/BUILD
@@ -53,7 +53,7 @@ tf_cc_binary(
         "//tensorflow/core:lib",
         "//tensorflow/core/distributed_runtime/rpc:grpc_util",
         "//tensorflow/core/platform/cloud:gcs_file_system",
-        "@grpc//:grpc++_unsecure",
+        "@grpc//:grpc++",
     ],
 )
 
diff --git a/tensorflow/contrib/verbs/BUILD b/tensorflow/contrib/verbs/BUILD
index 9720fd6e86..1b45584dcb 100644
--- a/tensorflow/contrib/verbs/BUILD
+++ b/tensorflow/contrib/verbs/BUILD
@@ -58,7 +58,7 @@ cc_library(
         "//tensorflow/core/distributed_runtime/rpc:async_service_interface",
         "//tensorflow/core/distributed_runtime/rpc:grpc_call",
         "//tensorflow/core/distributed_runtime/rpc:grpc_util",
-        "@grpc//:grpc++_unsecure",
+        "@grpc//:grpc++",
     ],
     alwayslink = 1,
 )
@@ -69,7 +69,7 @@ cc_library(
     hdrs = ["grpc_verbs_service_impl.h"],
     deps = [
         ":verbs_service_proto_cc",
-        "@grpc//:grpc++_unsecure",
+        "@grpc//:grpc++",
     ],
 )
 
diff --git a/tensorflow/core/debug/BUILD b/tensorflow/core/debug/BUILD
index 1528c7f130..50f8a307d8 100644
--- a/tensorflow/core/debug/BUILD
+++ b/tensorflow/core/debug/BUILD
@@ -42,7 +42,7 @@ load(
 # Check that tensorflow/core:tensorflow does not depend on grpc.
 check_deps(
     name = "core_tensorflow_check_deps",
-    disallowed_deps = ["@grpc//:grpc++_unsecure"],
+    disallowed_deps = ["@grpc//:grpc++"],
     deps = ["//tensorflow/core:tensorflow"],
 )
 
@@ -150,7 +150,7 @@ tf_cuda_library(
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:proto_text",
         "//tensorflow/core:protos_all_cc",
-        "@grpc//:grpc++_unsecure",
+        "@grpc//:grpc++",
     ],
     alwayslink = 1,
 )
@@ -170,7 +170,7 @@ tf_cuda_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
-        "@grpc//:grpc++_unsecure",
+        "@grpc//:grpc++",
     ],
     alwayslink = 1,
 )
diff --git a/tensorflow/core/distributed_runtime/BUILD b/tensorflow/core/distributed_runtime/BUILD
index 9032823e17..c6db2aec06 100644
--- a/tensorflow/core/distributed_runtime/BUILD
+++ b/tensorflow/core/distributed_runtime/BUILD
@@ -649,7 +649,7 @@ tf_cuda_cc_test(
         "//tensorflow/core/kernels:dense_update_ops",
         "//tensorflow/core/kernels:identity_op",
         "//tensorflow/core/kernels:variable_ops",
-        "@grpc//:grpc++_unsecure",
+        "@grpc//:grpc++",
     ],
 )
 
@@ -682,7 +682,7 @@ tf_cuda_cc_test(
         "//tensorflow/core/distributed_runtime/rpc:grpc_testlib",
         "//tensorflow/core/distributed_runtime/rpc:grpc_util",
         "//tensorflow/core/distributed_runtime/rpc:grpc_worker_cache",
-        "@grpc//:grpc++_unsecure",
+        "@grpc//:grpc++",
     ],
 )
 
diff --git a/tensorflow/core/distributed_runtime/eager/BUILD b/tensorflow/core/distributed_runtime/eager/BUILD
index f3922dde74..dc02d1b9bf 100644
--- a/tensorflow/core/distributed_runtime/eager/BUILD
+++ b/tensorflow/core/distributed_runtime/eager/BUILD
@@ -65,8 +65,8 @@ cc_library(
         "//tensorflow/core/distributed_runtime:worker_env",
         "//tensorflow/core/distributed_runtime/eager:remote_tensor_handle",
         "//tensorflow/core/distributed_runtime/rpc:rpc_rendezvous_mgr",
-        "@grpc//:grpc++_unsecure",
-        "@grpc//:grpc_unsecure",
+        "@grpc",
+        "@grpc//:grpc++",
     ],
 )
 
diff --git a/tensorflow/core/distributed_runtime/rpc/BUILD b/tensorflow/core/distributed_runtime/rpc/BUILD
index 2eadfcde54..882271e3f5 100644
--- a/tensorflow/core/distributed_runtime/rpc/BUILD
+++ b/tensorflow/core/distributed_runtime/rpc/BUILD
@@ -41,8 +41,8 @@ cc_library(
     srcs = ["grpc_util.cc"],
     hdrs = ["grpc_util.h"],
     deps = [
-        "@grpc//:grpc_unsecure",
-        "@grpc//:grpc++_unsecure",
+        "@grpc",
+        "@grpc//:grpc++",
         "//tensorflow/core:lib",
         # Required to be able to overload TensorResponse parsing.
         "//tensorflow/core/distributed_runtime:tensor_coding",
@@ -56,7 +56,7 @@ cc_library(
     deps = [
         ":grpc_util",
         "//tensorflow/core:lib",
-        "@grpc//:grpc++_unsecure",
+        "@grpc//:grpc++",
     ],
 )
 
@@ -70,7 +70,7 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core/distributed_runtime:call_options",
         "//tensorflow/core/distributed_runtime:tensor_coding",
-        "@grpc//:grpc++_unsecure",
+        "@grpc//:grpc++",
     ],
 )
 
@@ -90,7 +90,7 @@ cc_library(
         "//tensorflow/core/distributed_runtime:tensor_coding",
         "//tensorflow/core/distributed_runtime:worker_cache_logger",
         "//tensorflow/core/distributed_runtime:worker_interface",
-        "@grpc//:grpc++_unsecure",
+        "@grpc//:grpc++",
     ],
 )
 
@@ -103,7 +103,7 @@ cc_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "@grpc//:grpc++_unsecure",
+        "@grpc//:grpc++",
     ],
 )
 
@@ -118,7 +118,7 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:worker_proto_cc",
-        "@grpc//:grpc++_unsecure",
+        "@grpc//:grpc++",
     ],
 )
 
@@ -129,7 +129,7 @@ cc_library(
     deps = [
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "@grpc//:grpc++_unsecure",
+        "@grpc//:grpc++",
     ],
 )
 
@@ -180,7 +180,7 @@ tf_cuda_library(
         "//tensorflow/core/distributed_runtime:worker_cache",
         "//tensorflow/core/distributed_runtime:worker_env",
         "//tensorflow/core/distributed_runtime:worker_session",
-        "@grpc//:grpc++_unsecure",
+        "@grpc//:grpc++",
     ],
 )
 
@@ -192,7 +192,7 @@ cc_library(
         ":grpc_util",
         "//tensorflow/core:worker_proto_cc",
         "//tensorflow/core/distributed_runtime:tensor_coding",
-        "@grpc//:grpc++_unsecure",
+        "@grpc//:grpc++",
     ],
 )
 
@@ -225,7 +225,7 @@ cc_library(
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:master_proto_cc",
         "//tensorflow/core/distributed_runtime:master",
-        "@grpc//:grpc++_unsecure",
+        "@grpc//:grpc++",
     ],
     alwayslink = 1,
 )
@@ -236,7 +236,7 @@ cc_library(
     hdrs = ["grpc_master_service_impl.h"],
     deps = [
         "//tensorflow/core:master_proto_cc",
-        "@grpc//:grpc++_unsecure",
+        "@grpc//:grpc++",
     ],
 )
 
@@ -285,8 +285,8 @@ cc_library(
         "//tensorflow/core/distributed_runtime:server_lib",
         "//tensorflow/core/distributed_runtime:session_mgr",
         "//tensorflow/core/distributed_runtime:worker_env",
-        "@grpc//:grpc++_unsecure",
-        "@grpc//:grpc_unsecure",
+        "@grpc",
+        "@grpc//:grpc++",
     ],
     alwayslink = 1,
 )
@@ -313,7 +313,7 @@ tf_cc_binary(
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/distributed_runtime:server_lib",
         "//tensorflow/core/kernels:data_flow",
-        "@grpc//:grpc++_unsecure",
+        "@grpc//:grpc++",
     ],
 )
 
@@ -338,7 +338,7 @@ tf_cc_binary(
         "//tensorflow/core/kernels:matmul_op",
         "//tensorflow/core/kernels:reduction_ops",
         "//tensorflow/core/kernels:variable_ops",
-        "@grpc//:grpc++_unsecure",
+        "@grpc//:grpc++",
     ],
 )
 
@@ -432,7 +432,7 @@ tf_cc_test(
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
         "//tensorflow/core:worker_proto_cc",
-        "@grpc//:grpc++_unsecure",
+        "@grpc//:grpc++",
     ],
 )
 
@@ -445,8 +445,8 @@ tf_cc_test(
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:worker_proto_cc",
-        "@grpc//:grpc++_unsecure",
-        "@grpc//:grpc_unsecure",
+        "@grpc",
+        "@grpc//:grpc++",
     ],
 )
 
diff --git a/tensorflow/core/distributed_runtime/rpc/eager/BUILD b/tensorflow/core/distributed_runtime/rpc/eager/BUILD
index 1a3bd9d6bf..a5472159cc 100644
--- a/tensorflow/core/distributed_runtime/rpc/eager/BUILD
+++ b/tensorflow/core/distributed_runtime/rpc/eager/BUILD
@@ -12,7 +12,7 @@ cc_library(
     hdrs = ["grpc_eager_service.h"],
     deps = [
         "//tensorflow/core:eager_service_proto_cc",
-        "@grpc//:grpc++_unsecure",
+        "@grpc//:grpc++",
     ],
 )
 
@@ -29,7 +29,7 @@ cc_library(
         "//tensorflow/core/distributed_runtime/rpc:grpc_state",
         "//tensorflow/core/distributed_runtime/rpc:grpc_util",
         "//tensorflow/core/distributed_runtime/rpc/eager:grpc_eager_service",
-        "@grpc//:grpc++_unsecure",
+        "@grpc//:grpc++",
     ],
 )
 
@@ -48,7 +48,7 @@ cc_library(
         "//tensorflow/core/distributed_runtime/rpc:grpc_util",
         "//tensorflow/core/distributed_runtime/rpc:grpc_worker_cache",
         "//tensorflow/core/distributed_runtime/rpc:grpc_worker_service",
-        "@grpc//:grpc++_unsecure",
+        "@grpc//:grpc++",
     ],
 )
 
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 7df3d6594b..b13929e636 100644
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -778,11 +778,9 @@ def tf_workspace(path_prefix="", tf_repo_name=""):
       actual = "@grpc//:grpc_python_plugin",
   )
 
-  # gRPC has three empty C++ functions which it wants the user to define
-  # at build time. https://github.com/grpc/grpc/issues/13590
   native.bind(
       name = "grpc_lib",
-      actual = "@grpc//:grpc++_unsecure",
+      actual = "@grpc//:grpc++",
   )
 
   # Needed by gRPC
-- 
GitLab


From dc7821ccf42ada3f85ca1c6e8228f0a42e61b93c Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 12 Jun 2018 11:26:38 -0700
Subject: [PATCH 608/610] Apply import_scope to asset and variable tensors
 during tf.saved_model.loader.load

This change explicitly declares import_scope as a kwarg for tf.saved_model.loader.load. Previously, tf.saved_model.loader.load implicitly accepted import_scope and passed it through to import_meta_graph through **saver_kwargs.

PiperOrigin-RevId: 200249417
---
 tensorflow/python/saved_model/loader_impl.py  | 22 +++++---
 .../python/saved_model/saved_model_test.py    | 53 +++++++++++++++++++
 tensorflow/python/training/saver.py           |  2 +-
 tensorflow/python/training/saver_test.py      | 40 ++++++++++++++
 .../tensorflow.saved_model.loader.pbtxt       |  2 +-
 5 files changed, 111 insertions(+), 8 deletions(-)

diff --git a/tensorflow/python/saved_model/loader_impl.py b/tensorflow/python/saved_model/loader_impl.py
index bebf1d5e0d..d1bd8d47ae 100644
--- a/tensorflow/python/saved_model/loader_impl.py
+++ b/tensorflow/python/saved_model/loader_impl.py
@@ -79,12 +79,14 @@ def _parse_saved_model(export_dir):
                    constants.SAVED_MODEL_FILENAME_PB))
 
 
-def _get_asset_tensors(export_dir, meta_graph_def_to_load):
+def _get_asset_tensors(export_dir, meta_graph_def_to_load, import_scope=None):
   """Gets the asset tensors, if defined in the meta graph def to load.
 
   Args:
     export_dir: Directory where the SavedModel is located.
     meta_graph_def_to_load: The meta graph def from the SavedModel to be loaded.
+    import_scope: Optional `string` -- if specified, prepend this followed by
+        '/' to all returned asset tensor names.
 
   Returns:
     A dictionary of asset tensors, keyed by the name of the asset tensor. The
@@ -104,7 +106,10 @@ def _get_asset_tensors(export_dir, meta_graph_def_to_load):
     for asset_any_proto in assets_any_proto:
       asset_proto = meta_graph_pb2.AssetFileDef()
       asset_any_proto.Unpack(asset_proto)
-      asset_tensor_dict[asset_proto.tensor_info.name] = os.path.join(
+      tensor_name = asset_proto.tensor_info.name
+      if import_scope:
+        tensor_name = "%s/%s" % (import_scope, tensor_name)
+      asset_tensor_dict[tensor_name] = os.path.join(
           compat.as_bytes(assets_directory),
           compat.as_bytes(asset_proto.filename))
   return asset_tensor_dict
@@ -179,7 +184,7 @@ def maybe_saved_model_directory(export_dir):
 
 
 @tf_export("saved_model.loader.load")
-def load(sess, tags, export_dir, **saver_kwargs):
+def load(sess, tags, export_dir, import_scope=None, **saver_kwargs):
   """Loads the model from a SavedModel as specified by tags.
 
   Args:
@@ -189,6 +194,10 @@ def load(sess, tags, export_dir, **saver_kwargs):
         SavedModel `save()` API.
     export_dir: Directory in which the SavedModel protocol buffer and variables
         to be loaded are located.
+    import_scope: Optional `string` -- if specified, prepend this string
+        followed by '/' to all loaded tensor names. This scope is applied to
+        tensor instances loaded into the passed session, but it is *not* written
+        through to the static `MetaGraphDef` protocol buffer that is returned.
     **saver_kwargs: Optional keyword arguments passed through to Saver.
 
   Returns:
@@ -216,7 +225,8 @@ def load(sess, tags, export_dir, **saver_kwargs):
       )
 
     # Build a saver by importing the meta graph def to load.
-    saver = tf_saver.import_meta_graph(meta_graph_def_to_load, **saver_kwargs)
+    saver = tf_saver.import_meta_graph(
+        meta_graph_def_to_load, import_scope=import_scope, **saver_kwargs)
 
     if saver:
       # Build the checkpoint path where the variables are located.
@@ -232,8 +242,8 @@ def load(sess, tags, export_dir, **saver_kwargs):
                       "checkpoints were restored.")
 
     # Get asset tensors, if any.
-    asset_tensors_dictionary = _get_asset_tensors(export_dir,
-                                                  meta_graph_def_to_load)
+    asset_tensors_dictionary = _get_asset_tensors(
+        export_dir, meta_graph_def_to_load, import_scope=import_scope)
 
     main_op_tensor = (
         _get_main_op_tensor(meta_graph_def_to_load) or
diff --git a/tensorflow/python/saved_model/saved_model_test.py b/tensorflow/python/saved_model/saved_model_test.py
index effb38283b..fb4732aca2 100644
--- a/tensorflow/python/saved_model/saved_model_test.py
+++ b/tensorflow/python/saved_model/saved_model_test.py
@@ -1197,6 +1197,59 @@ class SavedModelTest(test.TestCase):
     _validate_custom_saver("tag_1", "save_1/restore_all")
     _validate_custom_saver("tag_2", "save_2/restore_all")
 
+  def testImportScope(self):
+    export_dir = self._get_export_dir("test_scoped_assets")
+    builder = saved_model_builder.SavedModelBuilder(export_dir)
+
+    # Build a SavedModel with a variable, an asset, and a constant tensor.
+    with self.test_session(graph=ops.Graph()) as sess:
+      self._init_and_validate_variable(sess, "v", 42)
+      asset_collection = self._build_asset_collection("foo.txt", "content_foo",
+                                                      "asset_file_tensor")
+      constant_op.constant("constant value", name="constant_tensor_name")
+      builder.add_meta_graph_and_variables(
+          sess, ["tag_name"], assets_collection=asset_collection)
+
+      # Save the asset file path for later comparison.
+      asset_file_path = asset_collection[0].eval()
+
+    # Save the SavedModel to disk.
+    builder.save()
+
+    with self.test_session(graph=ops.Graph()) as sess:
+      # Restore the SavedModel under an import_scope in a new graph/session.
+      graph_proto = loader.load(
+          sess, ["tag_name"], export_dir, import_scope="scope_name")
+
+      # The loaded variable tensor should be scoped, but its contents should be
+      # unchanged.
+      self.assertEqual(
+          "scope_name/v:0",
+          ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)[0].name)
+      self.assertEqual(
+          42,
+          ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)[0].eval())
+
+      # The loaded asset tensor should be scoped, but the asset file path and
+      # contents should be unchanged.
+      asset_collection = ops.get_collection(ops.GraphKeys.ASSET_FILEPATHS)
+      self.assertEqual(1, len(asset_collection))
+      self.assertEqual(asset_file_path, asset_collection[0].eval())
+      self.assertEqual("scope_name/asset_file_tensor:0",
+                       asset_collection[0].name)
+      # The static asset data inside graph_proto.collection_def should not be
+      # scoped.
+      self._validate_asset_collection(export_dir, graph_proto.collection_def,
+                                      "foo.txt", "content_foo",
+                                      "asset_file_tensor:0")
+
+      # The constant tensor should be scoped, but its contents should be
+      # unchanged.
+      self.assertEqual(
+          compat.as_bytes("constant value"),
+          ops.get_default_graph().get_tensor_by_name(
+              "scope_name/constant_tensor_name:0").eval())
+
   def testClearDevices(self):
     export_dir = self._get_export_dir("test_clear_devices")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
diff --git a/tensorflow/python/training/saver.py b/tensorflow/python/training/saver.py
index 4d464135fd..bd2d78b025 100644
--- a/tensorflow/python/training/saver.py
+++ b/tensorflow/python/training/saver.py
@@ -1970,7 +1970,7 @@ def import_meta_graph(meta_graph_or_file, clear_devices=False,
 
     return Saver(saver_def=meta_graph_def.saver_def, name=scope)
   else:
-    if variables._all_saveable_objects():  # pylint: disable=protected-access
+    if variables._all_saveable_objects(scope=import_scope):  # pylint: disable=protected-access
       # Return the default saver instance for all graph variables.
       return Saver()
     else:
diff --git a/tensorflow/python/training/saver_test.py b/tensorflow/python/training/saver_test.py
index f1991093e0..b228cb85d7 100644
--- a/tensorflow/python/training/saver_test.py
+++ b/tensorflow/python/training/saver_test.py
@@ -2339,6 +2339,46 @@ class MetaGraphTest(test.TestCase):
               10, size=[1, 10])
       })
 
+  def testImportIntoNamescopeWithoutVariables(self):
+    # Save a simple graph that contains no variables into a checkpoint.
+    test_dir = self._get_test_dir("no_vars_graph")
+    filename = os.path.join(test_dir, "ckpt")
+    graph_1 = ops_lib.Graph()
+    with session.Session(graph=graph_1) as sess:
+      constant_op.constant([1, 2, 3], name="x")
+      constant_op.constant([1, 2, 3], name="y")
+      saver = saver_module.Saver(allow_empty=True)
+      saver.save(sess, filename)
+
+    # Create a fresh graph.
+    graph_2 = ops_lib.Graph()
+    with session.Session(graph=graph_2) as sess:
+      # Restore the above checkpoint under scope "subgraph_1".
+      new_saver_1 = saver_module.import_meta_graph(
+          filename + ".meta", graph=graph_2, import_scope="subgraph_1")
+      # There are no variables to restore, so import_meta_graph should not
+      # return a Saver.
+      self.assertIsNone(new_saver_1)
+
+      # Create a variable in graph_2 under scope "my_scope".
+      variables.Variable(array_ops.zeros([10]), name="my_scope/my_var")
+      sess.run(variables.global_variables_initializer())
+      # Restore the checkpoint into a different scope "subgraph_2".
+      new_saver_2 = saver_module.import_meta_graph(
+          filename + ".meta", graph=graph_2, import_scope="subgraph_2")
+      # Because the variable does not live in scope "subgraph_2",
+      # import_meta_graph should not attempt to restore the variable. So,
+      # import_meta_graph still won't return a Saver instance.
+      self.assertIsNone(new_saver_2)
+
+      # However, if we restore the checkpoint under scope "my_scope",
+      # import_meta_graph will detect the variable and return a Saver for
+      # restoring it. This should happen even when the variable does not
+      # originate from graph_1.
+      new_saver_3 = saver_module.import_meta_graph(
+          filename + ".meta", graph=graph_2, import_scope="my_scope")
+      self.assertIsInstance(new_saver_3, saver_module.Saver)
+
   def testImportIntoImplicitNamescope(self):
     # Test that we can import a meta graph into an implicit namescope.
     test_dir = self._get_test_dir("import_into_namescope")
diff --git a/tensorflow/tools/api/golden/tensorflow.saved_model.loader.pbtxt b/tensorflow/tools/api/golden/tensorflow.saved_model.loader.pbtxt
index 896e2160c6..511e6b4712 100644
--- a/tensorflow/tools/api/golden/tensorflow.saved_model.loader.pbtxt
+++ b/tensorflow/tools/api/golden/tensorflow.saved_model.loader.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.saved_model.loader"
 tf_module {
   member_method {
     name: "load"
-    argspec: "args=[\'sess\', \'tags\', \'export_dir\'], varargs=None, keywords=saver_kwargs, defaults=None"
+    argspec: "args=[\'sess\', \'tags\', \'export_dir\', \'import_scope\'], varargs=None, keywords=saver_kwargs, defaults=[\'None\'], "
   }
   member_method {
     name: "maybe_saved_model_directory"
-- 
GitLab


From c5436b90adff058500e88b497fc4f7a0b0379d28 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Tue, 12 Jun 2018 11:34:51 -0700
Subject: [PATCH 609/610] Support Cloud TPU Pod in GKE environment.

PiperOrigin-RevId: 200251004
---
 .../python/training/tpu_cluster_resolver.py   | 17 +++---
 .../training/tpu_cluster_resolver_test.py     | 54 +++++++++++++++++--
 2 files changed, 62 insertions(+), 9 deletions(-)

diff --git a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py
index 3a1d90e77d..8f521ffee4 100644
--- a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py
+++ b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py
@@ -36,6 +36,7 @@ except ImportError:
 
 
 _GKE_ENV_VARIABLE = 'KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS'
+_ENDPOINTS_SEPARATOR = ','
 _DEFAULT_ENV_VARIABLE = 'TPU_NAME'
 _DISCOVERY_SERVICE_URL_ENV_VARIABLE = 'TPU_API_DISCOVERY_URL'
 
@@ -69,8 +70,8 @@ class TPUClusterResolver(ClusterResolver):
     return _GKE_ENV_VARIABLE in os.environ
 
   @staticmethod
-  def _gkeMaster():
-    return os.environ[_GKE_ENV_VARIABLE].split(',')[0]
+  def _gkeEndpoints():
+    return os.environ[_GKE_ENV_VARIABLE]
 
   @staticmethod
   def _envVarFallback():
@@ -143,7 +144,7 @@ class TPUClusterResolver(ClusterResolver):
     # When using GKE with Cloud TPUs, the env variable will be set.
     if tpu is None:
       if in_gke:
-        tpu = self._gkeMaster()
+        tpu = self._gkeEndpoints()
       else:
         tpu = self._envVarFallback()
 
@@ -214,7 +215,7 @@ class TPUClusterResolver(ClusterResolver):
       ValueError: If none of the TPUs specified exists.
     """
     if not self._shouldResolve():
-      return self._tpu
+      return self._tpu.split(compat.as_bytes(_ENDPOINTS_SEPARATOR))[0]
 
     job_tasks = self.cluster_spec().job_tasks(self._job_name)
     if not job_tasks:
@@ -280,8 +281,12 @@ class TPUClusterResolver(ClusterResolver):
         # Case 3.
         return None
       # Case 2.
-      cluster_spec = {self._job_name: [self._tpu[len(
-          compat.as_bytes('grpc://')):]]}
+      cluster_spec = {
+          self._job_name: [
+              x[len(compat.as_bytes('grpc://')):]
+              for x in self._tpu.split(compat.as_bytes(_ENDPOINTS_SEPARATOR))
+          ]
+      }
 
     if self._coordinator_address:
       # {1, 2}.a
diff --git a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py
index 86e9d9ddad..ad4f643263 100644
--- a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py
+++ b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py
@@ -402,13 +402,61 @@ class TPUClusterResolverTest(test.TestCase):
         compat.as_bytes('/bns/foo/bar'), tpu_cluster_resolver.master())
     self.assertEqual(None, tpu_cluster_resolver.cluster_spec())
 
-  def testGkeEnvironment(self):
+  def testGkeEnvironmentForDonut(self):
     os.environ['KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS'] = 'grpc://10.120.27.5:8470'
-    self.assertTrue('KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS' in os.environ)
+
+    self.assertIn('KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS', os.environ)
+    self.assertTrue(TPUClusterResolver._inGke())
+    self.assertEqual(
+        compat.as_bytes('grpc://10.120.27.5:8470'),
+        compat.as_bytes(TPUClusterResolver._gkeEndpoints()))
+
+    tpu_cluster_resolver = TPUClusterResolver()
+    self.assertEqual(
+        compat.as_bytes('grpc://10.120.27.5:8470'),
+        compat.as_bytes(tpu_cluster_resolver.master()))
+    actual_cluster_spec = tpu_cluster_resolver.cluster_spec()
+    expected_proto = """
+    job {
+      name: 'worker'
+      tasks { key: 0 value: '10.120.27.5:8470' }
+    }
+    """
+    self._verifyClusterSpecEquality(actual_cluster_spec, expected_proto)
+
+    del os.environ['KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS']
+
+  def testGkeEnvironmentForPod(self):
+    os.environ['KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS'] = ('grpc://10.120.27.5:8470,'
+                                                     'grpc://10.120.27.6:8470,'
+                                                     'grpc://10.120.27.7:8470,'
+                                                     'grpc://10.120.27.8:8470')
+
+    self.assertIn('KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS', os.environ)
     self.assertTrue(TPUClusterResolver._inGke())
+    self.assertEqual(
+        compat.as_bytes('grpc://10.120.27.5:8470,'
+                        'grpc://10.120.27.6:8470,'
+                        'grpc://10.120.27.7:8470,'
+                        'grpc://10.120.27.8:8470'),
+        compat.as_bytes(TPUClusterResolver._gkeEndpoints()))
+
+    tpu_cluster_resolver = TPUClusterResolver()
     self.assertEqual(
         compat.as_bytes('grpc://10.120.27.5:8470'),
-        compat.as_bytes(TPUClusterResolver._gkeMaster()))
+        compat.as_bytes(tpu_cluster_resolver.master()))
+    actual_cluster_spec = tpu_cluster_resolver.cluster_spec()
+    expected_proto = """
+    job {
+      name: 'worker'
+      tasks { key: 0 value: '10.120.27.5:8470' }
+      tasks { key: 1 value: '10.120.27.6:8470' }
+      tasks { key: 2 value: '10.120.27.7:8470' }
+      tasks { key: 3 value: '10.120.27.8:8470' }
+    }
+    """
+    self._verifyClusterSpecEquality(actual_cluster_spec, expected_proto)
+
     del os.environ['KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS']
 
   def testDiscoveryUrl(self):
-- 
GitLab


From 400a398a18789da01765950d21f208876b64d30a Mon Sep 17 00:00:00 2001
From: Yong Tang <yong.tang.github@outlook.com>
Date: Tue, 12 Jun 2018 14:40:54 -0700
Subject: [PATCH 610/610] Require same shape for `x` and `y` in shape function
 of `ApproximateEqual` (#19878)

* Require same shape for `x` and `y` in shape function of `ApproximateEqual`

The kernel implementation of `ApproximateEqual` requires the inputs `x` and
`y` to have the same shape, but the shape function of `ApproximateEqual`
did not validate this. This fix adds that validation to the shape function
so that `x` and `y` are checked to have the same shape when both are known.

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>

* Add test case for shape function of ApproximateEqual

Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
---
 tensorflow/core/ops/math_ops.cc        | 8 +++++++-
 tensorflow/python/ops/math_ops_test.py | 9 +++++++++
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/tensorflow/core/ops/math_ops.cc b/tensorflow/core/ops/math_ops.cc
index 929213656c..6d1ef56608 100644
--- a/tensorflow/core/ops/math_ops.cc
+++ b/tensorflow/core/ops/math_ops.cc
@@ -592,7 +592,13 @@ REGISTER_OP("ApproximateEqual")
     .SetIsCommutative()
     .Attr("T: numbertype")
     .Attr("tolerance: float = 0.00001")
-    .SetShapeFn(shape_inference::UnchangedShape);
+    .SetShapeFn([](InferenceContext* c) {
+      // The inputs 'x' and 'y' must have the same shape.
+      ShapeHandle data_x = c->input(0);
+      ShapeHandle data_y = c->input(1);
+      TF_RETURN_IF_ERROR(c->Merge(data_x, data_y, &data_x));
+      return shape_inference::UnchangedShape(c);
+    });
 
 // --------------------------------------------------------------------------
 
diff --git a/tensorflow/python/ops/math_ops_test.py b/tensorflow/python/ops/math_ops_test.py
index 980c92b0d5..c807c8bc2e 100644
--- a/tensorflow/python/ops/math_ops_test.py
+++ b/tensorflow/python/ops/math_ops_test.py
@@ -235,6 +235,15 @@ class ApproximateEqualTest(test_util.TensorFlowTestCase):
         z_tf = self.evaluate(math_ops.approximate_equal(x, y, tolerance=0.0001))
         self.assertAllEqual(z, z_tf)
 
+  def testApproximateEqualShape(self):
+    for dtype in [np.float32, np.double]:
+      x = np.array([1, 2], dtype=dtype)
+      y = np.array([[1, 2]], dtype=dtype)
+      # Rank-1 'x' vs rank-2 'y' must be rejected for each dtype in the loop.
+      with self.assertRaisesRegexp(
+          ValueError, "Shapes must be equal rank, but are 1 and 2"):
+        math_ops.approximate_equal(x, y)
+
 
 class ScalarMulTest(test_util.TensorFlowTestCase):
 
-- 
GitLab